diff --git a/aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py b/aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py similarity index 92% rename from aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py rename to aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py index 1ffabd7fcc..86f7748acf 100755 --- a/aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py +++ b/aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py @@ -14,8 +14,8 @@ from ..utils.core import AITER_TRITON_CONFIGS_PATH from .quant import _mxfp4_quant_op -_batched_gemm_a16wfp4_repr = make_kernel_repr( - "_batched_gemm_a16wfp4_kernel", +_batched_gemm_afp4_wfp4_pre_quant_repr = make_kernel_repr( + "_batched_gemm_afp4_wfp4_pre_quant_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", @@ -25,15 +25,13 @@ "SPLITK_BLOCK_SIZE", "EVEN_K", "GRID_MN", - "PRE_QUANT", - "HAVE_Y_SCALE", "cache_modifier", ], ) -_batched_gemm_a16wfp4_reduce_repr = make_kernel_repr( - "_batched_gemm_a16wfp4_reduce_kernel", +_batched_gemm_afp4_wfp4_pre_quant_reduce_repr = make_kernel_repr( + "_batched_gemm_afp4_wfp4_pre_quant_reduce_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", @@ -52,13 +50,12 @@ * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), } ) -@triton.jit(repr=_batched_gemm_a16wfp4_repr) -def _batched_gemm_a16wfp4_kernel( +@triton.jit(repr=_batched_gemm_afp4_wfp4_pre_quant_repr) +def _batched_gemm_afp4_wfp4_pre_quant_kernel( a_ptr, b_ptr, c_ptr, b_scales_ptr, - c_scale_ptr, M, N, K, @@ -84,8 +81,6 @@ def _batched_gemm_a16wfp4_kernel( SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, GRID_MN: tl.constexpr, - PRE_QUANT: tl.constexpr, - HAVE_Y_SCALE: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -126,12 +121,6 @@ def _batched_gemm_a16wfp4_kernel( stride_cb = tl.cast(stride_cb, tl.int64) pid_batch = tl.cast(pid_batch, tl.int64) - if HAVE_Y_SCALE: - c_scale = tl.load(c_scale_ptr) - else: - c_scale = 1 - c_scale_rcprl = (1 / c_scale).to(tl.float32) - if NUM_KSPLIT == 1: remap_xcd(pid, GRID_MN) @@ -200,8 +189,7 @@ def _batched_gemm_a16wfp4_kernel( b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 ) - if PRE_QUANT: # TODO add PRE_QUANT = False - a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) + a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") @@ -210,9 +198,6 @@ def _batched_gemm_a16wfp4_kernel( b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk - if HAVE_Y_SCALE: - accumulator = accumulator * c_scale_rcprl - c = accumulator.to(c_ptr.type.element_ty) # Write back the block of the output matrix C with masks. @@ -229,8 +214,8 @@ def _batched_gemm_a16wfp4_kernel( tl.store(c_ptrs, c, mask=c_mask) -@triton.jit(repr=_batched_gemm_a16wfp4_reduce_repr) -def _batched_gemm_a16wfp4_reduce_kernel( +@triton.jit(repr=_batched_gemm_afp4_wfp4_pre_quant_reduce_repr) +def _batched_gemm_afp4_wfp4_pre_quant_reduce_kernel( c_in_ptr, c_out_ptr, M, diff --git a/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py deleted file mode 100644 index f1a49d150f..0000000000 --- a/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py +++ /dev/null @@ -1,867 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -from typing import Optional -import functools -import json -import os -import torch -import triton -import triton.language as tl -from ..utils._triton.pid_preprocessing import pid_grid, remap_xcd -from ..utils._triton import arch_info -from ..utils.core import AITER_TRITON_CONFIGS_PATH -from ..utils._triton.kernel_repr import make_kernel_repr - - -_fused_gemm_afp4wfp4_a16w16_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_a16w16_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - -@triton.heuristics( - { - "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) - and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) - and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), - "GRID_MN_FP4": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) - * triton.cdiv(args["N_fp4"], args["BLOCK_SIZE_N"]), - "GRID_MN_BF16": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) - * triton.cdiv(args["N_bf16"], args["BLOCK_SIZE_N"]), - } -) -@triton.jit(repr=_fused_gemm_afp4wfp4_a16w16_repr) -def _fused_gemm_afp4wfp4_a16w16_kernel( - # Pointers to matrices - a_fp4_ptr, - b_fp4_ptr, - bias_fp4_ptr, - a_fp4_scale_ptr, - b_fp4_scale_ptr, - c_fp4_ptr, - a_bf16_ptr, - b_bf16_ptr, - bias_bf16_ptr, - c_bf16_ptr, - # Matrix dimensions - M, - N_fp4, - N_bf16, - K, - stride_a_fp4_m, - stride_a_fp4_k, - stride_b_fp4_k, - stride_b_fp4_n, - stride_a_fp4_scale_m, - stride_a_fp4_scale_k, - stride_b_fp4_scale_n, - stride_b_fp4_scale_k, - stride_c_fp4_k, - stride_c_fp4_m, - stride_c_fp4_n, - stride_a_bf16_m, - stride_a_bf16_k, - stride_b_bf16_k, - stride_b_bf16_n, - stride_c_bf16_k, - stride_c_bf16_m, - stride_c_bf16_n, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - NUM_KSPLIT: tl.constexpr, - SPLITK_BLOCK_SIZE: tl.constexpr, - ADD_BIAS_FP4: tl.constexpr, - ADD_BIAS_BF16: tl.constexpr, - EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, - GRID_MN_FP4: tl.constexpr, - GRID_MN_BF16: tl.constexpr, - SKIP_REDUCE: tl.constexpr, - cache_modifier: tl.constexpr, -): - - tl.assume(stride_a_fp4_m > 0) - tl.assume(stride_a_fp4_k > 0) - tl.assume(stride_b_fp4_k > 0) - tl.assume(stride_b_fp4_n > 0) - tl.assume(stride_c_fp4_k > 0) - tl.assume(stride_c_fp4_m > 0) - tl.assume(stride_c_fp4_n > 0) - tl.assume(stride_a_fp4_scale_m > 0) - tl.assume(stride_a_fp4_scale_k > 0) - tl.assume(stride_b_fp4_scale_k > 0) - tl.assume(stride_b_fp4_scale_n > 0) - - tl.assume(stride_a_bf16_m > 0) - tl.assume(stride_a_bf16_k > 0) - tl.assume(stride_b_bf16_k > 0) - tl.assume(stride_b_bf16_n > 0) - tl.assume(stride_c_bf16_m > 0) - tl.assume(stride_c_bf16_n > 0) - - SCALE_GROUP_SIZE: tl.constexpr = 32 - GRID_MN: tl.constexpr = GRID_MN_FP4 + GRID_MN_BF16 - - pid_unified = tl.program_id(axis=0) - pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) - - pid_k = pid_unified % NUM_KSPLIT - pid = pid_unified // NUM_KSPLIT - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N) - num_pid_n_bf16 = tl.cdiv(N_bf16, BLOCK_SIZE_N) - num_pid_n = num_pid_n_fp4 + num_pid_n_bf16 - - if NUM_KSPLIT == 1: - pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) - else: - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - - tl.assume(pid_m >= 0) - tl.assume(pid_n >= 0) - tl.assume(pid_k >= 0) - - if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: - - num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) - - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - - if pid_n < num_pid_n_fp4: - offs_k_fp4 = tl.arange(0, BLOCK_SIZE_K // 2) - offs_k_fp4_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k_fp4 - offs_b_fp4_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_fp4 - a_fp4_ptrs = a_fp4_ptr + ( - offs_am[:, None] * stride_a_fp4_m - + offs_k_fp4_split[None, :] * stride_a_fp4_k - ) - b_fp4_ptrs = b_fp4_ptr + ( - offs_k_fp4_split[:, None] * stride_b_fp4_k - + offs_b_fp4_n[None, :] * stride_b_fp4_n - ) - - offs_k_fp4_scale = ( - pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) - ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - a_fp4_scale_ptrs = ( - a_fp4_scale_ptr - + offs_am[:, None] * stride_a_fp4_scale_m - + offs_k_fp4_scale[None, :] * stride_a_fp4_scale_k - ) - # B scales are N x K even though B operand is K x N. - b_fp4_scale_ptrs = ( - b_fp4_scale_ptr - + offs_b_fp4_n[:, None] * stride_b_fp4_scale_n - + offs_k_fp4_scale[None, :] * stride_b_fp4_scale_k - ) - - if ADD_BIAS_FP4: - if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): - accumulator_fp4 = tl.load(bias_fp4_ptr + offs_b_fp4_n).to( - dtype=tl.float32 - ) - accumulator_fp4 = tl.broadcast_to( - accumulator_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) - ) - else: - accumulator_fp4 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - else: - accumulator_fp4 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - a_scale = tl.load(a_fp4_scale_ptrs) - b_scale = tl.load(b_fp4_scale_ptrs, cache_modifier=cache_modifier) - - if EVEN_K: - a = tl.load(a_fp4_ptrs) - b = tl.load(b_fp4_ptrs, cache_modifier=cache_modifier) - else: - a = tl.load( - a_fp4_ptrs, - mask=offs_k_fp4[None, :] < K - k * (BLOCK_SIZE_K // 2), - other=0.0, - ) - b = tl.load( - b_fp4_ptrs, - mask=offs_k_fp4[:, None] < K - k * (BLOCK_SIZE_K // 2), - other=0.0, - cache_modifier=cache_modifier, - ) - - accumulator_fp4 += tl.dot_scaled(a, a_scale, "e2m1", b, b_scale, "e2m1") - - a_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_a_fp4_k - b_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_b_fp4_k - a_fp4_scale_ptrs += ( - BLOCK_SIZE_K // SCALE_GROUP_SIZE - ) * stride_a_fp4_scale_k - b_fp4_scale_ptrs += ( - BLOCK_SIZE_K // SCALE_GROUP_SIZE - ) * stride_b_fp4_scale_k - - c_fp4 = accumulator_fp4.to(c_fp4_ptr.type.element_ty) - - offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( - tl.int64 - ) - offs_c_fp4_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( - 0, BLOCK_SIZE_N - ).to(tl.int64) - c_fp4_ptrs = ( - c_fp4_ptr - + stride_c_fp4_m * offs_cm[:, None] - + stride_c_fp4_n * offs_c_fp4_n[None, :] - + pid_k * stride_c_fp4_k - ) - c_fp4_mask = (offs_cm[:, None] < M) & (offs_c_fp4_n[None, :] < N_fp4) - tl.store(c_fp4_ptrs, c_fp4, mask=c_fp4_mask) - else: - pid_n -= num_pid_n_fp4 - offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K) - offs_k_bf16_split = pid_k * SPLITK_BLOCK_SIZE + offs_k_bf16 - K = 2 * K - - offs_b_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16 - a_ptrs = a_bf16_ptr + ( - offs_am[:, None] * stride_a_bf16_m - + offs_k_bf16_split[None, :] * stride_a_bf16_k - ) - b_ptrs = b_bf16_ptr + ( - offs_k_bf16_split[:, None] * stride_b_bf16_k - + offs_b_bf16_n[None, :] * stride_b_bf16_n - ) - - if ADD_BIAS_BF16: - if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): - accumulator_bf16 = tl.load(bias_bf16_ptr + offs_b_bf16_n).to( - dtype=tl.float32 - ) - accumulator_bf16 = tl.broadcast_to( - accumulator_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) - ) - else: - accumulator_bf16 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - else: - accumulator_bf16 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs, cache_modifier=cache_modifier) - else: - a = tl.load( - a_ptrs, - mask=offs_k_bf16[None, :] < K - k * BLOCK_SIZE_K, - other=0.0, - ) - b = tl.load( - b_ptrs, - mask=offs_k_bf16[:, None] < K - k * BLOCK_SIZE_K, - other=0.0, - cache_modifier=cache_modifier, - ) - - accumulator_bf16 += tl.dot(a, b, input_precision="ieee") - - a_ptrs += BLOCK_SIZE_K * stride_a_bf16_k - b_ptrs += BLOCK_SIZE_K * stride_b_bf16_k - - c_bf16 = accumulator_bf16.to(c_bf16_ptr.type.element_ty) - - # Write back the block of the output matrix C with masks. - offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( - tl.int64 - ) - offs_c_bf16_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( - 0, BLOCK_SIZE_N - ).to(tl.int64) - c_bf16_ptrs = ( - c_bf16_ptr - + stride_c_bf16_m * offs_cm[:, None] - + stride_c_bf16_n * offs_c_bf16_n[None, :] - + pid_k * stride_c_bf16_k - ) - c_bf16_mask = (offs_cm[:, None] < M) & (offs_c_bf16_n[None, :] < N_bf16) - tl.store(c_bf16_ptrs, c_bf16, mask=c_bf16_mask) - - -_fused_gemm_afp4wfp4_preshuffle_a16w16_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_preshuffle_a16w16_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - -@triton.heuristics( - { - "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) - and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) - and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), - "GRID_MN_FP4": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) - * triton.cdiv(args["N_fp4"], args["BLOCK_SIZE_N"]), - "GRID_MN_BF16": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) - * triton.cdiv(args["N_bf16"], args["BLOCK_SIZE_N"]), - } -) -@triton.jit(repr=_fused_gemm_afp4wfp4_preshuffle_a16w16_repr) -def _fused_gemm_afp4wfp4_preshuffle_a16w16_kernel( - # Pointers to matrices - a_fp4_ptr, - b_fp4_ptr, - bias_fp4_ptr, - a_fp4_scale_ptr, - b_fp4_scale_ptr, - c_fp4_ptr, - a_bf16_ptr, - b_bf16_ptr, - bias_bf16_ptr, - c_bf16_ptr, - # Matrix dimensions - M, - N_fp4, - N_bf16, - K, - stride_a_fp4_m, - stride_a_fp4_k, - stride_b_fp4_n, - stride_b_fp4_k, - stride_a_fp4_scale_m, - stride_a_fp4_scale_k, - stride_b_fp4_scale_n, - stride_b_fp4_scale_k, - stride_c_fp4_k, - stride_c_fp4_m, - stride_c_fp4_n, - stride_a_bf16_m, - stride_a_bf16_k, - stride_b_bf16_k, - stride_b_bf16_n, - stride_c_bf16_k, - stride_c_bf16_m, - stride_c_bf16_n, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - NUM_KSPLIT: tl.constexpr, - SPLITK_BLOCK_SIZE: tl.constexpr, - ADD_BIAS_FP4: tl.constexpr, - ADD_BIAS_BF16: tl.constexpr, - EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, - GRID_MN_FP4: tl.constexpr, - GRID_MN_BF16: tl.constexpr, - SKIP_REDUCE: tl.constexpr, - cache_modifier: tl.constexpr, -): - - tl.assume(stride_a_fp4_m > 0) - tl.assume(stride_a_fp4_k > 0) - tl.assume(stride_b_fp4_k > 0) - tl.assume(stride_b_fp4_n > 0) - tl.assume(stride_c_fp4_k > 0) - tl.assume(stride_c_fp4_m > 0) - tl.assume(stride_c_fp4_n > 0) - tl.assume(stride_a_fp4_scale_m > 0) - tl.assume(stride_a_fp4_scale_k > 0) - tl.assume(stride_b_fp4_scale_k > 0) - tl.assume(stride_b_fp4_scale_n > 0) - - tl.assume(stride_a_bf16_m > 0) - tl.assume(stride_a_bf16_k > 0) - tl.assume(stride_b_bf16_k > 0) - tl.assume(stride_b_bf16_n > 0) - tl.assume(stride_c_bf16_m > 0) - tl.assume(stride_c_bf16_n > 0) - - SCALE_GROUP_SIZE: tl.constexpr = 32 - GRID_MN: tl.constexpr = GRID_MN_FP4 + GRID_MN_BF16 - - pid_unified = tl.program_id(axis=0) - pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) - - pid_k = pid_unified % NUM_KSPLIT - pid = pid_unified // NUM_KSPLIT - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N) - num_pid_n_bf16 = tl.cdiv(N_bf16, BLOCK_SIZE_N) - num_pid_n = num_pid_n_fp4 + num_pid_n_bf16 - - if NUM_KSPLIT == 1: - pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) - else: - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - - tl.assume(pid_m >= 0) - tl.assume(pid_n >= 0) - tl.assume(pid_k >= 0) - - if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - - if pid_n < num_pid_n_fp4: - num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) - - offs_k_fp4 = tl.arange(0, BLOCK_SIZE_K // 2) - offs_k_fp4_shuffle_arr = tl.arange(0, (BLOCK_SIZE_K // 2) * 16) - offs_k_fp4_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k_fp4 - offs_k_fp4_shuffle = ( - pid_k * (SPLITK_BLOCK_SIZE // 2) * 16 + offs_k_fp4_shuffle_arr - ) - - offs_b_fp4_n = ( - pid_n * (BLOCK_SIZE_N // 16) + tl.arange(0, BLOCK_SIZE_N // 16) - ) % N_fp4 - a_fp4_ptrs = a_fp4_ptr + ( - offs_am[:, None] * stride_a_fp4_m - + offs_k_fp4_split[None, :] * stride_a_fp4_k - ) - b_fp4_ptrs = b_fp4_ptr + ( - offs_b_fp4_n[:, None] * stride_b_fp4_n - + offs_k_fp4_shuffle[None, :] * stride_b_fp4_k - ) - - offs_b_fp4_scale_n = ( - pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) - ) % N_fp4 - offs_k_fp4_scale = ( - pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) * 32 - ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE * 32) - b_fp4_scale_ptrs = ( - b_fp4_scale_ptr - + offs_b_fp4_scale_n[:, None] * stride_b_fp4_scale_n - + offs_k_fp4_scale[None, :] * stride_b_fp4_scale_k - ) - - if BLOCK_SIZE_M < 32: - offs_ks_non_shufl = ( - pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) - ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - a_fp4_scale_ptrs = ( - a_fp4_scale_ptr - + offs_am[:, None] * stride_a_fp4_scale_m - + offs_ks_non_shufl[None, :] * stride_a_fp4_scale_k - ) - else: - offs_a_fp4_scale_m = ( - pid_m * (BLOCK_SIZE_M // 32) + tl.arange(0, (BLOCK_SIZE_M // 32)) - ) % M - a_fp4_scale_ptrs = ( - a_fp4_scale_ptr - + offs_a_fp4_scale_m[:, None] * stride_a_fp4_scale_m - + offs_k_fp4_scale[None, :] * stride_a_fp4_scale_k - ) - - if ADD_BIAS_FP4: - offs_b_fp4_n_bias = ( - pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - ) % N_fp4 - if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): - accumulator_fp4 = tl.load(bias_fp4_ptr + offs_b_fp4_n_bias).to( - dtype=tl.float32 - ) - accumulator_fp4 = tl.broadcast_to( - accumulator_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) - ) - else: - accumulator_fp4 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - else: - accumulator_fp4 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - if BLOCK_SIZE_M < 32: - a_scale = tl.load(a_fp4_scale_ptrs) - else: - a_scale = ( - tl.load(a_fp4_scale_ptrs) - .reshape( - BLOCK_SIZE_M // 32, - BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, - 4, - 16, - 2, - 2, - 1, - ) - .permute(0, 5, 3, 1, 4, 2, 6) - .reshape(BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - ) - - b_scale = ( - tl.load(b_fp4_scale_ptrs, cache_modifier=cache_modifier) - .reshape( - BLOCK_SIZE_N // 32, - BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, - 4, - 16, - 2, - 2, - 1, - ) - .permute(0, 5, 3, 1, 4, 2, 6) - .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - ) - - if EVEN_K: - a = tl.load(a_fp4_ptrs) - b = tl.load(b_fp4_ptrs, cache_modifier=cache_modifier) - # else: - # a = tl.load( - # a_fp4_ptrs, - # mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), - # other=0.0 - # ) - # b = tl.load( - # b_fp4_ptrs, - # mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), - # other=0.0, - # cache_modifier=cache_modifier, - # ) - - b = ( - b.reshape( - 1, - BLOCK_SIZE_N // 16, - BLOCK_SIZE_K // 64, - 2, - 16, - 16, - ) - .permute(0, 1, 4, 2, 3, 5) - .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // 2) - .trans(1, 0) - ) - - accumulator_fp4 += tl.dot_scaled(a, a_scale, "e2m1", b, b_scale, "e2m1") - - a_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_a_fp4_k - b_fp4_ptrs += (BLOCK_SIZE_K // 2) * 16 * stride_b_fp4_k - if BLOCK_SIZE_M < 32: - a_fp4_scale_ptrs += ( - BLOCK_SIZE_K // SCALE_GROUP_SIZE - ) * stride_a_fp4_scale_k - else: - a_fp4_scale_ptrs += BLOCK_SIZE_K * stride_a_fp4_scale_k - b_fp4_scale_ptrs += BLOCK_SIZE_K * stride_b_fp4_scale_k - - c_fp4 = accumulator_fp4.to(c_fp4_ptr.type.element_ty) - - offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( - tl.int64 - ) - offs_c_fp4_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( - 0, BLOCK_SIZE_N - ).to(tl.int64) - c_fp4_ptrs = ( - c_fp4_ptr - + stride_c_fp4_m * offs_cm[:, None] - + stride_c_fp4_n * offs_c_fp4_n[None, :] - + pid_k * stride_c_fp4_k - ) - c_fp4_mask = (offs_cm[:, None] < M) & (offs_c_fp4_n[None, :] < N_fp4) - tl.store(c_fp4_ptrs, c_fp4, mask=c_fp4_mask, cache_modifier=".wt") - else: - pid_n -= num_pid_n_fp4 - K = 2 * K - - num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE, BLOCK_SIZE_K) - - offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K) - offs_k_bf16_split = pid_k * (SPLITK_BLOCK_SIZE) + offs_k_bf16 - offs_b_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16 - a_ptrs = a_bf16_ptr + ( - offs_am[:, None] * stride_a_bf16_m - + offs_k_bf16_split[None, :] * stride_a_bf16_k - ) - b_ptrs = b_bf16_ptr + ( - offs_k_bf16_split[:, None] * stride_b_bf16_k - + offs_b_bf16_n[None, :] * stride_b_bf16_n - ) - - if ADD_BIAS_BF16: - if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): - accumulator_bf16 = tl.load(bias_bf16_ptr + offs_b_bf16_n).to( - dtype=tl.float32 - ) - accumulator_bf16 = tl.broadcast_to( - accumulator_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) - ) - else: - accumulator_bf16 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - else: - accumulator_bf16 = tl.zeros( - (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 - ) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs, cache_modifier=cache_modifier) - else: - a = tl.load( - a_ptrs, - mask=offs_k_bf16[None, :] < K - k * BLOCK_SIZE_K, - other=0.0, - ) - b = tl.load( - b_ptrs, - mask=offs_k_bf16[:, None] < K - k * BLOCK_SIZE_K, - other=0.0, - cache_modifier=cache_modifier, - ) - - accumulator_bf16 += tl.dot(a, b, input_precision="ieee") - - a_ptrs += BLOCK_SIZE_K * stride_a_bf16_k - b_ptrs += BLOCK_SIZE_K * stride_b_bf16_k - - c_bf16 = accumulator_bf16.to(c_bf16_ptr.type.element_ty) - - # Write back the block of the output matrix C with masks. - offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( - tl.int64 - ) - offs_c_bf16_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( - 0, BLOCK_SIZE_N - ).to(tl.int64) - c_bf16_ptrs = ( - c_bf16_ptr - + stride_c_bf16_m * offs_cm[:, None] - + stride_c_bf16_n * offs_c_bf16_n[None, :] - + pid_k * stride_c_bf16_k - ) - c_bf16_mask = (offs_cm[:, None] < M) & (offs_c_bf16_n[None, :] < N_bf16) - tl.store(c_bf16_ptrs, c_bf16, mask=c_bf16_mask) - - -_gemm_afp4wfp4_a16w16_reduce_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_a16w16_reduce_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "ACTUAL_KSPLIT", - "MAX_KSPLIT", - "ADD_BIAS_FP4", - "ADD_BIAS_BF16", - ], -) - - -@triton.heuristics({}) # dummy heuristics to invoke kernel re-naming -@triton.jit(repr=_gemm_afp4wfp4_a16w16_reduce_repr) -def _fused_gemm_afp4wfp4_a16w16_reduce_kernel( - bias_fp4_ptr, - c_fp4_in_ptr, - c_fp4_out_ptr, - bias_bf16_ptr, - c_bf16_in_ptr, - c_bf16_out_ptr, - M, - N_fp4, - N_bf16, - stride_c_fp4_in_k, - stride_c_fp4_in_m, - stride_c_fp4_in_n, - stride_c_fp4_out_m, - stride_c_fp4_out_n, - stride_c_bf16_in_k, - stride_c_bf16_in_m, - stride_c_bf16_in_n, - stride_c_bf16_out_m, - stride_c_bf16_out_n, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - ACTUAL_KSPLIT: tl.constexpr, - MAX_KSPLIT: tl.constexpr, - ADD_BIAS_FP4: tl.constexpr, - ADD_BIAS_BF16: tl.constexpr, -): - - tl.assume(stride_c_fp4_in_k > 0) - tl.assume(stride_c_fp4_in_m > 0) - tl.assume(stride_c_fp4_in_n > 0) - tl.assume(stride_c_fp4_out_m > 0) - tl.assume(stride_c_fp4_out_n > 0) - - tl.assume(stride_c_bf16_in_k > 0) - tl.assume(stride_c_bf16_in_m > 0) - tl.assume(stride_c_bf16_in_n > 0) - tl.assume(stride_c_bf16_out_m > 0) - tl.assume(stride_c_bf16_out_n > 0) - - pid_m = tl.program_id(axis=0) - pid_n = tl.program_id(axis=1) - - tl.assume(pid_m >= 0) - tl.assume(pid_n >= 0) - - offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - - num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N) - offs_k = tl.arange(0, MAX_KSPLIT) - acc_dtype = tl.float32 if c_fp4_in_ptr.type.element_ty != tl.int8 else tl.int32 - - if pid_n < num_pid_n_fp4: - offs_fp4_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_fp4 - c_fp4_in_ptrs = ( - c_fp4_in_ptr - + (offs_k[:, None, None] * stride_c_fp4_in_k) - + (offs_m[None, :, None] * stride_c_fp4_in_m) - + (offs_fp4_n[None, None, :] * stride_c_fp4_in_n) - ) - - if ACTUAL_KSPLIT == MAX_KSPLIT: - c_fp4 = tl.load(c_fp4_in_ptrs) - else: - c_fp4 = tl.load( - c_fp4_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT, other=0.0 - ) - c_fp4 = tl.sum(c_fp4, axis=0) - if ADD_BIAS_FP4: - bias_fp4 = tl.load(bias_fp4_ptr + offs_fp4_n).to(dtype=acc_dtype) - bias_fp4 = tl.broadcast_to(bias_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N)) - c_fp4 += bias_fp4 - - c_fp4 = c_fp4.to(c_fp4_out_ptr.type.element_ty) - - c_fp4_out_ptrs = ( - c_fp4_out_ptr - + (offs_m[:, None] * stride_c_fp4_out_m) - + (offs_fp4_n[None, :] * stride_c_fp4_out_n) - ) - - tl.store(c_fp4_out_ptrs, c_fp4) - else: - pid_n -= num_pid_n_fp4 - - offs_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16 - c_bf16_in_ptrs = ( - c_bf16_in_ptr - + (offs_k[:, None, None] * stride_c_bf16_in_k) - + (offs_m[None, :, None] * stride_c_bf16_in_m) - + (offs_bf16_n[None, None, :] * stride_c_bf16_in_n) - ) - - if ACTUAL_KSPLIT == MAX_KSPLIT: - c_bf16 = tl.load(c_bf16_in_ptrs) - else: - c_bf16 = tl.load( - c_bf16_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT, other=0.0 - ) - c_bf16 = tl.sum(c_bf16, axis=0) - if ADD_BIAS_BF16: - bias_bf16 = tl.load(bias_bf16_ptr + offs_bf16_n).to(dtype=acc_dtype) - bias_bf16 = tl.broadcast_to( - bias_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) - ) - c_bf16 += bias_bf16 - - c_bf16 = c_bf16.to(c_bf16_out_ptr.type.element_ty) - - c_bf16_out_ptrs = ( - c_bf16_out_ptr - + (offs_m[:, None] * stride_c_bf16_out_m) - + (offs_bf16_n[None, :] * stride_c_bf16_out_n) - ) - c_bf16_mask = (offs_m[:, None] < M) & (offs_bf16_n[None, :] < N_bf16) - tl.store(c_bf16_out_ptrs, c_bf16, mask=c_bf16_mask) - - -@functools.lru_cache(maxsize=1024) -def _get_config( - M: int, - N_fp4: int, - N_bf16: int, - K: int, - shuffle: bool = False, -): - shuffle_filename_suffix = "" if not shuffle else "_PRESHUFFLED" - if not hasattr(_get_config, "_config_dict"): - dev = arch_info.get_device() - _get_config._config_dict = {} - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-FUSED-GEMM-AFP4WFP4{shuffle_filename_suffix}-A16W16.json" - with open(fpath, "r") as file: - config = json.load(file) - _get_config._config_dict["default"] = config - - key = f"{N_fp4}_{N_bf16}_{K}" - if key not in _get_config._config_dict.keys(): - dev = arch_info.get_device() - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-FUSED-GEMM-AFP4WFP4{shuffle_filename_suffix}-A16W16-N4={N_fp4}-N16={N_bf16}-K={2*K}.json" - if os.path.exists(fpath): - with open(fpath, "r") as file: - config = json.load(file) - _get_config._config_dict[key] = config - else: - key = "default" # fall back to default config - - if M < 16 and "small" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["small"] - elif M < 32 and "small_M16" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["small_M16"] - elif M <= 128: - BLK_M = triton.next_power_of_2(M) - if BLK_M == 32 and "medium_M32" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["medium_M32"] - elif BLK_M == 64 and "medium_M64" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["medium_M64"] - elif BLK_M == 128 and "medium_M128" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["medium_M128"] - elif M <= 256 and "large" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["large"] - else: - BLK_M = triton.next_power_of_2(M) - if f"xlarge_M{BLK_M}" in _get_config._config_dict[key]: - return _get_config._config_dict[key][f"xlarge_M{BLK_M}"] - elif "xlarge" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["xlarge"] - - return _get_config._config_dict[key]["any"] diff --git a/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_mul_add.py b/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_mul_add.py deleted file mode 100644 index 530616e509..0000000000 --- a/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_mul_add.py +++ /dev/null @@ -1,657 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -import functools -import json -import os -import triton -import triton.language as tl -from ..utils._triton.pid_preprocessing import pid_grid, remap_xcd -from ..utils._triton import arch_info -from ..utils.core import AITER_TRITON_CONFIGS_PATH -from ..utils._triton.kernel_repr import make_kernel_repr - - -_fused_gemm_afp4wfp4_mul_add_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_mul_add_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - -@triton.heuristics( - { - "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) - and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) - and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), - } -) -@triton.jit(repr=_fused_gemm_afp4wfp4_mul_add_repr) -def _fused_gemm_afp4wfp4_mul_add_kernel( - a_ptr, - b_ptr, - c_ptr, - a_scales_ptr, - b_scales_ptr, - c_a_ptr, - c_b_ptr, - M, - N, - K, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_ck, - stride_cm, - stride_cn, - stride_asm, - stride_ask, - stride_bsn, - stride_bsk, - stride_cam, - stride_can, - stride_cbm, - stride_cbn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - NUM_KSPLIT: tl.constexpr, - SPLITK_BLOCK_SIZE: tl.constexpr, - EVEN_K: tl.constexpr, - IS_A_SCALAR: tl.constexpr, - IS_B_SCALAR: tl.constexpr, - IS_A_TENSOR: tl.constexpr, - IS_B_TENSOR: tl.constexpr, - FUSE_TYPE: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, - cache_modifier: tl.constexpr, -): - """ - Kernel for computing the matmul C = A x B. - A and B inputs are in the microscale fp4 (mxfp4) format. - A_scales and B_scales are in e8m0 format. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - - tl.assume(stride_am > 0) - tl.assume(stride_ak > 0) - tl.assume(stride_bk > 0) - tl.assume(stride_bn > 0) - tl.assume(stride_cm > 0) - tl.assume(stride_cn > 0) - tl.assume(stride_asm > 0) - tl.assume(stride_ask > 0) - tl.assume(stride_bsk > 0) - tl.assume(stride_bsn > 0) - - GRID_MN = tl.cdiv(M, BLOCK_SIZE_M) * tl.cdiv(N, BLOCK_SIZE_N) - - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - pid_unified = tl.program_id(axis=0) - # remap so that XCDs get continous chunks of pids (of CHUNK_SIZE). - pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) - - pid_k = pid_unified % NUM_KSPLIT - pid = pid_unified // NUM_KSPLIT - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - - if NUM_KSPLIT == 1: - pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) - else: - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - - tl.assume(pid_m >= 0) - tl.assume(pid_n >= 0) - # We assume 32 elements along K share the same scale. - SCALE_GROUP_SIZE: tl.constexpr = 32 - - if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: - - num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) - - # Create pointers for first block of A and B input matrices - # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. - offs_k = tl.arange(0, BLOCK_SIZE_K // 2) - offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - a_ptrs = a_ptr + ( - offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak - ) - b_ptrs = b_ptr + ( - offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn - ) - # Create pointers for the first block of A and B scales - offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange( - 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE - ) - a_scale_ptrs = ( - a_scales_ptr + offs_am[:, None] * stride_asm + offs_ks[None, :] * stride_ask - ) - # B scales are N x K even though B operand is K x N. - b_scale_ptrs = ( - b_scales_ptr + offs_bn[:, None] * stride_bsn + offs_ks[None, :] * stride_bsk - ) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - a_scales = tl.load(a_scale_ptrs) - b_scales = tl.load(b_scale_ptrs, cache_modifier=cache_modifier) - - # Load the next block of A and B, generate a mask by checking the K dimension. - # If it is out of bounds, set it to 0. - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs, cache_modifier=cache_modifier) - else: - a = tl.load( - a_ptrs, mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), other=0 - ) - b = tl.load( - b_ptrs, - mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), - other=0, - cache_modifier=cache_modifier, - ) - - accumulator = tl.dot_scaled( - a, a_scales, "e2m1", b, b_scales, "e2m1", accumulator - ) - - # Advance the ptrs to the next K block. - a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak - b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask - b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk - - # Write back the block of the output matrix C with masks. - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - - if NUM_KSPLIT == 1: - if IS_A_SCALAR and IS_A_TENSOR: - c_a = tl.load(c_a_ptr) - elif IS_A_SCALAR: - c_a = c_a_ptr - else: - c_a = tl.load( - c_a_ptr - + stride_cam * offs_cm[:, None] - + stride_can * offs_cn[None, :], - mask=c_mask, - ) - c_a = c_a.to(tl.float32) - - if IS_B_SCALAR and IS_B_TENSOR: - c_b = tl.load(c_b_ptr) - elif IS_B_SCALAR: - c_b = c_b_ptr - else: - c_b = tl.load( - c_b_ptr - + stride_cbm * offs_cm[:, None] - + stride_cbn * offs_cn[None, :], - mask=c_mask, - ) - c_b = c_b.to(tl.float32) - - if FUSE_TYPE == 0: - accumulator = c_a * accumulator + c_b - else: - accumulator = c_b * c_a + accumulator - - c = accumulator.to(c_ptr.type.element_ty) - - c_ptrs = ( - c_ptr - + stride_cm * offs_cm[:, None] - + stride_cn * offs_cn[None, :] - + pid_k * stride_ck - ) - tl.store(c_ptrs, c, mask=c_mask) - - -_fused_gemm_afp4wfp4_preshuffle_mul_add_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_preshuffle_mul_add_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - -@triton.heuristics( - { - "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) - and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) - and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), - } -) -@triton.jit(repr=_fused_gemm_afp4wfp4_preshuffle_mul_add_repr) -def _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel( - a_ptr, - b_ptr, - c_ptr, - a_scales_ptr, - b_scales_ptr, - c_a_ptr, - c_b_ptr, - M, - N, - K, - stride_am, - stride_ak, - stride_bn, - stride_bk, - stride_ck, - stride_cm, - stride_cn, - stride_asm, - stride_ask, - stride_bsn, - stride_bsk, - stride_cam, - stride_can, - stride_cbm, - stride_cbn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - NUM_KSPLIT: tl.constexpr, - SPLITK_BLOCK_SIZE: tl.constexpr, - EVEN_K: tl.constexpr, - IS_A_SCALAR: tl.constexpr, - IS_B_SCALAR: tl.constexpr, - IS_A_TENSOR: tl.constexpr, - IS_B_TENSOR: tl.constexpr, - FUSE_TYPE: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, - cache_modifier: tl.constexpr, -): - """ - Kernel for computing the matmul C = A x B. - A and B inputs are in the microscale fp4 (mxfp4) format. - A_scales and B_scales are in e8m0 format. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - - tl.assume(stride_am > 0) - tl.assume(stride_ak > 0) - tl.assume(stride_bk > 0) - tl.assume(stride_bn > 0) - tl.assume(stride_cm > 0) - tl.assume(stride_cn > 0) - tl.assume(stride_asm > 0) - tl.assume(stride_ask > 0) - tl.assume(stride_bsk > 0) - tl.assume(stride_bsn > 0) - - GRID_MN = tl.cdiv(M, BLOCK_SIZE_M) * tl.cdiv(N, BLOCK_SIZE_N) - - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - pid_unified = tl.program_id(axis=0) - pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) - pid_k = pid_unified % NUM_KSPLIT - pid = pid_unified // NUM_KSPLIT - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - - if NUM_KSPLIT == 1: - pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) - else: - pid_m = pid // num_pid_n - pid_n = pid % num_pid_n - - tl.assume(pid_m >= 0) - tl.assume(pid_n >= 0) - # We assume 32 elements along K share the same scale. - SCALE_GROUP_SIZE: tl.constexpr = 32 - - if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: - - num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) - - # Create pointers for first block of A and B input matrices - # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. - offs_k = tl.arange(0, BLOCK_SIZE_K // 2) - offs_k_shuffle_arr = tl.arange(0, (BLOCK_SIZE_K // 2) * 16) - offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k - offs_k_shuffle = pid_k * (SPLITK_BLOCK_SIZE // 2) * 16 + offs_k_shuffle_arr - - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_bn = (pid_n * (BLOCK_SIZE_N // 16) + tl.arange(0, BLOCK_SIZE_N // 16)) % N - a_ptrs = a_ptr + ( - offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak - ) - b_ptrs = b_ptr + ( - offs_bn[:, None] * stride_bn + offs_k_shuffle[None, :] * stride_bk - ) - - # Create pointers for the first block of A and B scales - offs_asn = ( - pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) - ) % N - offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) * 32) + tl.arange( - 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE * 32 - ) - # B scales are N x K even though B operand is K x N. - b_scale_ptrs = ( - b_scales_ptr - + offs_asn[:, None] * stride_bsn - + offs_ks[None, :] * stride_bsk - ) - - if BLOCK_SIZE_M < 32: - offs_ks_non_shufl = ( - pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) - ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - a_scale_ptrs = ( - a_scales_ptr - + offs_am[:, None] * stride_asm - + offs_ks_non_shufl[None, :] * stride_ask - ) - else: - offs_asm = ( - pid_m * (BLOCK_SIZE_M // 32) + tl.arange(0, (BLOCK_SIZE_M // 32)) - ) % M - a_scale_ptrs = ( - a_scales_ptr - + offs_asm[:, None] * stride_asm - + offs_ks[None, :] * stride_ask - ) - - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): - if BLOCK_SIZE_M < 32: - a_scales = tl.load(a_scale_ptrs) - else: - a_scales = ( - tl.load(a_scale_ptrs) - .reshape( - BLOCK_SIZE_M // 32, - BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, - 4, - 16, - 2, - 2, - 1, - ) - .permute(0, 5, 3, 1, 4, 2, 6) - .reshape(BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - ) - - b_scales = ( - tl.load(b_scale_ptrs, cache_modifier=cache_modifier) - .reshape( - BLOCK_SIZE_N // 32, - BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, - 4, - 16, - 2, - 2, - 1, - ) - .permute(0, 5, 3, 1, 4, 2, 6) - .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE) - ) - - # Load the next block of A and B, generate a mask by checking the K dimension. - # If it is out of bounds, set it to 0. - if EVEN_K: - a = tl.load(a_ptrs) - b = tl.load(b_ptrs, cache_modifier=cache_modifier) - - b = ( - b.reshape( - 1, - BLOCK_SIZE_N // 16, - BLOCK_SIZE_K // 64, - 2, - 16, - 16, - ) - .permute(0, 1, 4, 2, 3, 5) - .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // 2) - .trans(1, 0) - ) - - accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") - - # Advance the ptrs to the next K block. - a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak - b_ptrs += (BLOCK_SIZE_K // 2) * 16 * stride_bk - if BLOCK_SIZE_M < 32: - a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask - else: - a_scale_ptrs += BLOCK_SIZE_K * stride_ask - b_scale_ptrs += BLOCK_SIZE_K * stride_bsk - - # Write back the block of the output matrix C with masks. - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - - if NUM_KSPLIT == 1: - if IS_A_SCALAR and IS_A_TENSOR: - c_a = tl.load(c_a_ptr) - elif IS_A_SCALAR: - c_a = c_a_ptr - else: - c_a = tl.load( - c_a_ptr - + stride_cam * offs_cm[:, None] - + stride_can * offs_cn[None, :], - mask=c_mask, - ) - c_a = c_a.to(tl.float32) - - if IS_B_SCALAR and IS_B_TENSOR: - c_b = tl.load(c_b_ptr) - elif IS_B_SCALAR: - c_b = c_b_ptr - else: - c_b = tl.load( - c_b_ptr - + stride_cbm * offs_cm[:, None] - + stride_cbn * offs_cn[None, :], - mask=c_mask, - ) - c_b = c_b.to(tl.float32) - - if FUSE_TYPE == 0: - accumulator = c_a * accumulator + c_b - else: - accumulator = c_b * c_a + accumulator - - c = accumulator.to(c_ptr.type.element_ty) - - c_ptrs = ( - c_ptr - + stride_cm * offs_cm[:, None] - + stride_cn * offs_cn[None, :] - + pid_k * stride_ck - ) - tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt") - - -_fused_gemm_afp4wfp4_mul_add_reduce_repr = make_kernel_repr( - "_fused_gemm_afp4wfp4_mul_add_reduce_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "ACTUAL_KSPLIT", - "MAX_KSPLIT", - ], -) - - -@triton.heuristics({}) # dummy heuristics to invoke kernel re-naming -@triton.jit(repr=_fused_gemm_afp4wfp4_mul_add_reduce_repr) -def _fused_gemm_afp4wfp4_mul_add_reduce_kernel( - c_in_ptr, - c_out_ptr, - c_a_ptr, - c_b_ptr, - M, - N, - stride_c_in_k, - stride_c_in_m, - stride_c_in_n, - stride_c_out_m, - stride_c_out_n, - stride_cam, - stride_can, - stride_cbm, - stride_cbn, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - ACTUAL_KSPLIT: tl.constexpr, - MAX_KSPLIT: tl.constexpr, - IS_A_SCALAR: tl.constexpr, - IS_B_SCALAR: tl.constexpr, - IS_A_TENSOR: tl.constexpr, - IS_B_TENSOR: tl.constexpr, - FUSE_TYPE: tl.constexpr, -): - - pid_m = tl.program_id(axis=0) - pid_n = tl.program_id(axis=1) - - offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - offs_k = tl.arange(0, MAX_KSPLIT) - c_in_ptrs = ( - c_in_ptr - + (offs_k[:, None, None] * stride_c_in_k) - + (offs_m[None, :, None] * stride_c_in_m) - + (offs_n[None, None, :] * stride_c_in_n) - ) - - if ACTUAL_KSPLIT == MAX_KSPLIT: - c = tl.load(c_in_ptrs) - else: - c = tl.load(c_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT) - c = tl.sum(c, axis=0) - - if IS_A_SCALAR and IS_A_TENSOR: - c_a = tl.load(c_a_ptr) - elif IS_A_SCALAR: - c_a = c_a_ptr - else: - c_a = tl.load( - c_a_ptr + stride_cam * offs_m[:, None] + stride_can * offs_n[None, :] - ) - c_a = c_a.to(tl.float32) - - if IS_B_SCALAR and IS_B_TENSOR: - c_b = tl.load(c_b_ptr) - elif IS_B_SCALAR: - c_b = c_b_ptr - else: - c_b = tl.load( - c_b_ptr + stride_cbm * offs_m[:, None] + stride_cbn * offs_n[None, :] - ) - c_b = c_b.to(tl.float32) - - if FUSE_TYPE == 0: - c = c_a * c + c_b - else: - c = c_b * c_a + c - c = c.to(c_out_ptr.type.element_ty) - - c_out_ptrs = ( - c_out_ptr - + (offs_m[:, None] * stride_c_out_m) - + (offs_n[None, :] * stride_c_out_n) - ) - - tl.store(c_out_ptrs, c) - - -@functools.lru_cache(maxsize=1024) -def _get_config( - M: int, - N: int, - K: int, - shuffle: bool = False, -): - shuffle_filename_suffix = "" if not shuffle else "_PRESHUFFLED" - if not hasattr(_get_config, "_config_dict") or not hasattr( - _get_config._config_dict, f"default{shuffle_filename_suffix}" - ): - dev = arch_info.get_device() - _get_config._config_dict = {} - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-AFP4WFP4{shuffle_filename_suffix}.json" - with open(fpath, "r") as file: - config = json.load(file) - _get_config._config_dict[f"default{shuffle_filename_suffix}"] = config - - key = f"{N}_{K}{shuffle_filename_suffix}" - if key not in _get_config._config_dict.keys(): - dev = arch_info.get_device() - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-AFP4WFP4{shuffle_filename_suffix}-N={N}-K={2*K}.json" - if os.path.exists(fpath): - with open(fpath, "r") as file: - config = json.load(file) - _get_config._config_dict[key] = config - else: - key = f"default{shuffle_filename_suffix}" # fall back to default config - - if M < 32: - BLK_M = triton.next_power_of_2(M) - if BLK_M >= 16 and "small_M16" in _get_config._config_dict[key]: - return _get_config._config_dict[key]["small_M16"] - return _get_config._config_dict[key]["small"] - elif M <= 128: - BLK_M = triton.next_power_of_2(M) - if BLK_M == 32: - return _get_config._config_dict[key]["medium_M32"] - elif BLK_M == 64: - return _get_config._config_dict[key]["medium_M64"] - elif BLK_M == 128: - return _get_config._config_dict[key]["medium_M128"] - elif M <= 256: - return _get_config._config_dict[key]["large"] - else: - BLK_M = triton.next_power_of_2(M) - if f"xlarge_M{BLK_M}" in _get_config._config_dict[key]: - return _get_config._config_dict[key][f"xlarge_M{BLK_M}"] - return _get_config._config_dict[key]["xlarge"] diff --git a/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py b/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py index a4c79043ab..d17ad95af0 100644 --- a/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py +++ b/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py @@ -33,7 +33,6 @@ def _fused_rms_mxfp4_quant_kernel( out1_bs_ptr, out2_ptr, out_res1_ptr, - out1_ptr, eps1, eps2, M, @@ -47,14 +46,12 @@ def _fused_rms_mxfp4_quant_kernel( out1_bs_stride_n, out2_stride_m, out_res1_stride_m, - out1_stride_m, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_N2: tl.constexpr, MXFP4_QUANT_BLOCK_SIZE: tl.constexpr, HAS_SECOND_INPUT: tl.constexpr, FIRST_INPUT_RES: tl.constexpr, - FIRST_INPUT_OUT: tl.constexpr, SCALE_N: tl.constexpr, SCALE_M_PAD: tl.constexpr, SCALE_N_PAD: tl.constexpr, @@ -142,14 +139,6 @@ def _fused_rms_mxfp4_quant_kernel( w1 = tl.load(w1_ptr + x_offs_n, mask=w_mask1, other=w_other1).to(tl.float32) norm1 = _rmsmorm_op(x1, w1, N1, eps1) - - if FIRST_INPUT_OUT: - tl.store( - out1_ptr + x_offs_m[:, None] * out1_stride_m + x_offs_n[None, :], - norm1, - mask=mask1, - ) - out1_fp4, bs_e8m0 = _mxfp4_quant_op( norm1, BLOCK_SIZE_N, BLOCK_SIZE_M, MXFP4_QUANT_BLOCK_SIZE ) @@ -264,217 +253,3 @@ def _fused_flatten_mxfp4_quant( out_block_scales, mask=block_scale_offs < tl.cdiv(N2, MXFP4_QUANT_BLOCK_SIZE), ) - - -@triton.heuristics( - { - "EVEN_M_N": lambda args: args["M"] % args["BLOCK_SIZE_M1"] == 0 - and args["N1"] % (args["BLOCK_SIZE_N1"] * args["NUM_ITER"]) == 0, - } -) -@triton.jit -def _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel( - x_ptr, - y_ptr, - y_scale_ptr, - x2_ptr, - y2_ptr, - stride_x_spk, - stride_x_m, - stride_x_n, - stride_y_m, - stride_y_n, - stride_y_scale_m, - stride_y_scale_n, - stride_x2_spk, - stride_x2_m, - stride_x2_n, - stride_y2_m, - stride_y2_n, - M, - N1, - N2, - BLOCK_SIZE_M1: tl.constexpr, - BLOCK_SIZE_N1: tl.constexpr, - BLOCK_SIZE_M2: tl.constexpr, - BLOCK_SIZE_N2: tl.constexpr, - NUM_ITER: tl.constexpr, - NUM_STAGES: tl.constexpr, - MXFP4_QUANT_BLOCK_SIZE: tl.constexpr, - EVEN_M_N: tl.constexpr, - SCALING_MODE: tl.constexpr, - ACTIVATION: tl.constexpr, - scaleN: tl.constexpr, - scaleM_pad: tl.constexpr, - scaleN_pad: tl.constexpr, - SHUFFLE: tl.constexpr, - X_HAS_SPLITK: tl.constexpr, - X_NUM_KSPLIT: tl.constexpr, - X_NUM_KSPLIT_POW2: tl.constexpr, -): - - tl.assume(stride_x_spk > 0) - tl.assume(stride_x_m > 0) - tl.assume(stride_x_n > 0) - tl.assume(stride_y_m > 0) - tl.assume(stride_y_n > 0) - tl.assume(stride_y_scale_m > 0) - tl.assume(stride_y_scale_n > 0) - tl.assume(stride_x2_spk > 0) - tl.assume(stride_x2_m > 0) - tl.assume(stride_x2_n > 0) - tl.assume(stride_y2_m > 0) - tl.assume(stride_y2_n > 0) - - all_pid = tl.program_id(axis=0) - num_pid_m1 = tl.cdiv(M, BLOCK_SIZE_M1) - num_pid_n1 = tl.cdiv(N1, BLOCK_SIZE_N1 * NUM_ITER) - num_pid_1 = num_pid_m1 * num_pid_n1 - - if X_HAS_SPLITK and all_pid >= num_pid_1: - pid2 = all_pid - num_pid_1 - num_pid_n2 = tl.cdiv(N2, BLOCK_SIZE_N2) - pid_m2 = pid2 // num_pid_n2 - pid_n2 = pid2 % num_pid_n2 - offs_m2 = (pid_m2 * BLOCK_SIZE_M2 + tl.arange(0, BLOCK_SIZE_M2)) % M - offs_n2 = (pid_n2 * BLOCK_SIZE_N2 + tl.arange(0, BLOCK_SIZE_N2)) % N2 - offs_spk = tl.arange(0, X_NUM_KSPLIT_POW2) - x2_ptrs = ( - x2_ptr - + offs_spk[:, None, None] * stride_x2_spk - + offs_m2[None, :, None] * stride_x2_m - + offs_n2[None, None, :] * stride_x2_n - ) - if X_NUM_KSPLIT_POW2 == X_NUM_KSPLIT: - x2 = tl.load(x2_ptrs) - else: - x2 = tl.load( - x2_ptrs, mask=offs_spk[:, None, None] < X_NUM_KSPLIT, other=0.0 - ) - x2 = tl.sum(x2, axis=0) - - x2 = x2.to(y2_ptr.type.element_ty) - - y2_out_ptrs = ( - y2_ptr + (offs_m2[:, None] * stride_y2_m) + (offs_n2[None, :] * stride_y2_n) - ) - - tl.store(y2_out_ptrs, x2) - return - - pid_m = all_pid // num_pid_n1 - start_n = all_pid % num_pid_n1 * NUM_ITER - NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE_N1 // MXFP4_QUANT_BLOCK_SIZE - - offs_spk = None - if X_HAS_SPLITK: - offs_spk = tl.arange(0, X_NUM_KSPLIT_POW2) - - for pid_n in tl.range(start_n, min(start_n + NUM_ITER, N1), num_stages=NUM_STAGES): - x_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1) - x_offs_n = pid_n * BLOCK_SIZE_N1 + tl.arange(0, BLOCK_SIZE_N1) - - mask = None - other = None - if X_HAS_SPLITK: - x_ptrs = ( - x_ptr - + offs_spk[:, None, None] * stride_x_spk - + x_offs_m[None, :, None] * stride_x_m - + x_offs_n[None, None, :] * stride_x_n - ) - if X_NUM_KSPLIT_POW2 != X_NUM_KSPLIT and not EVEN_M_N: - mask = ( - (offs_spk[:, None, None] < X_NUM_KSPLIT) - & (x_offs_m[None, :, None] < M) - & (x_offs_n[None, None, :] < N1) - ) - other = 0.0 - elif not (X_NUM_KSPLIT_POW2 == X_NUM_KSPLIT): - mask = offs_spk[:, None, None] < X_NUM_KSPLIT - other = 0.0 - elif not EVEN_M_N: - mask = (x_offs_m[None, :, None] < M) & (x_offs_n[None, None, :] < N1) - other = 0.0 - else: - x_ptrs = ( - x_ptr + x_offs_m[:, None] * stride_x_m + x_offs_n[None, :] * stride_x_n - ) - if not EVEN_M_N: - mask = (x_offs_m[:, None] < M) & (x_offs_n[None, :] < N1) - other = 0.0 - - x = tl.load( - x_ptrs, - mask=mask, - other=other, - cache_modifier=".cg", - ).to(tl.float32) - x_mul = tl.load( - x_ptrs + N1 * stride_x_n, - mask=mask, - other=other, - cache_modifier=".cg", - ).to(tl.float32) - - if X_HAS_SPLITK: - x = tl.sum(x, axis=0) - x_mul = tl.sum(x_mul, axis=0) - - # x = _apply_activation_from_str(a, ACTIVATION) * b - x = ACTIVATION(x) * x_mul - - y, y_scale = _mxfp4_quant_op( - x, BLOCK_SIZE_N1, BLOCK_SIZE_M1, MXFP4_QUANT_BLOCK_SIZE - ) - - out_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1) - # out_offs_m = x_offs_m - out_offs_n = pid_n * BLOCK_SIZE_N1 // 2 + tl.arange(0, BLOCK_SIZE_N1 // 2) - out_offs = out_offs_m[:, None] * stride_y_m + out_offs_n[None, :] * stride_y_n - - if EVEN_M_N: - tl.store(y_ptr + out_offs, y) - else: - out_mask = (out_offs_m < M)[:, None] & (out_offs_n < (N1 // 2))[None, :] - tl.store(y_ptr + out_offs, y, mask=out_mask) - - bs_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1) - # bs_offs_m = x_offs_m - bs_offs_n = pid_n * NUM_QUANT_BLOCKS + tl.arange(0, NUM_QUANT_BLOCKS) - if SHUFFLE: - bs_offs_0 = bs_offs_m[:, None] // 32 - bs_offs_1 = bs_offs_m[:, None] % 32 - bs_offs_2 = bs_offs_1 % 16 - bs_offs_1 = bs_offs_1 // 16 - bs_offs_3 = bs_offs_n[None, :] // 8 - bs_offs_4 = bs_offs_n[None, :] % 8 - bs_offs_5 = bs_offs_4 % 4 - bs_offs_4 = bs_offs_4 // 4 - bs_offs = ( - bs_offs_1 - + bs_offs_4 * 2 - + bs_offs_2 * 2 * 2 - + bs_offs_5 * 2 * 2 * 16 - + bs_offs_3 * 2 * 2 * 16 * 4 - + bs_offs_0 * 2 * 16 * scaleN - ) - bs_mask1 = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :] - bs_mask = (bs_offs_m < scaleM_pad)[:, None] & (bs_offs_n < scaleN_pad)[ - None, : - ] - y_scale = tl.where(bs_mask1, y_scale, 127) - else: - bs_offs = ( - bs_offs_m[:, None] * stride_y_scale_m - + bs_offs_n[None, :] * stride_y_scale_n - ) - bs_mask = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :] - if EVEN_M_N: - tl.store(y_scale_ptr + bs_offs, y_scale) - else: - tl.store( - y_scale_ptr + bs_offs, - y_scale, - mask=bs_mask, - ) diff --git a/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py index cdbbbadb8a..514f00cab6 100644 --- a/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py +++ b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py @@ -13,18 +13,57 @@ _gemm_afp4wfp4_repr = make_kernel_repr( - "_gemm_afp4wfp4_kernel", + "_gemm_afp4_wfp4_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", + "NUM_KSPLIT", + "SPLITK_BLOCK_SIZE", + "EVEN_K", "cache_modifier", + ], +) + + +_gemm_afp4wfp4_preshuffled_repr = make_kernel_repr( + "_gemm_afp4_wfp4_kernel_preshuffled_scales", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "BLOCK_SIZE_K", + "GROUP_SIZE_M", "NUM_KSPLIT", + "SPLITK_BLOCK_SIZE", + "EVEN_K", + "cache_modifier", + ], +) + + +_gemm_afp4wfp4_preshuffled_weight_scales_repr = make_kernel_repr( + "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "BLOCK_SIZE_K", + "GROUP_SIZE_M", + "NUM_KSPLIT", + "SPLITK_BLOCK_SIZE", + "EVEN_K", + "cache_modifier", + ], +) + + +_gemm_afp4wfp4_reduce_repr = make_kernel_repr( + "_gemm_afp4_wfp4_reduce_kernel", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "ACTUAL_KSPLIT", + "MAX_KSPLIT", ], ) @@ -37,7 +76,7 @@ } ) @triton.jit(repr=_gemm_afp4wfp4_repr) -def _gemm_afp4wfp4_kernel( +def _gemm_afp4_wfp4_kernel( a_ptr, b_ptr, c_ptr, @@ -65,10 +104,6 @@ def _gemm_afp4wfp4_kernel( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -189,23 +224,6 @@ def _gemm_afp4wfp4_kernel( tl.store(c_ptrs, c, mask=c_mask) -_gemm_afp4wfp4_preshuffle_scales_repr = make_kernel_repr( - "_gemm_afp4wfp4_preshuffle_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - @triton.heuristics( { "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) @@ -213,8 +231,8 @@ def _gemm_afp4wfp4_kernel( and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), } ) -@triton.jit(repr=_gemm_afp4wfp4_preshuffle_scales_repr) -def _gemm_afp4wfp4_kernel_preshuffle_scales( +@triton.jit(repr=_gemm_afp4wfp4_preshuffled_repr) +def _gemm_afp4_wfp4_kernel_preshuffled_scales( a_ptr, b_ptr, c_ptr, @@ -242,10 +260,6 @@ def _gemm_afp4wfp4_kernel_preshuffle_scales( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -415,23 +429,6 @@ def _gemm_afp4wfp4_kernel_preshuffle_scales( tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt") -_gemm_afp4wfp4_preshuffle_repr = make_kernel_repr( - "_gemm_afp4wfp4_preshuffle_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) - - @triton.heuristics( { "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) @@ -439,8 +436,8 @@ def _gemm_afp4wfp4_kernel_preshuffle_scales( and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), } ) -@triton.jit(repr=_gemm_afp4wfp4_preshuffle_repr) -def _gemm_afp4wfp4_preshuffle_kernel( +@triton.jit(repr=_gemm_afp4wfp4_preshuffled_weight_scales_repr) +def _gemm_afp4_wfp4_kernel_preshuffled_weight_scales( a_ptr, b_ptr, c_ptr, @@ -468,10 +465,6 @@ def _gemm_afp4wfp4_preshuffle_kernel( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -532,10 +525,12 @@ def _gemm_afp4wfp4_preshuffle_kernel( offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak ) b_ptrs = b_ptr + ( - offs_bn[:, None] * stride_bn + offs_k_shuffle[None, :] * stride_bk + # offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + offs_bn[:, None] * stride_bn + + offs_k_shuffle[None, :] * stride_bk ) - # Create pointers for the first block of A and B scales + offs_asn = ( pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) ) % N @@ -650,20 +645,8 @@ def _gemm_afp4wfp4_preshuffle_kernel( tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt") -_gemm_afp4wfp4_reduce_repr = make_kernel_repr( - "_gemm_afp4wfp4_reduce_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "ACTUAL_KSPLIT", - "MAX_KSPLIT", - ], -) - - -@triton.heuristics({}) # dummy heuristics to invoke kernel re-naming @triton.jit(repr=_gemm_afp4wfp4_reduce_repr) -def _gemm_afp4wfp4_reduce_kernel( +def _gemm_afp4_wfp4_reduce_kernel( c_in_ptr, c_out_ptr, M, diff --git a/aiter/ops/triton/_triton_kernels/gemm_a16wfp4.py b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4_pre_quant_atomic.py similarity index 83% rename from aiter/ops/triton/_triton_kernels/gemm_a16wfp4.py rename to aiter/ops/triton/_triton_kernels/gemm_afp4wfp4_pre_quant_atomic.py index f0c2454821..0d27d412c6 100644 --- a/aiter/ops/triton/_triton_kernels/gemm_a16wfp4.py +++ b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4_pre_quant_atomic.py @@ -8,26 +8,25 @@ import torch import triton import triton.language as tl -from ..utils._triton.pid_preprocessing import pid_grid +from ..utils._triton.pid_preprocessing import pid_grid, remap_xcd from ..utils._triton import arch_info from ..utils.core import AITER_TRITON_CONFIGS_PATH -from ..utils._triton.kernel_repr import make_kernel_repr from .quant import _mxfp4_quant_op +from ..utils._triton.kernel_repr import make_kernel_repr -_gemm_a16wfp4_repr = make_kernel_repr( - "_gemm_a16wfp4_kernel", +_gemm_afp4wfp4_pre_quant_repr = make_kernel_repr( + "_gemm_afp4_wfp4_pre_quant_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", "NUM_KSPLIT", + "SPLITK_BLOCK_SIZE", + "EVEN_K", + "GRID_MN", + "cache_modifier", ], ) @@ -41,8 +40,8 @@ * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), } ) -@triton.jit(repr=_gemm_a16wfp4_repr) -def _gemm_a16wfp4_kernel( +@triton.jit(repr=_gemm_afp4wfp4_pre_quant_repr) +def _gemm_afp4_wfp4_pre_quant_kernel( a_ptr, b_ptr, c_ptr, @@ -67,15 +66,11 @@ def _gemm_a16wfp4_kernel( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, - num_warps: tl.constexpr, - num_stages: tl.constexpr, - waves_per_eu: tl.constexpr, - matrix_instr_nonkdim: tl.constexpr, GRID_MN: tl.constexpr, - ATOMIC_ADD: tl.constexpr, cache_modifier: tl.constexpr, ): - """Kernel for computing the matmul C = A x B. + """ + Kernel for computing the matmul C = A x B. A and B inputs are in the microscale fp4 (mxfp4) format. A_scales and B_scales are in e8m0 format. A has shape (M, K), B has shape (K, N) and C has shape (M, N) @@ -100,6 +95,8 @@ def _gemm_a16wfp4_kernel( num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) if NUM_KSPLIT == 1: + remap_xcd(pid, GRID_MN) + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) else: pid_m = pid // num_pid_n @@ -107,8 +104,6 @@ def _gemm_a16wfp4_kernel( tl.assume(pid_m >= 0) tl.assume(pid_n >= 0) - tl.assume(pid_k >= 0) - # We assume 32 elements along K share the same scale. SCALE_GROUP_SIZE: tl.constexpr = 32 @@ -153,15 +148,10 @@ def _gemm_a16wfp4_kernel( b = tl.load(b_ptrs, cache_modifier=cache_modifier) else: a_bf16 = tl.load( - a_ptrs, - mask=offs_k_bf16[None, :] < 2 * K - k * BLOCK_SIZE_K, - other=0, + a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0 ) b = tl.load( - b_ptrs, - mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), - other=0, - cache_modifier=cache_modifier, + b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 ) a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) @@ -185,29 +175,7 @@ def _gemm_a16wfp4_kernel( + pid_k * stride_ck ) c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - # if pid == 0: - # tl.device_print("c", c) - if ATOMIC_ADD: - tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed") - else: - tl.store(c_ptrs, c, mask=c_mask) - - -_gemm_a16wfp4_preshuffle_repr = make_kernel_repr( - "_gemm_a16wfp4_preshuffle_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "num_warps", - "num_stages", - "waves_per_eu", - "matrix_instr_nonkdim", - "cache_modifier", - "NUM_KSPLIT", - ], -) + tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed") def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): @@ -252,7 +220,7 @@ def _get_config( if not hasattr(_get_config, "_config_dict"): dev = arch_info.get_device() _get_config._config_dict = {} - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16WFP4.json" + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_PREQUANT-AFP4WFP4.json" with open(fpath, "r") as file: config = json.load(file) _get_config._config_dict["default"] = config @@ -260,9 +228,7 @@ def _get_config( key = f"{N}_{K}" if key not in _get_config._config_dict.keys(): dev = arch_info.get_device() - fpath = ( - f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16WFP4-N={N}-K={2*K}.json" - ) + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_PREQUANT-AFP4WFP4-N={N}-K={2*K}.json" if os.path.exists(fpath): with open(fpath, "r") as file: config = json.load(file) @@ -270,10 +236,8 @@ def _get_config( else: key = "default" # fall back to default config - if M < 16: + if M < 32: config = _get_config._config_dict[key]["small"] - elif M < 32: - config = _get_config._config_dict[key]["small_M16"] elif M <= 128: BLK_M = triton.next_power_of_2(M) if BLK_M == 32: @@ -289,4 +253,19 @@ def _get_config( config = config.copy() + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + return config diff --git a/aiter/ops/triton/batched_gemm_a16wfp4.py b/aiter/ops/triton/batched_gemm_a16wfp4.py deleted file mode 100755 index a10cc66bea..0000000000 --- a/aiter/ops/triton/batched_gemm_a16wfp4.py +++ /dev/null @@ -1,198 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -from typing import Optional -import torch -import triton -import aiter.ops.triton.utils._triton.arch_info as arch_info -from aiter.ops.triton._triton_kernels.batched_gemm_a16wfp4 import ( - _batched_gemm_a16wfp4_reduce_kernel, - _batched_gemm_a16wfp4_kernel, - _get_config, -) -from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton.gemm_a16wfp4 import ( - get_splitk, -) - -_LOGGER = AiterTritonLogger() - -global _USE_GEMM_SPLITK_BF16 -_USE_GEMM_SPLITK_BF16 = False - - -def set_use_gemm_splitk_bf16(value: bool): - global _USE_GEMM_SPLITK_BF16 - _USE_GEMM_SPLITK_BF16 = value - - -def batched_gemm_a16wfp4( - x, - w, - w_scales, - dtype: Optional[float] = torch.bfloat16, - y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, - transpose_bm: Optional[bool] = False, - prequant: Optional[bool] = True, - y_scale: Optional[torch.Tensor] = None, -): - """ - Computes batched FP4 matrix multiplication Y[i] = X[i] @ W[i]^T with active activation quantization. - X is quantized to MXFP4 during computation, W is pre-quantized FP4. - - Args: - x (torch.Tensor): Higher precision input batch with shape (B, M, K) (BF16 or FP16). - Quantized to MXFP4 on-the-fly during GEMM. - w (torch.Tensor): FP4 E2M1 weight batch with shape (B, N, K), internally transposed. - w_scales (torch.Tensor): E8M0 per-group scale for w with shape (B, N, K//32). - One scale per 32 elements in K dimension. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (B, M, N). - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). - transpose_bm (Optional[bool]): Transpose batch and M dimensions in output. - - - Returns: - torch.Tensor: Output batch with shape (B, M, N). - """ - _LOGGER.info( - f"BATCHED_GEMM_AFP4WFP_PREQUANT: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w.shape)}" - ) - - assert prequant is True, "prequant = False is not yet supported" - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - Bx, M, K = x.shape - Bw, N, K = w.shape - assert Bx == Bw - B = Bx - - if config is None: - config = _get_config(M, N, K) - - if y is None: - if transpose_bm: - y = torch.empty((M, B, N), dtype=dtype, device=x.device) - else: - y = torch.empty((B, M, N), dtype=dtype, device=x.device) - else: - if transpose_bm: - assert ( - y.shape[0] == M and y.shape[1] == B and y.shape[2] == N - ), f"Output dimension error {y.shape} {B} {M} {N}" - else: - assert ( - y.shape[0] == B and y.shape[1] == M and y.shape[2] == N - ), f"Output dimension error {y.shape} {B} {M} {N}" - - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - - if _USE_GEMM_SPLITK_BF16: - y_pp = torch.empty( - (B, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device - ) - else: - y_pp = torch.empty( - (B, config["NUM_KSPLIT"], M, N), - dtype=torch.float32, - device=y.device, - ) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - y_pp = None - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - - if config["NUM_KSPLIT"] == 1: - stride_ck = 0 - stride_cn = y.stride(2) - if transpose_bm: - stride_cb = y.stride(1) - stride_cm = y.stride(0) - else: - stride_cb = y.stride(0) - stride_cm = y.stride(1) - else: - stride_cb = y_pp.stride(0) - stride_ck = y_pp.stride(1) - stride_cm = y_pp.stride(2) - stride_cn = y_pp.stride(3) - - grid = lambda META: ( # noqa: E731 - B, - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), - ) - _batched_gemm_a16wfp4_kernel[grid]( - x, - w, - y if config["NUM_KSPLIT"] == 1 else y_pp, - w_scales, - y_scale, - M, - N, - K, - x.stride(0), - x.stride(1), - x.stride(2), - w.stride(0), - w.stride(1), - w.stride(2), - stride_cb, - stride_ck, - stride_cm, - stride_cn, - w_scales.stride(0), - w_scales.stride(1), - w_scales.stride(2), - PRE_QUANT=prequant, - HAVE_Y_SCALE=(y_scale is not None), - **config, - ) - - if config["NUM_KSPLIT"] > 1: - REDUCE_BLOCK_SIZE_M = 16 - # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails - # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and - # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials - REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 - ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) - - grid_reduce = ( - B, - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N, REDUCE_BLOCK_SIZE_N), - ) - _batched_gemm_a16wfp4_reduce_kernel[grid_reduce]( - y_pp, - y, - M, - N, - y_pp.stride(0), - y_pp.stride(1), - y_pp.stride(2), - y_pp.stride(3), - y.stride(0) if transpose_bm else y.stride(1), - y.stride(1) if transpose_bm else y.stride(0), - y.stride(2), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - config["NUM_KSPLIT"], - ) - return y diff --git a/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py b/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py index b5376dc766..8679344856 100755 --- a/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py +++ b/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py @@ -11,9 +11,6 @@ _get_config, ) from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton.batched_gemm_a16wfp4 import ( - batched_gemm_a16wfp4, -) _LOGGER = AiterTritonLogger() @@ -34,9 +31,126 @@ def batched_gemm_afp4wfp4_pre_quant( y: Optional[torch.Tensor] = None, config: Optional[dict] = None, ): + """ + Computes batched FP4 matrix multiplication Y[i] = X[i] @ W[i]^T with active activation quantization. + X is quantized to MXFP4 during computation, W is pre-quantized FP4. + + Args: + x (torch.Tensor): Higher precision input batch with shape (B, M, K) (BF16 or FP16). + Quantized to MXFP4 on-the-fly during GEMM. + w (torch.Tensor): FP4 E2M1 weight batch with shape (B, N, K), internally transposed. + w_scales (torch.Tensor): E8M0 per-group scale for w with shape (B, N, K//32). + One scale per 32 elements in K dimension. + dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). + y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (B, M, N). + config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). + + Returns: + torch.Tensor: Output batch with shape (B, M, N). + """ _LOGGER.info( - "batched_gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to batched_gemm_a16wfp4" + f"BATCHED_GEMM_AFP4WFP_PREQUANT: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w.shape)}" + ) + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + Bx, M, K = x.shape + Bw, N, K = w.shape + By, _, _ = y.shape + assert Bx == Bw == By + Batch = Bx + + if config is None: + config = _get_config(M, N, K) + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (Batch, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (Batch, config["NUM_KSPLIT"], M, N), + dtype=torch.float32, + device=y.device, + ) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + y_pp = None + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + + grid = lambda META: ( # noqa: E731 + Batch, + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), ) - return batched_gemm_a16wfp4( - x, w, w_scales, dtype, y, config, transpose_bm=False, prequant=True + _batched_gemm_afp4_wfp4_pre_quant_kernel[grid]( + x, + w, + y if config["NUM_KSPLIT"] == 1 else y_pp, + w_scales, + M, + N, + K, + x.stride(0), + x.stride(1), + x.stride(2), + w.stride(0), + w.stride(1), + w.stride(2), + y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), + 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), + y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), + y.stride(2) if config["NUM_KSPLIT"] == 1 else y_pp.stride(3), + w_scales.stride(0), + w_scales.stride(1), + w_scales.stride(2), + **config, ) + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) + + grid_reduce = ( + Batch, + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _batched_gemm_afp4_wfp4_pre_quant_reduce_kernel[grid_reduce]( + y_pp, + y, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y_pp.stride(3), + y.stride(0), + y.stride(1), + y.stride(2), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + config["NUM_KSPLIT"], + ) + return y diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json deleted file mode 100644 index 2778eb61a2..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "small": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "small_M16": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 8 - }, - "medium_M32": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 6, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 7 - }, - "medium_M64": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 7 - }, - "medium_M128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 4, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 4 - }, - "large": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 4 - }, - "xlarge": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 8, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 1 - } -} diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json deleted file mode 100644 index 21d51bd9a8..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "any": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - } -} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json deleted file mode 100644 index c356742b24..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "small": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "small_M16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "any": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - } -} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json index 848e1c288f..94947a5a1a 100644 --- a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json @@ -76,18 +76,5 @@ "cache_modifier": null, "NUM_KSPLIT": 1, "kpack": 1 - }, - "any": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 32, - "cache_modifier": null, - "NUM_KSPLIT": 1, - "kpack": 1 } } diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json deleted file mode 100644 index f2a37990bc..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "small": { - "BLOCK_SIZE_M": 4, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "medium_M32": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "medium_M64": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "medium_M128": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "large": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - }, - "xlarge": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 14 - } - -} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json deleted file mode 100644 index a7d0e5319e..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "small": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 4 - }, - "small_M16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 4 - }, - "medium_M32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 4 - }, - "medium_M64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 4, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "medium_M128": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 4, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 1 - }, - "large": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 4, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 1 - }, - "xlarge": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 1, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 1 - } -} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json deleted file mode 100644 index 87585de25d..0000000000 --- a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "small": { - "BLOCK_SIZE_M": 4, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "small_M16": { - "BLOCK_SIZE_M": 4, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "medium_M32": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "medium_M64": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "medium_M128": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "large": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": ".cg", - "NUM_KSPLIT": 1 - }, - "xlarge": { - "BLOCK_SIZE_M": 8, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 512, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 1, - "waves_per_eu": 2, - "matrix_instr_nonkdim": 16, - "cache_modifier": null, - "NUM_KSPLIT": 1 - } - -} diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a92d1d94da Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 6c9bbab49f..cf000a5aea 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ac0db73ed121f17392580b800e40b9bb87fb9d0369e185933a9886c5f4583ee1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "4971efc8d6396be9b0df4db743227b0777c6aa214766b931a446d515ce1a8695", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3170e87c52 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9134672e63..053718dae0 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "f70c711c78f7418d6182e8cfd2b0d0211ab59b720b83dcbbfd09de4594147fb5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..339d8b60b3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 821efe3ad8..30a971a865 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "de64f305bd6b13d6f447e75a3f28c36ee5dc346e4732e083f0080fad0c11172e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "50630419988688add7ab5f7992729c367ac76cd80edb7ed14b1c4f86a6af5938", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0a9d32b9b3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index fdc102e3f2..d36b270856 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "50524344e989cc18ac62628fc02d73d4163eb245c05d3868c90e9efe40f885ea", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..fd439c98f0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 826c8da5ae..2c41af514d 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f66e735c364132c4a96f7ab74f2c7f24fa8c65d66df35f9fa0e3d69d1cbecb82", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "900c95ea5068e521cac115dba2f5a39c95629558de25edd9ed355a2bca806bc9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..1c6d43bf8b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c160ecfc89..87d5d11a93 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "70389c6a931f80ad9dabdafe6366f140480dc46d3546e1a47f6e8038dffbbcbe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..fb4cfcdbb1 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ca4e8466cc..ad581d4eb7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "bf4271e7a83f7f7e1c1b4d82c565f2c0599c4cbcf518e758b1992254783f6b47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..f4f2b5f242 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 010147f21e..9d8fbfacef 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "576f5ce01ce40e9047a2afe995e3d1a807b4d8ba89ccfde6875bbcdf1bedc771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a07265a92c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9a0d6d26b6..ca1818a8b5 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "6d85fca9c7dda9dd3d31527b373b3c29db32528e788d6cb9143e0e81d6ae34af", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "87c999f39f544c659c4c4b3649617c5cfab67bc2a5df8a26c6227aa4cf4ea998", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..164a2daa55 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 0e2f5d06a6..4aea58cafe 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3fdba0f802a66c444714f4be12e76dc72fc690ff876052acc7cc46e5039ffa7d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file +{"hash": "7d4eee9825f37b996ae066357854ac66af67d8ff733626eb7ee22f12d90c425a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..11a81135b7 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 1ff26942cf..b62a824706 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "d0bbf4b796f5cdb38155e4196ee0ae86905a8fb5e602a193f2eff7a5e68d1c06", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "950f33955df085252de58736ed2fa6ca548cd5920743ecf8df8ce38db3a27bf1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..caf2d9663c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9b92656576..c3c9f54b2b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file +{"hash": "d794139ad9c7a2aa2f2fb6efaef5771241cd987dd8be123f9af2ee45a41127fa", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0f16aad417 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c8c8b85ef8..0c64b2b637 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "3e3805a7ab6809b0520f39281f72e918acfd2568d5b7d1852b7aa65ff6dede2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d216f4db45 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 389cb90b96..1282fab52e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "cf8a4eb32f6ee1258daeff2f719991d5e93a49a56b3d4e472ce1ce3ac1724886", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "b54b063171df37071e5d216a95968f9b0071bfc3f1dac8a6507d7d3412b3b2c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4b472f387c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 4ca7039532..24580e0921 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "9fd3df79022457865bb0cf42f522518c6cd3e7b1ab219fb72a2e44a73264f9c8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "af947e9c60407171fac651b5e73064363155003f3949a1280322f28c0bc82174", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..1a354efcee Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 193ff13c31..b81c00af1a 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "5dccf75c4c6643db197699c190cf3be8883f25f5c1b82c171c1e1f9a5acf5a54", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4d9bd9f32e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index b1a3df2bc0..cf1de878a4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "6fd92a5704cafe8fc8128fcfeb9580a709cc96f90ab4bd77cb30eb36fff0719a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..effc769ed2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 153a1e017f..3e7d0dbdf7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3c10221760281745da88cf8135cfa607de0f063d4ac484b4f523c9c36e1c0ebd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "f2d1a87b6a5690047dc9744b4dcbf4e6ca2d231295a0eb21b4de4d2d57d5c452", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3a94d7f3f9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index d3e5b004b4..2df2522e1c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "50d6ea63d4e28ea753cea3329b6ed6e5c066d3eabd224d8d425de7f4d3c364e2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "97cdf5b137cd798fc01173124f1fe7f434603233131809f90b9122692b5e0691", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5c9fa23455 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 19fa9e3080..f32f36bceb 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "bad417c8a82cc77f535d3fc28f0dc2a7427c276cb1dd6b0abb3ee12beca26028", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "7a8df9f76c7249c0727020baab6fc5a45bff3f61821a017af0bcdcb31a158d51", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2b7333aab5 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ca4e8466cc..6e600e607e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "a098a54b9f5edd21bdbd84ba2ae5bc0f6493e3ae9e7fbe11ba4f755d7d33b2c8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9f9940f169 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 26e386be8b..f0dd4491d2 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "edd9a4dd479bd0e61aa73929bb69bdedeff06acec387784d606306865431f18c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "ed379549185fb90aacc990a30e95b69837b80e2ba48fcbe8b2c328df34d911d2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..b2a2cfed56 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 31b4af3827..779fd69d94 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "652620172805dfbc029f1d71f6020daeee787441d5002a470dde957124663162", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file +{"hash": "3f103db1b06fbcc5665da2d706abfc42ab421a1e2136147b1fb729db1aa0c1de", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..07d026c9bc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 53a17c95ad..644acdd15d 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "5d7c6d40c30976d49ffb61cbf961b7fe950a36c3a37bd6ba59cec67527c19145", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "c468b044688faad941ae6530c535e4dc5ccab9ec70b273112a98fe310e96fbe5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..517dd3009b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ed71d1dedb..c18ad7066a 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "a843505af9d0de09f24ecbd9fb47b0ce8532558484ed9f55e38acfed4f4aa00e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 1, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8192, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_4_num_stages_1_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "00a3f26ad3db5e526e2ffc540824d99e1c060a1d6a6d27796dc7b6d5e2f28128", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 1, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8192, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..75c2c0f392 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ca4e8466cc..74fa020cf3 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "1f80213aa7e03eafff54b07fffeecfbe5013f46fdaad2a5092c34cccd87c2115", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d96d7107d0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 1285795a7d..343d8fdce0 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "8fc8efc57f3e712af9df23ab9931e5fd351bf1e02eea9fdca71a6a66937cf13e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "06c044aea0bc437a4798835deb75891e2ca4f556f7d00f2f3139a895210cbb8b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a69de9121f Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 18f2ff2202..d8d02b17f6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "72d78a34ddfbc40e0fb4f881355c529401dcdd0b759c87ef259f7bb2af00489a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "fddfd666ae3dc3d6b62572367cdeafad40d6e4e6bb921f30391f7428b1e1e338", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..786585e8e6 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c8c8b85ef8..638a5fd66a 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "674006f4c8ea7904eecc04cb91ea7fd771ffb64c1b070a084ec21c03ffe1f1c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3f9b8bc0f8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index eeaf944c6e..5cc7a5b2d1 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "443ae274e193fd6043c642cb8976463c41d70d5404bfc373ae30cc7127999df1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "dc7719c21d6c20b721db205eed9d3b7e5b88c2259f331a26d538e2b9da4193f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..161cc7f778 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index a65774671f..0f45827489 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "87c7858153ab8e434bec427abdaee357f7585e21511220f34366b12f60fd68ea", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "40729b4e2ffb586195b561a7924416e04e15972dcf0e08b6be64b5979c49d7be", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4d9bd9f32e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index b1a3df2bc0..cf1de878a4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "6fd92a5704cafe8fc8128fcfeb9580a709cc96f90ab4bd77cb30eb36fff0719a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2a8f53e1ec Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8d99b9dafd..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 3abb740476..94f8f302d4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index de5bb66c57..7235ff4fd4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c160ecfc89..f2eef37457 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5f22b90628 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index a981e911b6..aa851bad5e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "d313ce59ac66783429f1fce91908e814f031eea49a81174adc8e34d0096a2ec5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 010147f21e..307d8cda4e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 309d8ac9e8..65d5835012 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "37d651c067f3f7312122c03fa9c7dbe80fe2afdd2f5fcd3bf83363c2363d2389", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 21845be148..83ae9f44b8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0703f1c3c5 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 7966e519cc..441977fcda 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "0a47bc2419c29f318626dacd627dfb5495c1a5502f6538ec6c159de4bfd43f21", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 5370206b70..146ca2148f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ff17f59d450c0f6fd17e02dafe9880c20d9f0b50b2ea7671930c5fa093bdb1cb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..625ebad0aa Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 5ddc74b606..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "21134c6efa51cd932dd9df5b64c4b4b8baccaa9c75ecae3319de4d9c11304c13", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..6720a67345 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index df7471e61b..b94e9b5868 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "aba3b6ee938d70c1de141002417f1c921db453f895532640c9d2d65c780ab2b2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 4864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "69c533376c135f1466f40015aa8dbb2e47737901c0704ebae287d5a6c817625c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 4864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97be1f6400 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 0cb383c0c3..b9d277c8a3 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "182a8ed3e19738c0c42ca43be5de931da4416ebcd0509607b61372242c2a45a2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "ed3a75371cec725aed630ea65b4fa508941952f5e0c9471fa93107b230a4f03b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9bde266a37 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index e2f54b18e3..0c62a8849a 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "b22a5a1a4d14245543d7b2ad5bd91c1641772f593a921524c3e95329aa436da0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "ee7c8bc727b05b5294121866e6263da1994dc1ee7734e4888045d513b10cb4d4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..59feb2de7f Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9f301d7816..41c58a031b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "eb96864ca43b4613934d411fd2ea15dd9cfc555c57119d06c6f4f1f75dee56a8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "98a5f489f33182cd98a113529f0648b30b5042f9318b52363b3e9c54368c2f79", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1d87be86c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index f7360288f7..4a266b4bdb 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3df6d8326c4abd390dc4720816542d0605767741f96ee0230f8621937eb0fc5c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a51b2294a3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index cab57e198e..ee907f79de 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "d3d6536e323275a8550d70f546c7abffe289db8eaff89278776a63fdf1a0064d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "cd35a510fafc921b16596ea3787adfac00ce14bd5ca2f8194c08a2d8ce625c63", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a70e0a6ed4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index d9c0c6c203..70876a4dfe 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ab30d0921fd33b047a432093f7a8eba01043359d171b7de45de9ab54a5133867", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "ab1b8066b3ca873af3ef3ef52ab68a54be546b27071022826927f25df768ad7f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0214033b0d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 4c59a2d328..afda0cc597 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "bd51efc6fc2ebfad453142081d544dad23bb8d0bf5f803c532e453e2b8e9f6b6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "d5ae75cc2a5e451f4541f51892bebc6bade5f9e0fc50a14924d5df9d3e862ab1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..6da53f98e2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 4e75183db5..80d0d783bd 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "152f10b4ee54c3b8887d02aadc4117b0fb2e5d53830b3b21aa1567f1ddd59952", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "dd02aea24f07a469be50cb48315080339bec4331fb29dc3ab324044e4fad83d9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7de88c2416 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 14544f3cdc..0cac508631 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "6f059192e4847d649f5acf872608c5075589dc0091433f3a449c866c4d89292b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "f2988946eb27a029d039bab8743ccf128ede0f91ff0bdcabd59a3d0a8737b90a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..839ba3f892 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9a9dfc37d7..47c0104cea 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "58afbabb9e2270e3dbad9690b5cc1153ce3d2f4f17fe6755ae7b6230f4a29233", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "3d1105e6d7346400d4523456e30259ea586690de52c26e4bd7a2c6fdbd75d2c5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e58dac5e89 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 18dcdf62c7..2be5a94c79 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "5157457c9163fe8de2015403edc1507123d499ca3c9acacdb7e7cd108b19551c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "7ece21689ec170d622f73f8dd019d6603006f843506aa999a4ba733398455007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5a5927322e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index a3355a0fac..23034f50b9 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f057025d1f6a3f49c65ec89b7333757d4eb93f9615c6b919263fad3a110c104e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "0d0e2c47e7ca82c5ca8e47b5b51e21ae0139be3b70bc174af2be8545770544e0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..677161f2e2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 892737a390..e25967cdc2 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "383c9daf6dfa8ad66a127d9a5f1883a50a6752f4caa6ba98743981d30e943f52", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "f9bac69d0a515dec752a6cd3498979c6d2e5fa55f1f20c6c2e68de845e4c0709", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ab66b9aac0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 33719c4499..ba5641bae4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "96bc5c880ab21d11a6417760154aa4a08fe8f77df9c93f608b84f72cc0c0c7dd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "487a67f0a0313c1afa4b0aa5dbeee4606311eaad808e0e7b69875fcb29b1edb9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..06164a3817 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 5e48567136..52d755e6a7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "151f1abb8712ab497b6d023fb7b92094d081e09db716f99e2697b994486dbeb9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "491ea027fe915421c2c388460048bda1909bbb2fe234f26aa650cce8f2b1f5f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2a8f53e1ec Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8d99b9dafd..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..94f8f302d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index de5bb66c57..7235ff4fd4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 3b1d8498f3..f2eef37457 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "c9e983b7947524e0a29c25c4a49650d79c688f11f50e9fe55d8bb698ba080920", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5f22b90628 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..aa851bad5e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 010147f21e..307d8cda4e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..65d5835012 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 21845be148..83ae9f44b8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0703f1c3c5 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9b92656576..441977fcda 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c8c8b85ef8..146ca2148f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..625ebad0aa Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 193ff13c31..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..1cf1324d04 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index f7360288f7..5f9f3d73c6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3df6d8326c4abd390dc4720816542d0605767741f96ee0230f8621937eb0fc5c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "8442b59cd54bfc72a3bed8dd9aacc04807eeff7f628d351975e80daeeb8c07b2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a582768be3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 0bd14848ab..227d75eb8e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "48fa7fa7492eba12b8281e38654ced4e5f477f1b2738c39249f262235ad8fb3a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8704, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "304af16ef6752d5164f9f17fd233db9fa50ab36dfb098ed207eeded7ff62fe2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8704, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1d87be86c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..4a266b4bdb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7ab0ef739d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 6097e16fc7..14bfdc0ba7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "a1a79beada1434a829871ab8ffa102288d547bb1b733bf8e0eeedf73ef945fc5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 36864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "b01f43ede2d3f0f3d7058a795400af47f6d1e9602413dc8926d63d2c7056c74a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 36864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7b2dc1c3ec Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 87f0361e64..447c6a6175 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "6d7f5c3af319c96f4216c47475ab3aa2e51c4347bee4fd43e22f1c7238388fe9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "7526c8a1b2ea0dd2646354a0a3fc36c41ff4b1e46d1f24d4994f03ce10cbfd50", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4484a8d7ff Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index a7737a4c03..1a94f54cc4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ef26c3fb40e808093f1c8fc9f2d002d13e55aece745120f4ab1a947bbe855d0b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "66c96ad140c5c0362b52113637538dabdb72593f77e280d7d9894f3e565863b8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco similarity index 51% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 2b8e65fa7d..29d777447c 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index fe312b2bae..e720580e2d 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "4e3f1a0c388291dc916ee60db0e4fdfbdafc65a5796cffe006b0dcfe249807fa", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "952567c0aad2f93638b1a9f6c7b73712b04bbcec87dcb455d62aba552ea88c23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97adaa1c6a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 15a82b2231..741b678759 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "392b2bc2d1007f9e6b6ebcc06b48c1f33156824fccbe6a30f4e0822c098c0599", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "5cf1d8f50dfd5ca23bd5a74ef61af0b11ac8a7954942d1cf721029569da9db61", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..f7aff5c000 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 959a5fa4f9..c737064a11 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "82f633dae5e5153b28a342ae0ab70f6e2d3e761aa346187b53647fd5ba0439a7", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "f7afa37c6f7c464bac61859ac2a2eb1845baf4a665c09d03dfb371135b442ed5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..89db920df9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 80b3562c8f..8264c424c9 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "545c02d58ef82bf6a0207d821fda959285d1982c8b1664c5deeb1a237d05e261", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 18432, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "ce178e99a4c7c1317d9d74002f85df6166b0ce8be2e81a0278015f94a98f8568", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 18432, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7bca47bc78 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8034113d72..8d6754c157 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "700495532aeef8d16ba9f707f12b6448d71f1b844a7cf76150ac75b8cb748a27", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "5f77dc9bedad300205059b17200cec0a92128347a2e1163f97475d1ac61f36d0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco similarity index 51% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 37163ed7c2..e05feb3d94 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 1253b21f98..d9e06080a5 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ec36099aa99e531107eb2c3c25f64c0c6971a8de889a5dc30fc4a2ae44a2b2cb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file +{"hash": "2ca8102db9b04514810267d567d9111a674640efa166eda7ac964f2c9b62e741", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a68caf2419 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..129f6593aa --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "b8c2841a693dc9b73f1792c733bccb56ed63fbad8b5b0818edf2ab6ab963dadd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2b1e91002c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 4dd051172b..fa4dfc7269 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "4343c8f411a01d056a4182188aef339479bebaea49e5fa68d89406a52e21a776", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "762edd1695475aecc55e561d12419f63581b9f2ba898b37a2b1b23bf316fc823", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..b8395db679 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..a88a1a528c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "e22eb2008573c5f836861cb1efaf8f54828bffbb6f2c6f1e1e1f96374e36c9c7", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5d30e79cfc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9ae9a9d63e..a64d2bad03 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "ee34536f286c36e118f1a52f2281dc95788c9890590b2d4afac4c918faf053f0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "f0ea00fc779e1822790bc4801a6bc5b8ca6c6859ecdeca7d368233cff1b06c66", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2a8f53e1ec Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8d99b9dafd..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..94f8f302d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index de5bb66c57..7235ff4fd4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index c160ecfc89..f2eef37457 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5f22b90628 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..aa851bad5e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 5c07dd95d4..307d8cda4e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e456eba3a5b4830cde475cc44e7cef91ff21d809663a7ed0eeb0328af862b63f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..65d5835012 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 21845be148..83ae9f44b8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0703f1c3c5 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..441977fcda --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..146ca2148f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..625ebad0aa Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 193ff13c31..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index fb37bfec76..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 27028e374b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 93d8bead42..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 41c4b47847..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 835e251818..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 100b5bb85e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index f49047088e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 94243c9c4a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 49da0f7265..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index ac65871215..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index f64316fbed..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 8ed7dd1629..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 8ed16da6d9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 04de5251bc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index b528710129..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index d82c2aeb50..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 3682538491..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index cbb9bd7d3e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 08acc59e8a..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "60c353bc69789c371fe574057f3c38a26d378b47a2aedcd108b6dc4c5488b6d1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index b1b4884cd7..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 30ab95328e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index be3502f644..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 2b46e5b0cd..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "abeb38cc5212739d26e48af1239c32e6bcd38acdeffa8934cfa0eec238ba914e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index a1ff2bcb9d..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0445503797..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 36928ba196..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 70bbd03caa..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 22eff3d041..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 18737db4b3..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "b089cd526efdd85d4e0d99613711462c0a362de9e03d13e6315d3f097b7e953c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index b2668f40db..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 767fc69949..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 1b6ce9bac2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 887b408f0a..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "e18e6682d0234bd280b9bd99053b708d8d38a4d60ca2d73a361cb9eed704766d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index d95a6a3734..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index b428691a4e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 3682538491..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 53844517b4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 725ef6adfc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index aab0c22969..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 35666fde2b..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 433c6c5860..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 87e957c393..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 1c9266f454..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 4633c4d980..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 05d93400da..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 89264b015b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index eb127d475e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 719a1bad56..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 97c6c20cc3..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 951de4909c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0944d9dd34..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 740a857e7c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index ddf2559e4f..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 59205a6f65..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 5c79d583fa..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 5b85fd6844..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index ccc1c910dc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index bd1a146c2b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 000b0d5ba4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 4cfe758be9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 28aa9941f1..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index fc9faf3808..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 247c5a37a2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 3692d162f3..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index bd0113c69a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 755463aef2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 38c63c0bce..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0b2906cfb8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 76f3e9b02a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 1b33ea85b8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index d3ef79c3ce..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index f6be9762d4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 53844517b4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 725ef6adfc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 3abb740476..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index aab0c22969..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 35666fde2b..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 433c6c5860..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index fdc102e3f2..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 87e957c393..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 1c9266f454..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 4633c4d980..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 05d93400da..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 89264b015b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 9134672e63..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index eb127d475e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 719a1bad56..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 97c6c20cc3..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 951de4909c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0944d9dd34..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 740a857e7c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index ddf2559e4f..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 59205a6f65..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 5c79d583fa..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 5b85fd6844..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index d1f64d58f9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 954817efbc..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "d2863f812ca9919f4cf0ec9e777284e06bca17f566144d7168d0e93e696797f3", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 3491d72086..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 28aa9941f1..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index b049873ef9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 587a325f4a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index c1282756d6..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0d51bb85f0..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 87a0a1022b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 3bb148b108..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index c4b6292a95..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 655ef5fe7f..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 761b5d9b9b..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "26c7c4a40dd58ce24cb2cc1673d5bcb989efcc5d9a15ac0edcb97126c51490ca", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index ad6f41ff11..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 4dbd1053f7..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 9634e61c25..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "25eac10645d889445bd67388060ec83cf57a83e17335c8ba18cd81ae18f27c02", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index e86d9ae9de..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 53844517b4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 725ef6adfc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 3abb740476..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index aab0c22969..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 35666fde2b..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 433c6c5860..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index fdc102e3f2..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 87e957c393..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 1c9266f454..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 4633c4d980..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 05d93400da..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 89264b015b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 9134672e63..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index eb127d475e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 719a1bad56..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 97c6c20cc3..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 951de4909c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 9b92656576..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 0944d9dd34..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 740a857e7c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index ddf2559e4f..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 59205a6f65..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json deleted file mode 100644 index 5c79d583fa..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco deleted file mode 100644 index 5b85fd6844..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco and /dev/null differ diff --git a/aiter/ops/triton/fused_fp8_quant.py b/aiter/ops/triton/fused_fp8_quant.py index c86d6d642e..42ca530f97 100644 --- a/aiter/ops/triton/fused_fp8_quant.py +++ b/aiter/ops/triton/fused_fp8_quant.py @@ -394,7 +394,7 @@ def fused_reduce_act_mul_fp8_group_quant( x2: (SPK, M, 2*N1), dtype = fp32. if x is 2-dim, - x: (M, 2*N1), dtype = fp16 or bf16. + x: (M, N2), dtype = fp16 or bf16. x2 must be None the kernel is essentially identical to aiter.ops.triton.activation.act_mul_and_fp8_group_quant @@ -412,9 +412,7 @@ def fused_reduce_act_mul_fp8_group_quant( y_scale: (M, cdiv(N1, group_size)), dtype = fp32 y2: (M, N2), dtype = dtype """ - _LOGGER.info( - f"FUSED_REDUCTION_ACT_MUL_FP8_GROUP_QUANT: x={tuple(x.shape)} activation={activation}" - ) + _LOGGER.info(f"FUSED_REDUCTION_ACT_MUL_FP8_GROUP_QUANT: x={tuple(x.shape)}") assert ( x.dim() == 2 or x.dim() == 3 diff --git a/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py b/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py index 90b50bd066..40f5f62633 100644 --- a/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py +++ b/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py @@ -47,13 +47,12 @@ def fused_gemm_a8w8_blockscale_a16w16( - x_fp8_scale: Scale tensor for X with shape (M, *scale_k). - w_fp8_scale: Scale tensor for W with shape (**scale_n, *scale_k). - x_bf16: Matrix X with shape (M, K). - - w_bf16: Matrix W with shape (N_bf16, K). + - w_bf16: Matrix W with shape (N_fp8, K). Note: M, N, K must be identical for x_fp8 and x_bf16, but the N-dim fow w_fp8 and w_bf16 can be different Returns: - - y_fp8: The output matrix with shape (M, N_fp8). - - y_bf16: The output matrix with shape (M, N_bf16). + - Y: The output matrix with shape (M, N). *scale_k = (K + scale_block_size_k - 1) // scale_block_size_k **scale_n = (N_fp8 + scale_block_size_n - 1) // scale_block_size_n @@ -75,7 +74,7 @@ def fused_gemm_a8w8_blockscale_a16w16( x_fp8.shape[1] == x_bf16.shape[1] ), "K-dim should be identical for x_fp8 and x_bf16" assert x_fp8.shape[1] == w_fp8.shape[1], "Incompatible dimensions!!!" - assert x_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" + assert w_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" # Transpose w and w_scale w_fp8 = w_fp8.T diff --git a/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py deleted file mode 100644 index 5dfeb5737e..0000000000 --- a/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py +++ /dev/null @@ -1,258 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -from typing import Optional -import functools -import json -import os -import torch -import triton -import triton.language as tl -from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton._triton_kernels.fused_gemm_afp4wfp4_a16w16 import ( - _fused_gemm_afp4wfp4_a16w16_kernel, - _fused_gemm_afp4wfp4_preshuffle_a16w16_kernel, - _fused_gemm_afp4wfp4_a16w16_reduce_kernel, - _get_config, -) -from aiter.ops.triton.gemm_afp4wfp4 import get_splitk -from .utils.core import AITER_TRITON_CONFIGS_PATH - -import os -from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext - -_LOGGER = AiterTritonLogger() - - -def fused_gemm_afp4wfp4_a16w16( - x_fp4: torch.Tensor, - w_fp4: torch.Tensor, - x_fp4_scale: torch.Tensor, - w_fp4_scale: torch.Tensor, - x_bf16: torch.Tensor, - w_bf16: torch.Tensor, - is_fp4_preshuffled: bool = True, - bias_fp4: Optional[torch.Tensor] = None, - bias_bf16: Optional[torch.Tensor] = None, - dtype: Optional[float] = torch.bfloat16, - y_fp4: Optional[torch.Tensor] = None, - y_bf16: Optional[torch.Tensor] = None, - skip_reduce: Optional[bool] = False, - config: Optional[dict] = None, - use_aot: Optional[bool] = True, -): - """ - Computes the 8 bit matmul Y = X x WT + B using the block-scale quantization approach for x_fp4 and w_fp4. - Computes the 16 bit matmul Y = X x WT + B for x_bf16 and w_bf16 - - This fusion is primarily aiming for fusing the gate up-projections and MOE gating: - gate up-projections: (M, K) x (2N, K) = (M, 2N) - MOE gating: (M, K) x (N, K) + (N, ) = (M, N) - - Key parameters: - - x_fp4: Matrix X with shape (M, K). - - w_fp4: Matrix W with shape (N_fp4, K). - - x_fp4_scale: Scale tensor for X with shape (M, K // 32) - - w_fp4_scale: Scale tensor for W with shape (N, K // 32) - - x_bf16: Matrix X with shape (M, K). - - w_bf16: Matrix W with shape (N_bf16, K). - - Note: M, N, K must be identical for x_fp4 and x_bf16, but the N-dim fow w_fp4 and w_bf16 can be different - - Returns: - - y_fp4: The output matrix with shape (M, N_fp4). - - y_bf16: The output matrix with shape (M, N_bf16). - - """ - _LOGGER.info( - f"FUSED_GEMM_A8W8_BLOCKSCALE_A16W16: x_fp4={tuple(x_fp4.shape)} w_fp4={tuple(w_fp4.shape)} x_fp4_scale={tuple(x_fp4_scale.shape)} w_fp4_scale={tuple(w_fp4_scale.shape)} x_bf16={tuple(x_bf16.shape)} w_bf16={tuple(w_bf16.shape)}" - ) - - M, K = x_fp4.shape - N_fp4, K = w_fp4.shape - if is_fp4_preshuffled: - N_fp4 = N_fp4 * 16 - K = K // 16 - M, _ = x_bf16.shape - N_bf16, _ = w_bf16.shape - - # Check constraints. - assert ( - x_fp4.shape[0] == x_bf16.shape[0] - ), "M-dim should be identical for x_fp4 and x_bf16" - assert ( - x_fp4.shape[1] * 2 == x_bf16.shape[1] - ), "K-dim should be identical for x_fp4 and x_bf16" - if is_fp4_preshuffled: - assert x_fp4.shape[1] == w_fp4.shape[1] // 16, "Incompatible dimensions!!!" - else: - assert x_fp4.shape[1] == w_fp4.shape[1], "Incompatible dimensions!!!" - assert x_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" - - # Transpose w and w_scale - if not is_fp4_preshuffled: - w_fp4 = w_fp4.T - w_bf16 = w_bf16.T - - if config is None: - config = _get_config(M, N_fp4, N_bf16, K, is_fp4_preshuffled) - - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - config["NUM_KSPLIT"] = triton.cdiv(K, config["SPLITK_BLOCK_SIZE"] // 2) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - config["NUM_KSPLIT"] = 1 - - if y_fp4 is None and (config["NUM_KSPLIT"] == 1 or not skip_reduce): - y_fp4 = torch.empty((M, N_fp4), dtype=dtype, device=x_fp4.device) - - if y_bf16 is None and (config["NUM_KSPLIT"] == 1 or not skip_reduce): - y_bf16 = torch.empty((M, N_bf16), dtype=dtype, device=x_bf16.device) - - if config["NUM_KSPLIT"] > 1: - y_fp4_pp = torch.empty( - (config["NUM_KSPLIT"], M, N_fp4), - dtype=torch.float32, - device=x_fp4.device, - ) - y_bf16_pp = torch.empty( - (config["NUM_KSPLIT"], M, N_bf16), - dtype=torch.float32, - device=x_bf16.device, - ) - else: - y_fp4_pp = None - y_bf16_pp = None - - config["BLOCK_SIZE_N"] = max(config["BLOCK_SIZE_N"], 32) - if is_fp4_preshuffled: - if M < 32: - assert ( - config["BLOCK_SIZE_M"] <= 16 - ), "for M < 32, BLOCK_SIZE_M must be 16 or less as x_scale are assumed to be un-shuffled" - else: - assert ( - config["BLOCK_SIZE_M"] >= 32 - ), "for M >= 32, BLOCK_SIZE_M must be 32 or more as x_scale are assumed to be preshuffled" - - grid = lambda META: ( # noqa: E731 - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * ( - triton.cdiv(N_fp4, META["BLOCK_SIZE_N"]) - + triton.cdiv(N_bf16, META["BLOCK_SIZE_N"]) - ) - ), - ) - selected_kernel = ( - _fused_gemm_afp4wfp4_preshuffle_a16w16_kernel - if is_fp4_preshuffled - else _fused_gemm_afp4wfp4_a16w16_kernel - ) - - def selected_kernel_wrapper(): - selected_kernel[grid]( - x_fp4, - w_fp4, - bias_fp4, - x_fp4_scale, - w_fp4_scale, - y_fp4 if config["NUM_KSPLIT"] == 1 else y_fp4_pp, - x_bf16, - w_bf16, - bias_bf16, - y_bf16 if config["NUM_KSPLIT"] == 1 else y_bf16_pp, - M, - N_fp4, - N_bf16, - K, - x_fp4.stride(0), - x_fp4.stride(1), - w_fp4.stride(0), - w_fp4.stride(1), - x_fp4_scale.stride(0), - x_fp4_scale.stride(1), - w_fp4_scale.stride(0), - w_fp4_scale.stride(1), - 0 if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(0), - y_fp4.stride(0) if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(1), - y_fp4.stride(1) if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(2), - x_bf16.stride(0), - x_bf16.stride(1), - w_bf16.stride(0), - w_bf16.stride(1), - 0 if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(0), - y_bf16.stride(0) if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(1), - y_bf16.stride(1) if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(2), - ADD_BIAS_FP4=(bias_fp4 is not None), - ADD_BIAS_BF16=(bias_bf16 is not None), - SKIP_REDUCE=skip_reduce, - **config, - ) - - M_POW2 = triton.next_power_of_2(M) - if M < 32 and M_POW2 > 16: - M_POW2 = 16 - metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{selected_kernel.fn.__name__}_M={M_POW2}-N4={N_fp4}-N16={N_bf16}-K={K*2}" - if use_aot and os.path.exists(metadata_pth): - with AOTMetadataContext( - selected_kernel.fn.__name__, - f"{metadata_pth}", - ): - selected_kernel_wrapper() - else: - selected_kernel_wrapper() - - if config["NUM_KSPLIT"] > 1: - if skip_reduce: - return y_fp4_pp, y_bf16_pp - REDUCE_BLOCK_SIZE_M = 32 - REDUCE_BLOCK_SIZE_N = 32 - ACTUAL_KSPLIT = triton.cdiv(K, config["SPLITK_BLOCK_SIZE"] // 2) - - grid_reduce = ( - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N_fp4, REDUCE_BLOCK_SIZE_N) - + triton.cdiv(N_bf16, REDUCE_BLOCK_SIZE_N), - ) - _fused_gemm_afp4wfp4_a16w16_reduce_kernel[grid_reduce]( - bias_fp4, - y_fp4_pp, - y_fp4, - bias_bf16, - y_bf16_pp, - y_bf16, - M, - N_fp4, - N_bf16, - y_fp4_pp.stride(0), - y_fp4_pp.stride(1), - y_fp4_pp.stride(2), - y_fp4.stride(0), - y_fp4.stride(1), - y_bf16_pp.stride(0), - y_bf16_pp.stride(1), - y_bf16_pp.stride(2), - y_bf16.stride(0), - y_bf16.stride(1), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - triton.next_power_of_2(config["NUM_KSPLIT"]), - ADD_BIAS_FP4=(bias_fp4 is not None), - ADD_BIAS_BF16=(bias_bf16 is not None), - ) - - return y_fp4, y_bf16 diff --git a/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py b/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py deleted file mode 100644 index 3d5d3d8640..0000000000 --- a/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py +++ /dev/null @@ -1,445 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -from typing import Optional, Union -import torch -import triton -import triton.language as tl -import aiter.ops.triton.utils._triton.arch_info as arch_info -from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton._triton_kernels.fused_gemm_afp4wfp4_mul_add import ( - _fused_gemm_afp4wfp4_mul_add_kernel, - _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel, - _fused_gemm_afp4wfp4_mul_add_reduce_kernel, - _get_config, -) -from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( - _gemm_afp4wfp4_reduce_kernel, -) -from .utils.core import AITER_TRITON_CONFIGS_PATH - -import os -from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext - -_LOGGER = AiterTritonLogger() - -global _USE_GEMM_SPLITK_BF16 -_USE_GEMM_SPLITK_BF16 = False - - -def set_use_gemm_splitk_bf16(value: bool): - global _USE_GEMM_SPLITK_BF16 - _USE_GEMM_SPLITK_BF16 = value - - -def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): - # heuristics for make "EVEN_K == True" as much as possible - NUM_KSPLIT_STEP = 2 - BLOCK_SIZE_K_STEP = 2 - SPLITK_BLOCK_SIZE = ( - triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), BLOCK_SIZE_K) * BLOCK_SIZE_K - ) - while NUM_KSPLIT > 1 and BLOCK_SIZE_K > 16: - if ( - K % (SPLITK_BLOCK_SIZE // 2) == 0 - and SPLITK_BLOCK_SIZE % BLOCK_SIZE_K == 0 - and K % (BLOCK_SIZE_K // 2) == 0 - ): - break - elif K % (SPLITK_BLOCK_SIZE // 2) != 0 and NUM_KSPLIT > 1: - NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP - elif SPLITK_BLOCK_SIZE % BLOCK_SIZE_K != 0: - if NUM_KSPLIT > 1: - NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP - elif BLOCK_SIZE_K > 16: - BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP - elif K % (BLOCK_SIZE_K // 2) != 0 and BLOCK_SIZE_K > 16: - BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP - else: - break - - SPLITK_BLOCK_SIZE = ( - triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), BLOCK_SIZE_K) * BLOCK_SIZE_K - ) - - return SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT - - -def fused_gemm_afp4wfp4_mul_add( - x, - w, - x_scales, - w_scales, - a: Union[torch.Tensor, float, int], - b: Union[torch.Tensor, float, int], - dtype: Optional[float] = torch.bfloat16, - y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, - fuse_type: Optional[int] = 0, -): - """ - Computes matrix multiplication Y = X @ W^T with FP4 activations and FP4 weights. - if fuse_type == 0: - the final output = a * Y + b - elif fuse_type == 1 - the final output = a * b + Y - - Args: - x (torch.Tensor): FP4 E2M1 input matrix with shape (M, K). - w (torch.Tensor): FP4 E2M1 weight matrix with shape (N, K), internally transposed. - x_scales (torch.Tensor): E8M0 per-group scale for x with shape (M, K//32). - One scale per 32 elements in K dimension. - w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N, K//32). - One scale per 32 elements in K dimension. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). - - Returns: - torch.Tensor: Output with shape (M, N). - """ - - _LOGGER.info( - f"GEMM_AFPWFP4: x.shape={tuple(x.shape)} w.shape={tuple(w.shape)} x_scale={tuple(x_scales.shape)} w_scale={tuple(w_scales.shape)} " - ) - - if isinstance(a, float) or isinstance(a, int): - IS_A_SCALAR = True - IS_A_TENSOR = False - elif isinstance(a, torch.Tensor) and a.is_contiguous(): - IS_A_TENSOR = True - if a.numel() == 1: - IS_A_SCALAR = True - else: - IS_A_SCALAR = False - if isinstance(b, float) or isinstance(b, int): - IS_B_SCALAR = True - IS_B_TENSOR = False - elif isinstance(b, torch.Tensor) and b.is_contiguous(): - IS_B_TENSOR = True - if b.numel() == 1: - IS_B_SCALAR = True - else: - IS_B_SCALAR = False - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - M, K = x.shape - N, K = w.shape - - # Transpose w - w = w.T - - if y is None: - y = torch.empty((M, N), dtype=dtype, device=x.device) - - if config is None: - config = _get_config(M, N, K) - - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - - if _USE_GEMM_SPLITK_BF16: - y_pp = torch.empty( - (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device - ) - else: - y_pp = torch.empty( - (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device - ) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - y_pp = None - - grid = lambda META: ( # noqa: E731 - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), - ) - _fused_gemm_afp4wfp4_mul_add_kernel[grid]( - x, - w, - y if config["NUM_KSPLIT"] == 1 else y_pp, - x_scales, - w_scales, - a, - b, - M, - N, - K, - x.stride(0), - x.stride(1), - w.stride(0), - w.stride(1), - 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), - y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), - y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), - x_scales.stride(0), - x_scales.stride(1), - w_scales.stride(0), - w_scales.stride(1), - 0 if IS_A_SCALAR else a.stride(0), - 0 if IS_A_SCALAR else a.stride(1), - 0 if IS_B_SCALAR else b.stride(0), - 0 if IS_B_SCALAR else b.stride(1), - IS_A_SCALAR=IS_A_SCALAR, - IS_B_SCALAR=IS_B_SCALAR, - IS_A_TENSOR=IS_A_TENSOR, - IS_B_TENSOR=IS_B_TENSOR, - FUSE_TYPE=fuse_type, - **config, - ) - - if config["NUM_KSPLIT"] > 1: - REDUCE_BLOCK_SIZE_M = 16 - # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails - # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and - # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials - REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 - ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) - - grid_reduce = ( - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N, REDUCE_BLOCK_SIZE_N), - ) - _fused_gemm_afp4wfp4_mul_add_reduce_kernel[grid_reduce]( - y_pp, - y, - a, - b, - M, - N, - y_pp.stride(0), - y_pp.stride(1), - y_pp.stride(2), - y.stride(0), - y.stride(1), - 0 if IS_A_SCALAR else a.stride(0), - 0 if IS_A_SCALAR else a.stride(1), - 0 if IS_B_SCALAR else b.stride(0), - 0 if IS_B_SCALAR else b.stride(1), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - triton.next_power_of_2(config["NUM_KSPLIT"]), - IS_A_SCALAR=IS_A_SCALAR, - IS_B_SCALAR=IS_B_SCALAR, - IS_A_TENSOR=IS_A_TENSOR, - IS_B_TENSOR=IS_B_TENSOR, - FUSE_TYPE=fuse_type, - ) - - return y - - -def fused_gemm_afp4wfp4_preshuffle_add_mul( - x, - w, - x_scales, - w_scales, - a: Union[torch.Tensor, float, int], - b: Union[torch.Tensor, float, int], - dtype: Optional[float] = torch.bfloat16, - y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, - use_aot: Optional[bool] = True, - fuse_type: Optional[int] = 0, -): - """ - Computes matrix multiplication Y = X @ W^T with FP4 activations and FP4 weights using preshuffled weight scales. - Weight matrix and scales are stored in optimized layout for improved performance. - if fuse_type == 0: - The final output = a * Y + b - elif fuse_type == 1 - The final output = a * b + Y - - Args: - x (torch.Tensor): FP4 E2M1 input matrix with shape (M, K). - w (torch.Tensor): FP4 E2M1 weight matrix with shape (N//16, K*16), internally transposed. - Preshuffled layout: logical shape after unpacking is (N, K). - x_scales (torch.Tensor): E8M0 per-group scale for x with shape (M//32, K) if M >= 32, - or (M, K//32) if M < 32. - w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N//32, K). - Groups of 32 rows in N dimension share K scales. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). - use_aot (Optional[bool]): Enable ahead-of-time compilation metadata. - - Returns: - torch.Tensor: Output with shape (M, N). - """ - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - if isinstance(a, float) or isinstance(a, int): - IS_A_SCALAR = True - IS_A_TENSOR = False - elif isinstance(a, torch.Tensor) and a.is_contiguous(): - IS_A_TENSOR = True - if a.numel() == 1: - IS_A_SCALAR = True - else: - IS_A_SCALAR = False - if isinstance(b, float) or isinstance(b, int): - IS_B_SCALAR = True - IS_B_TENSOR = False - elif isinstance(b, torch.Tensor) and b.is_contiguous(): - IS_B_TENSOR = True - if b.numel() == 1: - IS_B_SCALAR = True - else: - IS_B_SCALAR = False - - M, K = x.shape - N, K = w.shape - N = N * 16 - K = K // 16 - - if y is None: - y = torch.empty((M, N), dtype=dtype, device=x.device) - - if config is None: - config = _get_config(M, N, K, True) - - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - - if _USE_GEMM_SPLITK_BF16: - y_pp = torch.empty( - (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device - ) - else: - y_pp = torch.empty( - (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device - ) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - y_pp = None - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - - config["BLOCK_SIZE_N"] = max(config["BLOCK_SIZE_N"], 32) - if M < 32: - assert ( - config["BLOCK_SIZE_M"] <= 16 - ), "for M < 32, BLOCK_SIZE_M must be 16 or less as x_scale are assumed to be un-shuffled" - else: - assert ( - config["BLOCK_SIZE_M"] >= 32 - ), "for M >= 32, BLOCK_SIZE_M must be 32 or more as x_scale are assumed to be preshuffled" - - grid = lambda META: ( # noqa: E731 - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), - ) - - def kernel_wrapper(): - _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel[grid]( - x, - w, - y if config["NUM_KSPLIT"] == 1 else y_pp, - x_scales, - w_scales, - a, - b, - M, - N, - K, - x.stride(0), - x.stride(1), - w.stride(0), - w.stride(1), - 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), - y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), - y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), - x_scales.stride(0), - x_scales.stride(1), - w_scales.stride(0), - w_scales.stride(1), - 0 if IS_A_SCALAR else a.stride(0), - 0 if IS_A_SCALAR else a.stride(1), - 0 if IS_B_SCALAR else b.stride(0), - 0 if IS_B_SCALAR else b.stride(1), - IS_A_SCALAR=IS_A_SCALAR, - IS_B_SCALAR=IS_B_SCALAR, - IS_A_TENSOR=IS_A_TENSOR, - IS_B_TENSOR=IS_B_TENSOR, - FUSE_TYPE=fuse_type, - **config, - ) - - M_POW2 = triton.next_power_of_2(M) - if M < 32 and M_POW2 > 16: - M_POW2 = 16 - metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_fused_gemm_afp4wfp4_preshuffle_mul_add_kernel.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" - if use_aot and os.path.exists(metadata_pth): - with AOTMetadataContext( - _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel.fn.__name__, - f"{metadata_pth}", - ): - kernel_wrapper() - else: - kernel_wrapper() - - if config["NUM_KSPLIT"] > 1: - REDUCE_BLOCK_SIZE_M = 16 - # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails - # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and - # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials - REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 - ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) - - grid_reduce = ( - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N, REDUCE_BLOCK_SIZE_N), - ) - _fused_gemm_afp4wfp4_mul_add_reduce_kernel[grid_reduce]( - y_pp, - y, - a, - b, - M, - N, - y_pp.stride(0), - y_pp.stride(1), - y_pp.stride(2), - y.stride(0), - y.stride(1), - 0 if IS_A_SCALAR else a.stride(0), - 0 if IS_A_SCALAR else a.stride(1), - 0 if IS_B_SCALAR else b.stride(0), - 0 if IS_B_SCALAR else b.stride(1), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - triton.next_power_of_2(config["NUM_KSPLIT"]), - IS_A_SCALAR=IS_A_SCALAR, - IS_B_SCALAR=IS_B_SCALAR, - IS_A_TENSOR=IS_A_TENSOR, - IS_B_TENSOR=IS_B_TENSOR, - FUSE_TYPE=fuse_type, - ) - - return y diff --git a/aiter/ops/triton/fused_mxfp4_quant.py b/aiter/ops/triton/fused_mxfp4_quant.py index 8e19148e53..141bf6d2fe 100644 --- a/aiter/ops/triton/fused_mxfp4_quant.py +++ b/aiter/ops/triton/fused_mxfp4_quant.py @@ -1,4 +1,3 @@ -from typing import Literal import torch import triton import triton.language as tl @@ -8,10 +7,6 @@ _rmsmorm_op, _fused_rms_mxfp4_quant_kernel, _fused_flatten_mxfp4_quant, - _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel, -) -from aiter.ops.triton._triton_kernels.activation import ( - _get_activation_from_str, ) from aiter.ops.triton.utils.logger import AiterTritonLogger @@ -28,7 +23,6 @@ def fused_rms_mxfp4_quant( res1: Optional[torch.Tensor] = None, shuffle: Optional[bool] = False, scale_shuffle_padding: Optional[bool] = False, - output_unquantized_inp1=False, ): """ This op contains several steps: @@ -84,12 +78,6 @@ def fused_rms_mxfp4_quant( device=x1.device, ) - out1 = None - out1_stride_m = 0 - if output_unquantized_inp1: - out1 = torch.empty((M, N1), dtype=x1.dtype, device=x1.device) - out1_stride_m = out1.stride(0) - out_res1 = None res1_stride_m = 0 out_res1_stride_m = 0 @@ -117,7 +105,6 @@ def fused_rms_mxfp4_quant( out1_bs, out2, out_res1, - out1, x1_epsilon, x2_epsilon, M, @@ -130,14 +117,12 @@ def fused_rms_mxfp4_quant( *out1_bs.stride(), out2_stride_m, out_res1_stride_m, - out1_stride_m, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_N2=BLOCK_SIZE_N2, MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE, HAS_SECOND_INPUT=(x2 is not None), FIRST_INPUT_RES=(res1 is not None), - FIRST_INPUT_OUT=output_unquantized_inp1, SCALE_N=SCALE_N_valid, SCALE_M_PAD=(SCALE_M if use_scale_shuffle_padding else 1), SCALE_N_PAD=SCALE_N, @@ -145,7 +130,7 @@ def fused_rms_mxfp4_quant( SHUFFLE_PAD=use_scale_shuffle_padding, ) - return (out1_fp4, out1_bs), out1, out2, out_res1 + return (out1_fp4, out1_bs), out2, out_res1 def fused_flatten_mxfp4_quant( @@ -191,179 +176,3 @@ def fused_flatten_mxfp4_quant( ) return out, out_block_scales - - -def fused_reduce_act_mul_and_mxfp4_quant( - x: torch.Tensor, - activation: Literal["silu", "gelu", "gelu_tanh"], - x2: Optional[torch.Tensor] = None, - scaling_mode: str = "even", - shuffle: bool = False, - scale_shuffle_padding: bool = False, - dtype: Optional[float] = torch.bfloat16, -) -> tuple[torch.Tensor, torch.Tensor]: - """ - Apply reduction along the first dimension and apply the activation function + per-token group quantization to MX FP4 format. - If x2 is provided, the only reduction along the first dimension is applied to x2 - - Args: - if x is 3-dim, - x: (SPK, M, 2*N1), dtype = fp32. - x2: (SPK, M, 2*N1), dtype = fp32. - - if x is 2-dim, - x: (M, 2*N1), dtype = fp16 or bf16. - x2 must be None - the kernel is essentially identical to aiter.ops.triton.activation.act_mul_and_mxfp4_group_quant - - activation: activation function to apply before quantization. - - It splits the features into two parts and applies the activation to the first part. - - Then, it adds the results together before quantization. - - Supports the following activations: - - "silu" - - "gelu" - - "gelu_tanh" - - scaling_mode: The method to calculate MX block scaling. - - "even" (default): `even_round` in `quark.torch.quantization.utils`. - - etc. - shuffle: Indicates whether to enable preshuffling of scales. - - When enabled, scale dimensions (X, Y) are adjusted to be multiples of 8 and 256, respectively. - Returns: - tuple: (y, y_scale), y2 - if shuffle or scale_shuffle_padding: - y: (M_pad, N1_pad), dtype = uint8 - y_scale: (M_pad, N1_pad), dtype = uint8 - y2: (M, N2), dtype = dtype - - where M_pad = cdiv(M, 256) * 256 - N1_pad = cdiv(cdiv(N1, MXFP4_QUANT_BLOCK_SIZE), 8) * 8 - else: - y: (M, N1), dtype = uint8 - y_scale: (M, cdiv(N1, MXFP4_QUANT_BLOCK_SIZE)), dtype = uint8 - y2: (M, N2), dtype = dtype - - A tuple of (y, y_scale). - """ - _LOGGER.info( - f"ACT_MUL_MXFP4_QUANT: x={tuple(x.shape)} activation={activation} shuffle={shuffle}" - ) - - assert ( - x.dim() == 2 or x.dim() == 3 - ), "The number of dimentions for x should be 2 or 3" - X_HAS_SPLITK = False - x_num_splitk = 1 - N2 = 1 - y2 = None - if x.dim() == 3: - x_num_splitk, M, N1 = x.shape - x_num_splitk, _, N2 = x2.shape - assert ( - x.shape[0] == x2.shape[0] and x.shape[1] == x2.shape[1] - ), "The first two dimensions should be identical between x and x2" - assert ( - x_num_splitk > 1 - ), "x.shape[0] should be larger then 1 in x.dim() == 3 cases" - X_HAS_SPLITK = True - y2 = torch.empty((M, N2), dtype=dtype, device=x2.device) - else: - M, N1 = x.shape - # Activation (N/2) and storing results in uint8 (N/2) results in a feature dimension of N/4 - assert ( - N1 % 4 == 0 - ), "The last dimension for x1 should be multiple of 4 for acitvation, multiplication and mxfp4 quantization" - - MXFP4_QUANT_BLOCK_SIZE = 32 - N_half = N1 // 2 - y = torch.empty((M, N_half // 2), dtype=torch.uint8, device=x.device) - scaleN_valid = triton.cdiv(N_half, MXFP4_QUANT_BLOCK_SIZE) - # Setting scale M to be multiple of 256 and scale N to be multiple of 8 - use_scale_shuffle_padding = shuffle or scale_shuffle_padding - if use_scale_shuffle_padding: - scaleM = triton.cdiv(M, 256) * 256 - scaleN = triton.cdiv(scaleN_valid, 8) * 8 - else: - scaleM = M - scaleN = scaleN_valid - y_scale = torch.empty( - (scaleM, scaleN), - dtype=torch.uint8, - device=x.device, - ) - - NUM_ITER = 1 - NUM_WARPS = 4 - NUM_STAGES = 1 - - BLOCK_SIZE_M1 = 1 if M <= 128 else 4 - BLOCK_SIZE_M2 = 1 if M <= 128 else 4 - - # for small N values - if N_half <= 1024: - BLOCK_SIZE_N1 = 32 - else: - BLOCK_SIZE_N1 = 128 - - if N2 <= 256: - BLOCK_SIZE_N2 = 8 - elif N2 <= 1024: - BLOCK_SIZE_N2 = 32 - else: - BLOCK_SIZE_N2 = 128 - - # shuffle requires block sizes to be multiple of 32 - if shuffle: - BLOCK_SIZE_M1 = triton.cdiv(BLOCK_SIZE_M1, 32) * 32 - BLOCK_SIZE_N1 = triton.cdiv(BLOCK_SIZE_N1, 32) * 32 - - num_pid = triton.cdiv(M, BLOCK_SIZE_M1) * triton.cdiv( - N_half, BLOCK_SIZE_N1 * NUM_ITER - ) - if X_HAS_SPLITK: - num_pid += triton.cdiv(M, BLOCK_SIZE_M2) * triton.cdiv(N2, BLOCK_SIZE_N2) - - grid = (num_pid,) - _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel[grid]( - x, - y, - y_scale, - x2, - y2, - 0 if not X_HAS_SPLITK else x.stride(0), - x.stride(0) if not X_HAS_SPLITK else x.stride(1), - x.stride(1) if not X_HAS_SPLITK else x.stride(2), - y.stride(0), - y.stride(1), - y_scale.stride(0), - y_scale.stride(1), - 0 if not X_HAS_SPLITK else x2.stride(0), - 0 if not X_HAS_SPLITK else x2.stride(1), - 0 if not X_HAS_SPLITK else x2.stride(2), - 0 if not X_HAS_SPLITK else y2.stride(0), - 0 if not X_HAS_SPLITK else y2.stride(1), - M=M, - N1=N_half, - N2=N2, - BLOCK_SIZE_M1=BLOCK_SIZE_M1, - BLOCK_SIZE_N1=BLOCK_SIZE_N1, - BLOCK_SIZE_M2=BLOCK_SIZE_M2, - BLOCK_SIZE_N2=BLOCK_SIZE_N2, - NUM_ITER=NUM_ITER, - NUM_STAGES=NUM_STAGES, - MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE, - SCALING_MODE=0, - ACTIVATION=_get_activation_from_str(activation) if activation else "", - scaleN=scaleN_valid, - scaleM_pad=(scaleM if use_scale_shuffle_padding else 1), - scaleN_pad=scaleN, - SHUFFLE=shuffle, - X_HAS_SPLITK=X_HAS_SPLITK, - X_NUM_KSPLIT=x_num_splitk, - X_NUM_KSPLIT_POW2=triton.next_power_of_2(x_num_splitk), - num_warps=NUM_WARPS, - waves_per_eu=0, - num_stages=1, - ) - - return (y, y_scale), y2 diff --git a/aiter/ops/triton/gemm_a16wfp4.py b/aiter/ops/triton/gemm_a16wfp4.py deleted file mode 100644 index 40744fba68..0000000000 --- a/aiter/ops/triton/gemm_a16wfp4.py +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -from typing import Optional -import torch -import triton -import triton.language as tl -import aiter.ops.triton.utils._triton.arch_info as arch_info -from aiter.ops.triton.quant import _mxfp4_quant_op -from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton._triton_kernels.gemm_a16wfp4 import ( - _gemm_a16wfp4_kernel, - _get_config, -) -from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( - _gemm_afp4wfp4_reduce_kernel, -) -from aiter.ops.triton.gemm_afp4wfp4 import ( - get_splitk, -) - - -_LOGGER = AiterTritonLogger() - - -def gemm_a16wfp4( - x, - w, - w_scales, - atomic_add: bool = False, - dtype: Optional[float] = torch.bfloat16, - y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, -): - """ - Computes the matmul Y = X x W - W is an e2m1 fp4 tensor and w_scales is an e8m0 tensor. - Every 32 elements in the K dimension share one e8m0 scale. - X gets quantized to the microscale fp4 (mxfp4) format before the GEMM. - - - Key parameters: - - X: Matrix X with shape (M, K). - - W: Matrix W with shape (N, K). - - W_scales: Matrix with shape (N, K // 32) - - Returns: - - Y: The output matrix with shape (M, N). - """ - - _LOGGER.info( - f"GEMM_A16WFP4: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w_scales.shape)} " - ) - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - M, K = x.shape - N, K = w.shape - - # inner kernel expects (K, N) - w = w.T - - if config is None: - config = _get_config(M, N, K) - - if y is None: - if atomic_add: - y = torch.zeros((M, N), dtype=dtype, device=x.device) - else: - y = torch.empty((M, N), dtype=dtype, device=x.device) - - if config["NUM_KSPLIT"] > 1 and not atomic_add: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - config["NUM_KSPLIT"] = 1 - config["BLOCK_SIZE_K"] = max(config["BLOCK_SIZE_K"], 64) - - if config["NUM_KSPLIT"] > 1 and not atomic_add: - y_pp = torch.empty( - (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device - ) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - y_pp = None - - grid = lambda META: ( # noqa: E731 - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), - ) - _gemm_a16wfp4_kernel[grid]( - x, - w, - y if y_pp is None else y_pp, - w_scales, - M, - N, - K, - x.stride(0), - x.stride(1), - w.stride(0), - w.stride(1), - 0 if y_pp is None else y_pp.stride(0), - y.stride(0) if y_pp is None else y_pp.stride(1), - y.stride(1) if y_pp is None else y_pp.stride(2), - w_scales.stride(0), - w_scales.stride(1), - ATOMIC_ADD=atomic_add, - **config, - ) - - if config["NUM_KSPLIT"] > 1 and not atomic_add: - REDUCE_BLOCK_SIZE_M = 16 - REDUCE_BLOCK_SIZE_N = 64 - # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails - # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and - # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials - ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) - - grid_reduce = ( - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N, REDUCE_BLOCK_SIZE_N), - ) - _gemm_afp4wfp4_reduce_kernel[grid_reduce]( - y_pp, - y, - M, - N, - y_pp.stride(0), - y_pp.stride(1), - y_pp.stride(2), - y.stride(0), - y.stride(1), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - triton.next_power_of_2(config["NUM_KSPLIT"]), - ) - - return y diff --git a/aiter/ops/triton/gemm_afp4wfp4.py b/aiter/ops/triton/gemm_afp4wfp4.py index 820f2c5105..a5353b9051 100644 --- a/aiter/ops/triton/gemm_afp4wfp4.py +++ b/aiter/ops/triton/gemm_afp4wfp4.py @@ -8,10 +8,10 @@ import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.logger import AiterTritonLogger from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( - _gemm_afp4wfp4_kernel, - _gemm_afp4wfp4_kernel_preshuffle_scales, - _gemm_afp4wfp4_preshuffle_kernel, - _gemm_afp4wfp4_reduce_kernel, + _gemm_afp4_wfp4_kernel, + _gemm_afp4_wfp4_kernel_preshuffled_scales, + _gemm_afp4_wfp4_kernel_preshuffled_weight_scales, + _gemm_afp4_wfp4_reduce_kernel, _get_config, ) from .utils.core import AITER_TRITON_CONFIGS_PATH @@ -138,7 +138,7 @@ def gemm_afp4wfp4( ), ) - _gemm_afp4wfp4_kernel[grid]( + _gemm_afp4_wfp4_kernel[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -173,7 +173,7 @@ def gemm_afp4wfp4( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - _gemm_afp4wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -272,7 +272,7 @@ def gemm_afp4wfp4_preshuffled_scales( ), ) - _gemm_afp4wfp4_kernel_preshuffled_scales[grid]( + _gemm_afp4_wfp4_kernel_preshuffled_scales[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -307,7 +307,7 @@ def gemm_afp4wfp4_preshuffled_scales( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - _gemm_afp4wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -326,7 +326,7 @@ def gemm_afp4wfp4_preshuffled_scales( return y -def gemm_afp4wfp4_preshuffle( +def gemm_afp4wfp4_preshuffled_weight_scales( x, w, x_scales, @@ -417,13 +417,13 @@ def gemm_afp4wfp4_preshuffle( M_POW2 = triton.next_power_of_2(M) if M < 32 and M_POW2 > 16: M_POW2 = 16 - metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_gemm_afp4wfp4_preshuffle_kernel.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" + metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" if use_aot and os.path.exists(metadata_pth): with AOTMetadataContext( - _gemm_afp4wfp4_preshuffle_kernel.fn.__name__, + _gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__, f"{metadata_pth}", ): - _gemm_afp4wfp4_preshuffle_kernel[grid]( + _gemm_afp4_wfp4_kernel_preshuffled_weight_scales[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -446,7 +446,7 @@ def gemm_afp4wfp4_preshuffle( **config, ) else: - _gemm_afp4wfp4_preshuffle_kernel[grid]( + _gemm_afp4_wfp4_kernel_preshuffled_weight_scales[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -481,7 +481,7 @@ def gemm_afp4wfp4_preshuffle( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - _gemm_afp4wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -498,19 +498,3 @@ def gemm_afp4wfp4_preshuffle( ) return y - - -def gemm_afp4wfp4_preshuffled_weight_scales( - x, - w, - x_scales, - w_scales, - dtype: Optional[float] = torch.bfloat16, - y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, - use_aot: Optional[bool] = True, -): - _LOGGER.info( - "gemm_afp4wfp4_preshuffled_weight_scales will be deprecated in future AITER release, please switch to gemm_afp4wfp4_preshuffle" - ) - return gemm_afp4wfp4_preshuffle(x, w, x_scales, w_scales, dtype, y, config, use_aot) diff --git a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py index d3738fd4aa..94369cc2c8 100644 --- a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py +++ b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py @@ -5,9 +5,12 @@ import torch import triton import triton.language as tl +import aiter.ops.triton.utils._triton.arch_info as arch_info +from aiter.ops.triton.quant import _mxfp4_quant_op from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton.gemm_a16wfp4 import ( - gemm_a16wfp4, +from aiter.ops.triton._triton_kernels.gemm_afp4wfp4_pre_quant_atomic import ( + _gemm_afp4_wfp4_pre_quant_kernel, + _get_config, ) _LOGGER = AiterTritonLogger() @@ -21,7 +24,69 @@ def gemm_afp4wfp4_pre_quant( y: Optional[torch.Tensor] = None, config: Optional[dict] = None, ): + """ + Computes matrix multiplication Y = X @ W^T with on-the-fly FP4 quantization of activations. + X is quantized to MXFP4 during computation, W is pre-quantized FP4. Uses atomic operations for split-K reduction. + + Args: + x (torch.Tensor): Higher precision input matrix with shape (M, K) (BF16 or FP16). + Quantized to FP4 E2M1 on-the-fly during GEMM. + w (torch.Tensor): FP4 E2M1 weight matrix with shape (N, K), internally transposed. + w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N, K//32). + One scale per 32 elements in K dimension. + dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). + y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). + Must be zero-initialized for atomic operations. + config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT). + + Returns: + torch.Tensor: Output with shape (M, N). + """ + _LOGGER.info( - "gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to gemm_a16wfp4" + f"GEMM_AFP4WFP4_PRE_QUANT_ATOMIC: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w_scales.shape)} " + ) + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + M, K = x.shape + N, K = w.shape + + # inner kernel expects (K, N) + w = w.T + + if y is None: + y = torch.zeros((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + _gemm_afp4_wfp4_pre_quant_kernel[grid]( + x, + w, + y, + w_scales, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + 0, + y.stride(0), + y.stride(1), + w_scales.stride(0), + w_scales.stride(1), + **config, ) - return gemm_a16wfp4(x, w, w_scales, True, dtype, y, config) + + return y diff --git a/op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py b/op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py similarity index 97% rename from op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py rename to op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py index fd92be7a0e..16a0c8805f 100644 --- a/op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py +++ b/op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py @@ -2,8 +2,8 @@ import torch import triton import math -from aiter.op_tests.triton_tests.test_batched_gemm_a16wfp4 import ( - generate_batched_gemm_a16wfp4_inputs, +from op_tests.triton_tests.test_batched_gemm_afp4wfp4_pre_quant import ( + generate_batched_gemm_afp4wfp4_pre_quant_inputs, ) from op_tests.op_benchmarks.triton.utils.argparse import ( get_parser, diff --git a/op_tests/triton_tests/test_batched_gemm_a16wfp4.py b/op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py similarity index 92% rename from op_tests/triton_tests/test_batched_gemm_a16wfp4.py rename to op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py index 7fe8df7e7b..09094f513e 100755 --- a/op_tests/triton_tests/test_batched_gemm_a16wfp4.py +++ b/op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py @@ -1,7 +1,7 @@ import torch import pytest -from aiter.ops.triton.batched_gemm_a16wfp4 import ( - batched_gemm_a16wfp4, +from aiter.ops.triton.batched_gemm_afp4wfp4_pre_quant import ( + batched_gemm_afp4wfp4_pre_quant, ) import aiter.ops.triton.utils._triton.arch_info as arch_info @@ -9,7 +9,9 @@ SCALE_GROUP_SIZE = 32 -def generate_batched_gemm_a16wfp4_inputs(B, M, N, K, dtype, layout="TN", output=False): +def generate_batched_gemm_afp4wfp4_pre_quant_inputs( + B, M, N, K, dtype, layout="TN", output=False +): """ Returns: - x: (B, M, K) @@ -173,18 +175,20 @@ def run_torch(x, w, w_scales, dtype): @pytest.mark.parametrize("B, M, N, K", get_x_vals()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) -def test_batched_gemm_a16wfp4(B: int, M: int, N: int, K: int, layout, dtype): +def test_batched_gemm_afp4_wfp4_pre_quant( + B: int, M: int, N: int, K: int, layout, dtype +): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") torch.cuda.empty_cache() # Helps avoid hangs in large tests - x, w, x_scales, w_scales, out = generate_batched_gemm_a16wfp4_inputs( + x, w, x_scales, w_scales, out = generate_batched_gemm_afp4wfp4_pre_quant_inputs( B, M, N, K, dtype, layout=layout, output=True ) torch_out = run_torch(x, w, w_scales, dtype).to(dtype) - batched_gemm_a16wfp4(x, w, w_scales, dtype, out, transpose_bm=False, prequant=True) + batched_gemm_afp4wfp4_pre_quant(x, w, w_scales, dtype, out) torch.testing.assert_close(torch_out, out) diff --git a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py deleted file mode 100644 index aecc1cede6..0000000000 --- a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py +++ /dev/null @@ -1,149 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -import torch -import triton -import pytest -from aiter.ops.triton.fused_gemm_afp4wfp4_a16w16 import ( - fused_gemm_afp4wfp4_a16w16, -) -from op_tests.triton_tests.test_gemm_afp4wfp4 import ( - generate_gemm_afp4wfp4_inputs, -) -from op_tests.triton_tests.test_gemm_afp4wfp4 import run_torch as run_torch_fp4 -from op_tests.triton_tests.test_gemm_a16w16 import generate_gemm_a16w16_inputs -import torch.nn.functional as F -import aiter.ops.triton.utils._triton.arch_info as arch_info - - -def run_torch( - x_fp4, - w_fp4, - x_fp4_scale, - w_fp4_scale, - x_bf16, - w_bf16, - bias_fp4, - bias_bf16, - dtype=torch.bfloat16, -): - y_fp4 = run_torch_fp4(x_fp4, w_fp4, x_fp4_scale, w_fp4_scale, dtype) - if bias_fp4 is not None: - y_fp4 += bias_fp4 - y_bf16 = F.linear(x_bf16, w_bf16, bias=bias_bf16) - return y_fp4.to(dtype), y_bf16.to(dtype) - - -def run_triton( - x_fp4, - w_fp4, - x_fp4_scale, - w_fp4_scale, - x_bf16, - w_bf16, - bias_fp4, - bias_bf16, - dtype=torch.bfloat16, - y_fp4=None, - y_bf16=None, - skip_reduce=False, - is_fp4_preshuffled=True, -): - return fused_gemm_afp4wfp4_a16w16( - x_fp4, - w_fp4, - x_fp4_scale, - w_fp4_scale, - x_bf16, - w_bf16, - is_fp4_preshuffled=is_fp4_preshuffled, - bias_fp4=bias_fp4, - bias_bf16=bias_bf16, - dtype=dtype, - y_fp4=y_fp4, - y_bf16=y_bf16, - skip_reduce=skip_reduce, - ) - - -def get_x_vals(): - - x_vals = [ - (m, n1, n2, k) - for k in [1024, 8192, 7168] - for n2 in [256, 512] - for n1 in [256, 512] - for m in [1, 8, 32, 64, 128, 8192] - ] - return x_vals - - -@pytest.mark.parametrize("M, N1, N2, K", get_x_vals()) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("output", [True, False]) -@pytest.mark.parametrize("skip_reduce", [True, False]) -@pytest.mark.parametrize("fp4_shuffle", [True, False]) -def test_gemm(dtype, M, N1, N2, K, output, skip_reduce, fp4_shuffle): - - if not (arch_info.is_fp4_avail()): - pytest.skip("MXFP4 not supported on this architecture") - - ( - x_fp4, - w_fp4, - w_fp4_triton, - x_fp4_scale, - w_fp4_scale, - x_fp4_scale_triton, - w_fp4_scale_triton, - out_dtype, - y_fp4, - ) = generate_gemm_afp4wfp4_inputs( - M, - N1, - K, - dtype, - layout="TN", - output=output, - shuffle_scales_fg=fp4_shuffle, - shuffle_weight_fg=fp4_shuffle, - ) - - x_bf16, w_bf16, bias_bf16, _, y_bf16 = generate_gemm_a16w16_inputs( - M, N2, K, dtype, output=output, bias=True - ) - bias_bf16 = torch.randn((N2,), dtype=bias_bf16.dtype, device=bias_bf16.device) - bias_fp4 = torch.randn((N1,), dtype=bias_bf16.dtype, device=bias_bf16.device) - y_torch_fp4, y_torch_bf16 = run_torch( - x_fp4, - w_fp4, - x_fp4_scale, - w_fp4_scale, - x_bf16, - w_bf16, - bias_fp4, - bias_bf16, - dtype, - ) - y_triton_fp4, y_triton_bf16 = run_triton( - x_fp4, - w_fp4_triton, - x_fp4_scale_triton, - w_fp4_scale_triton, - x_bf16, - w_bf16, - bias_fp4, - bias_bf16, - dtype, - y_fp4, - y_bf16, - skip_reduce=skip_reduce, - is_fp4_preshuffled=fp4_shuffle, - ) - - if y_triton_fp4.dim() == 3: - y_triton_fp4 = y_triton_fp4.sum(axis=0).to(dtype=dtype) - y_triton_bf16 = y_triton_bf16.sum(axis=0).to(dtype=dtype) - - triton.testing.assert_close(y_torch_bf16, y_triton_bf16, atol=0.1, rtol=1e-1) - triton.testing.assert_close(y_torch_fp4, y_triton_fp4, atol=0.1, rtol=1e-1) diff --git a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py deleted file mode 100644 index e75eeed64f..0000000000 --- a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py +++ /dev/null @@ -1,158 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -import pytest -import os -import torch -from aiter.ops.triton.fused_gemm_afp4wfp4_mul_add import ( - fused_gemm_afp4wfp4_mul_add, - fused_gemm_afp4wfp4_preshuffle_add_mul, -) -import aiter.ops.triton.utils._triton.arch_info as arch_info -from op_tests.triton_tests.test_gemm_afp4wfp4 import generate_gemm_afp4wfp4_inputs -from op_tests.triton_tests.test_gemm_afp4wfp4 import ( - run_torch as run_torch_gemm_afp4wfp4, -) -from op_tests.triton_tests.test_fused_mul_add import generate_fused_mul_add_inputs -from op_tests.triton_tests.test_fused_mul_add import ( - run_torch as run_torch_fused_mul_add, -) - - -def get_x_vals(): - - x_vals = [(1024 * v, 1024 * v, 1024 * v) for v in range(1, 9)] - x_vals += [(1, 1, 32)] # minimal case - return x_vals - - -@pytest.mark.parametrize("M, N, K", get_x_vals()) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("layout", ["TN"]) -@pytest.mark.parametrize("output", [True, False]) -@pytest.mark.parametrize( - "shuffle_weight_scales", - [True, False], -) -@pytest.mark.parametrize( - "a_type_is_scalar", - [(float, True), (int, True), (torch.Tensor, True), (torch.Tensor, False)], -) -@pytest.mark.parametrize( - "b_type_is_scalar", - # [(float, True), (int, True), (torch.Tensor, True), (torch.Tensor, False)], - [(torch.Tensor, False)], -) -@pytest.mark.parametrize( - "fuse_type", - [0, 1], -) -def test_fused_gemm_afp4wfp4_mul_add( - M: int, - N: int, - K: int, - dtype, - layout, - output, - shuffle_weight_scales, - a_type_is_scalar, - b_type_is_scalar, - fuse_type, -): - if not (arch_info.is_fp4_avail()): - pytest.skip("MXFP4 not supported on this architecture") - - if shuffle_weight_scales: - if N % 32 > 0: - pytest.skip( - f"N = {N} is not divisible by 32, skip this test for preshuffled weight/scales tests" - ) - elif K % 256 > 0: - pytest.skip( - f"K = {K} is not divisible by 256, skip this test for preshuffled weight/scales tests" - ) - - ( - x, - w, - w_triton, - x_scales, - w_scales, - x_scales_triton, - w_scales_triton, - out_dtype, - y, - ) = generate_gemm_afp4wfp4_inputs( - M, - N, - K, - dtype, - layout=layout, - output=output, - shuffle_scales_fg=shuffle_weight_scales, - shuffle_weight_fg=shuffle_weight_scales, - ) - _, a, b = generate_fused_mul_add_inputs( - [M, N], a_type_is_scalar, b_type_is_scalar, dtype - ) - - if fuse_type == 0: - torch_out = run_torch_fused_mul_add( - run_torch_gemm_afp4wfp4(x, w, x_scales, w_scales, torch.float32), a, b - ).to(dtype) - else: - torch_out = run_torch_fused_mul_add( - b, a, run_torch_gemm_afp4wfp4(x, w, x_scales, w_scales, torch.float32) - ).to(dtype) - - if shuffle_weight_scales: - if output: - triton_out = fused_gemm_afp4wfp4_preshuffle_add_mul( - x, - w_triton, - x_scales_triton, - w_scales_triton, - a, - b, - dtype, - y, - use_aot=(dtype == torch.bfloat16 and layout == "TN"), - fuse_type=fuse_type, - ) - else: - triton_out = fused_gemm_afp4wfp4_preshuffle_add_mul( - x, - w_triton, - x_scales_triton, - w_scales_triton, - a, - b, - dtype, - use_aot=(dtype == torch.bfloat16 and layout == "TN"), - fuse_type=fuse_type, - ) - else: - if output: - triton_out = fused_gemm_afp4wfp4_mul_add( - x, - w_triton, - x_scales_triton, - w_scales_triton, - a, - b, - dtype, - y, - fuse_type=fuse_type, - ) - else: - triton_out = fused_gemm_afp4wfp4_mul_add( - x, - w_triton, - x_scales_triton, - w_scales_triton, - a, - b, - dtype, - fuse_type=fuse_type, - ) - - torch.testing.assert_close(torch_out, triton_out, atol=0.1, rtol=0.1) diff --git a/op_tests/triton_tests/test_fused_mul_add.py b/op_tests/triton_tests/test_fused_mul_add.py index fdd8d783c6..a8ffe86791 100644 --- a/op_tests/triton_tests/test_fused_mul_add.py +++ b/op_tests/triton_tests/test_fused_mul_add.py @@ -3,7 +3,7 @@ from aiter.ops.triton.fused_mul_add import fused_mul_add -def generate_fused_mul_add_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype): +def generate_qk_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype): x = torch.randn(*shape, dtype=dtype, device="cuda") if a_type_is_scalar[1]: @@ -29,7 +29,7 @@ def generate_fused_mul_add_inputs(shape, a_type_is_scalar, b_type_is_scalar, dty return x, a, b -def run_torch(x, a, b): +def ref_mul_add(x, a, b): return (a * x.to(torch.float32) + b).to(x.dtype) @@ -50,11 +50,9 @@ def test_mul_add(shape, a_type_is_scalar, b_type_is_scalar, output: bool, dtype) torch.cuda.empty_cache() # Helps avoid hangs in large tests - x, a, b = generate_fused_mul_add_inputs( - shape, a_type_is_scalar, b_type_is_scalar, dtype - ) + x, a, b = generate_qk_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype) - x_torch = run_torch(x, a, b).clone() + x_torch = ref_mul_add(x, a, b).clone() if output: x_triton = torch.empty_like(x) fused_mul_add(x, a, b, x_triton) diff --git a/op_tests/triton_tests/test_fused_mxfp4_quant.py b/op_tests/triton_tests/test_fused_mxfp4_quant.py index 0ee8daa171..8c66a8aa26 100644 --- a/op_tests/triton_tests/test_fused_mxfp4_quant.py +++ b/op_tests/triton_tests/test_fused_mxfp4_quant.py @@ -1,10 +1,8 @@ import torch -import torch.nn.functional as F import pytest from aiter.ops.triton.fused_mxfp4_quant import ( fused_flatten_mxfp4_quant, fused_rms_mxfp4_quant, - fused_reduce_act_mul_and_mxfp4_quant, ) from op_tests.triton_tests.test_quant_mxfp4 import torch_dynamic_mxfp4_quant from op_tests.triton_tests.test_gemm_afp4wfp4 import ( @@ -13,7 +11,6 @@ SCALE_GROUP_SIZE, ) from op_tests.triton_tests.test_gemm_afp4wfp4 import shuffle_scales, un_shuffle_scales -import aiter.ops.triton.utils._triton.arch_info as arch_info torch.manual_seed(0) @@ -36,7 +33,6 @@ def calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, eps=1e-6, shuffle=F x1 = res1_out = x1 + resid1 res1_out = res1_out.to(orig_dtype) x1 = rmsnorm(x1, rms1_w, eps) - out1 = x1.to(orig_dtype) out1_fp4, out1_scale = torch_dynamic_mxfp4_quant(x1) out2 = None @@ -59,7 +55,7 @@ def calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, eps=1e-6, shuffle=F out1_scale = shuffle_scales(out1_scale_pad) out1_scale = out1_scale.view(out1_scale.shape[0] * 32, -1) - return (out1_fp4, out1_scale), out1, out2, res1_out + return (out1_fp4, out1_scale), out2, res1_out def convert_mxfp4_to_fp32(x, x_scales): @@ -100,10 +96,6 @@ def generate_fused_rms_quant_data( @pytest.mark.parametrize("N", [32, 64, 128]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) def test_flatten_quant(B: int, M: int, N: int, dtype): - - if not (arch_info.is_fp4_avail()): - pytest.skip("MXFP4 not supported on this architecture") - torch.cuda.empty_cache() # Helps avoid hangs in large tests x = torch.randn((B, M, N), dtype=dtype, device="cuda").transpose(0, 1) @@ -143,10 +135,6 @@ def test_fused_rms_quant( shuffle: bool, scale_shuffle_padding: bool, ): - - if not (arch_info.is_fp4_avail()): - pytest.skip("MXFP4 not supported on this architecture") - torch.cuda.empty_cache() # Helps avoid hangs in large tests x1, x2, rms1_w, rms2_w, resid1 = generate_fused_rms_quant_data( x1_shape=(M, N1), @@ -157,11 +145,11 @@ def test_fused_rms_quant( res1=res1, dtype=dtype, ) - (y1_fp4_torch, y1_scales_torch), y1_torch, y2_torch, y1_res_torch = ( + (x1_fp4_torch, x1_scales_torch), x2_torch, res1_out_torch = ( calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, shuffle=shuffle) ) - (y1_fp4_triton, y1_scales_triton), y1_triton, y2_triton, y1_res_triton = ( + (x1_fp4_triton, x1_scales_triton), x2_triton, res1_out_triton = ( fused_rms_mxfp4_quant( x1, rms1_w, @@ -172,152 +160,28 @@ def test_fused_rms_quant( resid1, shuffle=shuffle, scale_shuffle_padding=scale_shuffle_padding, - output_unquantized_inp1=True, ) ) - if y1_triton is not None: - torch.testing.assert_close(y1_torch, y1_triton) - if shuffle: - y1_scales_triton = un_shuffle_scales( - y1_scales_triton.view(y1_scales_triton.shape[0] // 32, -1) + x1_scales_triton = un_shuffle_scales( + x1_scales_triton.view(x1_scales_triton.shape[0] // 32, -1) ) - y1_scales_torch = un_shuffle_scales( - y1_scales_torch.view(y1_scales_torch.shape[0] // 32, -1) + x1_scales_torch = un_shuffle_scales( + x1_scales_torch.view(x1_scales_torch.shape[0] // 32, -1) ) scaleN_valid = (N1 + 31) // 32 - y1_scales_triton = y1_scales_triton[:M, :scaleN_valid] - y1_scales_torch = y1_scales_torch[:M, :scaleN_valid] - - if y2_triton is not None: - torch.testing.assert_close(y2_torch, y2_triton) - - if y1_res_triton is not None: - torch.testing.assert_close(y1_res_torch, y1_res_triton) - - y1_fp32_torch = convert_mxfp4_to_fp32(y1_fp4_torch, y1_scales_torch) - y1_fp32_triton = convert_mxfp4_to_fp32(y1_fp4_triton, y1_scales_triton) - - torch.testing.assert_close(y1_fp32_torch, y1_fp32_triton) - - -def run_torch_reduce_act_mul_mxfp4_group_quant(x, x2, activation, dtype, shuffle): - x = x.to(torch.float32) - d = x.shape[-1] // 2 - y2 = None - if x.dim() == 3: - x = x.sum(axis=0) - y2 = x2.sum(axis=0).to(dtype=dtype) - else: - assert x2 is None, "x2 must be None in x.dim() == 2 cases" - x, x_mul = x.split([d, d], dim=-1) - if activation == "silu": - out = F.silu(x) * x_mul - elif activation == "gelu": - out = F.gelu(x) * x_mul - out, out_scale = torch_dynamic_mxfp4_quant(out) - if shuffle: - # out_scale_pad = out_scale - M = out_scale.shape[0] - N = out.shape[1] * 2 - scaleM = (M + 255) // 256 * 256 - scaleN_valid = (N + 31) // 32 - scaleN = (scaleN_valid + 7) // 8 * 8 - out_scale_pad = torch.empty( - (scaleM, scaleN), dtype=out_scale.dtype, device=out_scale.device - ) - out_scale_pad[:M, :scaleN] = out_scale[:M, :scaleN] - out_scale = shuffle_scales(out_scale_pad) - out_scale = out_scale.view(out_scale.shape[0] * 32, -1) - return (out, out_scale), y2 - - -def generate_fused_reduce_act_mul_mxfp4_group_quant( - M: int, - N1: int, - dtype=torch.bfloat16, - SPK: int = 1, - N2: int = 1, -): - if SPK == 1: - x = torch.randn((M, N1 * 2), dtype=dtype).cuda() / 10 - else: - x = torch.randn((SPK, M, N1 * 2), dtype=torch.float32).cuda() / 10 - x2 = None - if SPK > 1: - x2 = torch.randn((SPK, M, N2), dtype=torch.float32).cuda() / 10 + x1_scales_triton = x1_scales_triton[:M, :scaleN_valid] + x1_scales_torch = x1_scales_torch[:M, :scaleN_valid] - return x, x2 - - -@pytest.mark.parametrize( - "M, N1, N2", - [ - (1, 256, 256), - (2, 256, 256), - (4, 256, 256), - (32, 256, 256), - (1, 4, 256), - (1, 28, 256), - (1, 32, 256), - (1, 64, 256), - (1, 68, 256), - (128, 28, 256), - (128, 32, 256), - (128, 64, 256), - (128, 68, 256), - (256, 32, 256), - ], -) -@pytest.mark.parametrize("SPK", [1, 4]) -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) -@pytest.mark.parametrize("activation", ["silu", "gelu"]) -@pytest.mark.parametrize("shuffle", [False, True]) -@pytest.mark.parametrize("scale_shuffle_padding", [False, True]) -def test_fused_reduce_act_mul_mxfp4_group_quant( - M: int, - N1: int, - N2: int, - SPK: int, - dtype, - activation: str, - shuffle: bool, - scale_shuffle_padding: bool, -): - if not (arch_info.is_fp4_avail()): - pytest.skip("MXFP4 not supported on this architecture") - - if shuffle and (N1 * 2) % 512 != 0: - pytest.skip() - - x, x2 = generate_fused_reduce_act_mul_mxfp4_group_quant( - M, N1, dtype=dtype, SPK=SPK, N2=N2 - ) - - (y_q_torch, y_s_torch), y2_torch = run_torch_reduce_act_mul_mxfp4_group_quant( - x, x2, activation, dtype=dtype, shuffle=shuffle - ) - - (y_q_triton, y_s_triton), y2_triton = fused_reduce_act_mul_and_mxfp4_quant( - x, - activation=activation, - x2=x2, - shuffle=shuffle, - scale_shuffle_padding=scale_shuffle_padding, - dtype=dtype, - ) - - if shuffle: - y_s_triton = un_shuffle_scales(y_s_triton.view(y_s_triton.shape[0] // 32, -1)) - y_s_torch = un_shuffle_scales(y_s_torch.view(y_s_torch.shape[0] // 32, -1)) + if x2_triton is not None: + torch.testing.assert_close(x2_torch, x2_triton) - torch.testing.assert_close(y2_torch, y2_triton, atol=0.1, rtol=0.1) + if res1_out_triton is not None: + torch.testing.assert_close(res1_out_torch, res1_out_triton) - scaleN_valid = (N1 // 2 + 31) // 32 - y_s_triton = y_s_triton[:M, :scaleN_valid] - y_s_torch = y_s_torch[:M, :scaleN_valid] + res_fp32_torch = convert_mxfp4_to_fp32(x1_fp4_torch, x1_scales_torch) + res_fp32_triton = convert_mxfp4_to_fp32(x1_fp4_triton, x1_scales_triton) - torch.testing.assert_close(y_q_triton, y_q_torch) - torch.testing.assert_close(y_s_triton, y_s_torch) + torch.testing.assert_close(res_fp32_torch, res_fp32_triton) diff --git a/op_tests/triton_tests/test_gemm_afp4wfp4.py b/op_tests/triton_tests/test_gemm_afp4wfp4.py index 98325c8e89..7f79d2c536 100644 --- a/op_tests/triton_tests/test_gemm_afp4wfp4.py +++ b/op_tests/triton_tests/test_gemm_afp4wfp4.py @@ -5,7 +5,8 @@ import torch from aiter.ops.triton.gemm_afp4wfp4 import ( gemm_afp4wfp4, - gemm_afp4wfp4_preshuffle, + gemm_afp4wfp4_preshuffled_scales, + gemm_afp4wfp4_preshuffled_weight_scales, ) import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.types import str_to_torch_dtype @@ -229,16 +230,22 @@ def run_torch(x, w, x_scales, w_scales, dtype): @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) @pytest.mark.parametrize("output", [True, False]) @pytest.mark.parametrize( - "shuffle_weight_scales", - [True, False], + "shuffle_scales_fg, shuffle_weight_fg", + [(False, False), (True, False), (True, True)], ) def test_gemm_afp4_wfp4( - M: int, N: int, K: int, dtype, layout, output, shuffle_weight_scales + M: int, N: int, K: int, dtype, layout, output, shuffle_scales_fg, shuffle_weight_fg ): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") - if shuffle_weight_scales: + if shuffle_weight_fg and not shuffle_scales_fg: + pytest.skip("Preshuffling weight without preshuffled scales is not supported") + + if shuffle_weight_fg or shuffle_scales_fg: + if shuffle_scales_fg and not shuffle_weight_fg and M < 32: + pytest.skip("Minimal tile size for preshuffled scales is 32x32x256") + if N % 32 > 0: pytest.skip( f"N = {N} is not divisible by 32, skip this test for preshuffled weight/scales tests" @@ -265,15 +272,15 @@ def test_gemm_afp4_wfp4( dtype, layout=layout, output=output, - shuffle_scales_fg=shuffle_weight_scales, - shuffle_weight_fg=shuffle_weight_scales, + shuffle_scales_fg=shuffle_scales_fg, + shuffle_weight_fg=shuffle_weight_fg, ) torch_out = run_torch(x, w, x_scales, w_scales, dtype).to(dtype) - if shuffle_weight_scales: + if shuffle_scales_fg and shuffle_weight_fg: if output: - triton_out = gemm_afp4wfp4_preshuffle( + triton_out = gemm_afp4wfp4_preshuffled_weight_scales( x, w_triton, x_scales_triton, @@ -283,7 +290,7 @@ def test_gemm_afp4_wfp4( use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) else: - triton_out = gemm_afp4wfp4_preshuffle( + triton_out = gemm_afp4wfp4_preshuffled_weight_scales( x, w_triton, x_scales_triton, @@ -291,15 +298,15 @@ def test_gemm_afp4_wfp4( dtype, use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) - # TODO: remove in the future - # if output: - # triton_out = gemm_afp4wfp4_preshuffled_scales( - # x, w_triton, x_scales_triton, w_scales_triton, dtype, y - # ) - # else: - # triton_out = gemm_afp4wfp4_preshuffled_scales( - # x, w_triton, x_scales_triton, w_scales_triton, dtype - # ) + elif shuffle_scales_fg and not shuffle_weight_fg: + if output: + triton_out = gemm_afp4wfp4_preshuffled_scales( + x, w_triton, x_scales_triton, w_scales_triton, dtype, y + ) + else: + triton_out = gemm_afp4wfp4_preshuffled_scales( + x, w_triton, x_scales_triton, w_scales_triton, dtype + ) else: if output: triton_out = gemm_afp4wfp4( diff --git a/op_tests/triton_tests/test_gemm_a16wfp4.py b/op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py similarity index 83% rename from op_tests/triton_tests/test_gemm_a16wfp4.py rename to op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py index 50618c963b..87b887d07c 100644 --- a/op_tests/triton_tests/test_gemm_a16wfp4.py +++ b/op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py @@ -1,19 +1,17 @@ import torch import pytest -from aiter.ops.triton.gemm_a16wfp4 import gemm_a16wfp4 +from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant import aiter.ops.triton.utils._triton.arch_info as arch_info # Note this is specified by the HW and cannot be changed. SCALE_GROUP_SIZE = 32 -def generate_gemm_a16wfp4_inputs( +def generate_gemm_afp4wfp4_pre_quant_inputs( M: int, N: int, K: int, output: bool, - atomic_add: bool, - dtype: bool, layout: str = "TN", ): torch.manual_seed(5) @@ -51,8 +49,7 @@ def generate_gemm_a16wfp4_inputs( y = None if output: - dtype = torch.float32 if atomic_add else dtype - y = torch.zeros((M, N), device=x.device, dtype=dtype) + y = torch.zeros((M, N), device=x.device, dtype=torch.float32) return x, w, x_scales, w_scales, y @@ -92,9 +89,8 @@ def get_x_vals(): x_vals += [(2 ** (v - 1), 4096 * v, 4096 * v) for v in range(1, 6)] x_vals += [(16, 16384, 3328 * 2), (128, 16384, 3328 * 2)] x_vals += [(32, 512, 7168)] + x_vals += [(1, 1, SCALE_GROUP_SIZE)] # minimal case x_vals += [(1, 1280, 8192)] - x_vals += [(v, 7168, 2048) for v in [1, 4, 8, 32, 64, 128]] - # x_vals += [(1, 1, SCALE_GROUP_SIZE)] # minimal case, TODO: fix return x_vals @@ -147,10 +143,7 @@ def run_torch(x, w, w_scales, dtype): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) @pytest.mark.parametrize("output", [True, False]) -@pytest.mark.parametrize("atomic_add", [True, False]) -def test_gemm_a16wfp4( - M: int, N: int, K: int, dtype, layout, output: bool, atomic_add: bool -): +def test_gemm_afp4_wfp4_pre_quant(M: int, N: int, K: int, dtype, layout, output: bool): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") @@ -160,16 +153,13 @@ def test_gemm_a16wfp4( if M == 4864 and N == 8192 and K == 4160: pytest.skip("Skipping this config. due to compilation error.") - x, w, _, w_scales, y = generate_gemm_a16wfp4_inputs( - M, N, K, output=output, atomic_add=atomic_add, dtype=dtype, layout=layout + x, w, _, w_scales, y = generate_gemm_afp4wfp4_pre_quant_inputs( + M, N, K, layout=layout, output=output ) - y_dtype = torch.float32 if atomic_add else dtype if output: - y = gemm_a16wfp4(x, w, w_scales, atomic_add=atomic_add, dtype=y_dtype, y=y).to( - dtype - ) + y = gemm_afp4wfp4_pre_quant(x, w, w_scales, torch.float32, y).to(dtype) else: - y = gemm_a16wfp4(x, w, w_scales, atomic_add=atomic_add, dtype=y_dtype).to(dtype) + y = gemm_afp4wfp4_pre_quant(x, w, w_scales, torch.float32).to(dtype) torch_out = run_torch(x, w, w_scales, dtype).to(dtype)