diff --git a/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py b/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py index 04157064bb..d17ad95af0 100644 --- a/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py +++ b/aiter/ops/triton/_triton_kernels/fused_mxfp4_quant.py @@ -10,16 +10,24 @@ def _rmsmorm_op(row, weight, n_cols, epsilon): row_norm = tl.sum(row_norm, axis=-1) norm_factor = tl.math.rsqrt((row_norm / n_cols) + epsilon) - rms_norm = row * norm_factor * weight + rms_norm = row * norm_factor[:, None] * weight return rms_norm +@triton.heuristics( + { + "EVEN_M_N": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0 + and args["N1"] % (args["BLOCK_SIZE_N"]) == 0, + "EVEN_M_N2": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0 + and args["N2"] % (args["BLOCK_SIZE_N2"]) == 0, + } +) @triton.jit def _fused_rms_mxfp4_quant_kernel( - inp1_ptr, - weight1_ptr, - inp2_ptr, - weight2_ptr, + x1_ptr, + w1_ptr, + x2_ptr, + w2_ptr, res1_ptr, out1_fp4_ptr, out1_bs_ptr, @@ -27,80 +35,177 @@ def _fused_rms_mxfp4_quant_kernel( out_res1_ptr, eps1, eps2, - n_rows, - inp1_n_cols, - inp2_n_cols, - inp1_row_stride, - inp2_row_stride, - res1_row_stride, - out1_fp4_row_stride, - out1_bs_row_stride, - out1_bs_col_stride, - out2_row_stride, - out_res1_row_stride, - BLOCK_SIZE: tl.constexpr, + M, + N1, + N2, + x1_stride_m, + x2_stride_m, + res1_stride_m, + out1_fp4_stride_m, + out1_bs_stride_m, + out1_bs_stride_n, + out2_stride_m, + out_res1_stride_m, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_N2: tl.constexpr, MXFP4_QUANT_BLOCK_SIZE: tl.constexpr, - SKIP_SECOND_INPUT: tl.constexpr, + HAS_SECOND_INPUT: tl.constexpr, FIRST_INPUT_RES: tl.constexpr, + SCALE_N: tl.constexpr, + SCALE_M_PAD: tl.constexpr, + SCALE_N_PAD: tl.constexpr, + SHUFFLE: tl.constexpr, + SHUFFLE_PAD: tl.constexpr, + EVEN_M_N: tl.constexpr, + EVEN_M_N2: tl.constexpr, ): + # TODO: XCD remapping where every 32-token block should share the same XCD + # TODO: debug for large M + # TODO: investigate cache_modifier='.cg' on tl.store pid = tl.program_id(0) - NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE // MXFP4_QUANT_BLOCK_SIZE - block_inds = tl.arange(0, BLOCK_SIZE) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + + if pid >= num_pid_m: + if HAS_SECOND_INPUT: + pid -= num_pid_m + x_offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + x_offs_n2 = tl.arange(0, BLOCK_SIZE_N2) + mask2 = None + other2 = None + if not EVEN_M_N2: + mask2 = (x_offs_m < M)[:, None] & (x_offs_n2 < N2)[None, :] + other2 = 0.0 + + x2 = tl.load( + x2_ptr + x_offs_m[:, None] * x2_stride_m + x_offs_n2[None, :], + mask=mask2, + other=other2, + cache_modifier=".cg", + ).to(tl.float32) + + w_mask2 = None + w_other2 = None + if not EVEN_M_N2: + w_mask2 = x_offs_n2 < N2 + w_other2 = 0.0 + + w2 = tl.load(w2_ptr + x_offs_n2, mask=w_mask2, other=w_other2).to( + tl.float32 + ) + + norm2 = _rmsmorm_op(x2, w2, N2, eps2) - mask1 = block_inds < inp1_n_cols - inp1 = tl.load( - inp1_ptr + pid * inp1_row_stride + block_inds, + tl.store( + out2_ptr + x_offs_m[:, None] * out2_stride_m + x_offs_n2[None, :], + norm2.to(out2_ptr.type.element_ty), + mask=mask2, + cache_modifier=".cg", + ) + return + + x_offs_n = tl.arange(0, BLOCK_SIZE_N) + NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE_N // MXFP4_QUANT_BLOCK_SIZE + x_offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + + mask1 = None + other1 = None + if not EVEN_M_N: + mask1 = (x_offs_m < M)[:, None] & (x_offs_n < N1)[None, :] + other1 = 0.0 + + x1 = tl.load( + x1_ptr + x_offs_m[:, None] * x1_stride_m + x_offs_n[None, :], mask=mask1, - other=0.0, + other=other1, cache_modifier=".cg", ).to(tl.float32) + if FIRST_INPUT_RES: res1 = tl.load( - res1_ptr + pid * res1_row_stride + block_inds, + res1_ptr + x_offs_m[:, None] * res1_stride_m + x_offs_n[None, :], mask=mask1, - other=0.0, + other=other1, cache_modifier=".cg", ).to(tl.float32) - inp1 = inp1 + res1 + x1 = x1 + res1 + + w_mask1 = None + w_other1 = None + if not EVEN_M_N: + w_mask1 = x_offs_n < N1 + w_other1 = 0.0 - w1 = tl.load(weight1_ptr + block_inds, mask=mask1, other=0.0).to(tl.float32) + w1 = tl.load(w1_ptr + x_offs_n, mask=w_mask1, other=w_other1).to(tl.float32) - norm1 = _rmsmorm_op(inp1, w1, inp1_n_cols, eps1) - out1_fp4, out1_block_scales = _mxfp4_quant_op( - norm1, BLOCK_SIZE, 1, MXFP4_QUANT_BLOCK_SIZE + norm1 = _rmsmorm_op(x1, w1, N1, eps1) + out1_fp4, bs_e8m0 = _mxfp4_quant_op( + norm1, BLOCK_SIZE_N, BLOCK_SIZE_M, MXFP4_QUANT_BLOCK_SIZE ) - out1_fp4 = tl.ravel(out1_fp4) - out1_block_scales = tl.ravel(out1_block_scales) # store the results - half_block_inds = tl.arange(0, BLOCK_SIZE // 2) + half_x_offs_n = tl.arange(0, BLOCK_SIZE_N // 2) + out_mask1 = None + if not EVEN_M_N: + out_mask1 = (x_offs_m < M)[:, None] & (half_x_offs_n < (N1 // 2))[None, :] + tl.store( - out1_fp4_ptr + pid * out1_fp4_row_stride + half_block_inds, + out1_fp4_ptr + x_offs_m[:, None] * out1_fp4_stride_m + half_x_offs_n[None, :], out1_fp4, - mask=half_block_inds < (inp1_n_cols // 2), + mask=out_mask1, + cache_modifier=".cg", ) - bs_inds = tl.arange(0, NUM_QUANT_BLOCKS) - num_bs_cols = (inp1_n_cols + MXFP4_QUANT_BLOCK_SIZE - 1) // MXFP4_QUANT_BLOCK_SIZE + + bs_offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + bs_offs_n = tl.arange(0, NUM_QUANT_BLOCKS) + num_bs_cols = (N1 + MXFP4_QUANT_BLOCK_SIZE - 1) // MXFP4_QUANT_BLOCK_SIZE + if SHUFFLE: + bs_offs_0 = bs_offs_m[:, None] // 32 + bs_offs_1 = bs_offs_m[:, None] % 32 + bs_offs_2 = bs_offs_1 % 16 + bs_offs_1 = bs_offs_1 // 16 + bs_offs_3 = bs_offs_n[None, :] // 8 + bs_offs_4 = bs_offs_n[None, :] % 8 + bs_offs_5 = bs_offs_4 % 4 + bs_offs_4 = bs_offs_4 // 4 + bs_offs = ( + bs_offs_1 + + bs_offs_4 * 2 + + bs_offs_2 * 2 * 2 + + bs_offs_5 * 2 * 2 * 16 + + bs_offs_3 * 2 * 2 * 16 * 4 + + bs_offs_0 * 2 * 16 * SCALE_N_PAD + ) + bs_mask_127 = (bs_offs_m < M)[:, None] & (bs_offs_n < num_bs_cols)[None, :] + bs_e8m0 = tl.where(bs_mask_127, bs_e8m0, 127) + else: + bs_offs = ( + bs_offs_m[:, None] * out1_bs_stride_m + + bs_offs_n[None, :] * out1_bs_stride_n + ) + + bs_mask = None + if not EVEN_M_N: + if SHUFFLE_PAD: + bs_mask = (bs_offs_m < SCALE_M_PAD)[:, None] & (bs_offs_n < SCALE_N_PAD)[ + None, : + ] + else: + bs_mask = (bs_offs_m < M)[:, None] & (bs_offs_n < SCALE_N)[None, :] + tl.store( - out1_bs_ptr + pid * out1_bs_row_stride + bs_inds * out1_bs_col_stride, - out1_block_scales, - mask=bs_inds < num_bs_cols, + out1_bs_ptr + bs_offs, + bs_e8m0.to(out1_bs_ptr.type.element_ty), + mask=bs_mask, + cache_modifier=".cg", ) - if not SKIP_SECOND_INPUT: - mask2 = block_inds < inp2_n_cols - inp2 = tl.load( - inp2_ptr + pid * inp2_row_stride + block_inds, - mask=mask2, - other=0.0, - cache_modifier=".cg", - ).to(tl.float32) - w2 = tl.load(weight2_ptr + block_inds, mask=mask2, other=0.0).to(tl.float32) - norm2 = _rmsmorm_op(inp2, w2, inp2_n_cols, eps2) - tl.store(out2_ptr + pid * out2_row_stride + block_inds, norm2, mask=mask2) + if FIRST_INPUT_RES: - inp1 = inp1.to(out_res1_ptr.dtype.element_ty) tl.store( - out_res1_ptr + pid * out_res1_row_stride + block_inds, inp1, mask=mask1 + out_res1_ptr + x_offs_m[:, None] * out_res1_stride_m + x_offs_n[None, :], + x1.to(out_res1_ptr.dtype.element_ty), + mask=mask1, + cache_modifier=".cg", ) diff --git a/aiter/ops/triton/activation.py b/aiter/ops/triton/activation.py index 7bb7bbee15..b52cf465eb 100644 --- a/aiter/ops/triton/activation.py +++ b/aiter/ops/triton/activation.py @@ -19,6 +19,7 @@ def act_mul_and_mxfp4_quant( activation: Literal["silu", "gelu", "gelu_tanh"], scaling_mode: str = "even", shuffle: bool = False, + scale_shuffle_padding: bool = False, ) -> tuple[torch.Tensor, torch.Tensor]: """ Apply the activation function and quantize the result to MX FP4 format. @@ -53,22 +54,18 @@ def act_mul_and_mxfp4_quant( x_fp4 = torch.empty((M, N_half // 2), dtype=torch.uint8, device=x.device) scaleN_valid = triton.cdiv(N_half, MXFP4_QUANT_BLOCK_SIZE) # Setting scale M to be multiple of 256 and scale N to be multiple of 8 - if shuffle: + use_scale_shuffle_padding = shuffle or scale_shuffle_padding + if use_scale_shuffle_padding: scaleM = triton.cdiv(M, 256) * 256 scaleN = triton.cdiv(scaleN_valid, 8) * 8 - blockscale_e8m0 = torch.empty( - (scaleM, scaleN), - dtype=torch.uint8, - device=x.device, - ) else: scaleM = M scaleN = scaleN_valid - blockscale_e8m0 = torch.empty( - (scaleN, scaleM), - dtype=torch.uint8, - device=x.device, - ).T + blockscale_e8m0 = torch.empty( + (scaleM, scaleN), + dtype=torch.uint8, + device=x.device, + ) # for large N values if M <= 32: @@ -116,7 +113,7 @@ def act_mul_and_mxfp4_quant( SCALING_MODE=0, ACTIVATION=activation, scaleN=scaleN_valid, - scaleM_pad=scaleM, + scaleM_pad=(scaleM if use_scale_shuffle_padding else 1), scaleN_pad=scaleN, SHUFFLE=shuffle, NUM_ITER=NUM_ITER, diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=1280-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=1280-K=8192.json new file mode 100644 index 0000000000..e8a1f34311 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=1280-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 8 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 8 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 8 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=14336-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=14336-K=8192.json new file mode 100644 index 0000000000..d439bf813a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=14336-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=2560-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=2560-K=8192.json new file mode 100644 index 0000000000..eb1a181677 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=2560-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 8 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=28672-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=28672-K=8192.json new file mode 100644 index 0000000000..c7d1135e3e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=28672-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=5120-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=5120-K=8192.json new file mode 100644 index 0000000000..54f124a65a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=5120-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=7168-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=7168-K=8192.json new file mode 100644 index 0000000000..223718e553 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=7168-K=8192.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=1024.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=1024.json new file mode 100644 index 0000000000..d8a566b2fc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=1024.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=14336.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=14336.json new file mode 100644 index 0000000000..59a8b79d13 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=14336.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=2048.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=2048.json new file mode 100644 index 0000000000..6646098e8a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=2048.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=3584.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=3584.json new file mode 100644 index 0000000000..3f15ffda64 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=3584.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=4096.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=4096.json new file mode 100644 index 0000000000..3428678a8b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=4096.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=7168.json new file mode 100644 index 0000000000..31e704475b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4_PRESHUFFLED-N=8192-K=7168.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 8 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 9a721b6230..a92d1d94da 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 218b217517..cf000a5aea 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e90d4ba9cf14219bef1bca72767ed05991913eb79484a5b706cb25d9f2f71474", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 43520, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "4971efc8d6396be9b0df4db743227b0777c6aa214766b931a446d515ce1a8695", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3170e87c52 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..053718dae0 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f70c711c78f7418d6182e8cfd2b0d0211ab59b720b83dcbbfd09de4594147fb5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..339d8b60b3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..30a971a865 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "50630419988688add7ab5f7992729c367ac76cd80edb7ed14b1c4f86a6af5938", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0a9d32b9b3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..d36b270856 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "50524344e989cc18ac62628fc02d73d4163eb245c05d3868c90e9efe40f885ea", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..fd439c98f0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..2c41af514d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "900c95ea5068e521cac115dba2f5a39c95629558de25edd9ed355a2bca806bc9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..1c6d43bf8b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..87d5d11a93 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "70389c6a931f80ad9dabdafe6366f140480dc46d3546e1a47f6e8038dffbbcbe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index a76261d24b..fb4cfcdbb1 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 4485032426..ad581d4eb7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e80a3f3a19a5da27236f25e468c4b22caa88c28f65793d17c3d2045fe972817c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "bf4271e7a83f7f7e1c1b4d82c565f2c0599c4cbcf518e758b1992254783f6b47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..f4f2b5f242 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..9d8fbfacef --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "576f5ce01ce40e9047a2afe995e3d1a807b4d8ba89ccfde6875bbcdf1bedc771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a07265a92c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..ca1818a8b5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "87c999f39f544c659c4c4b3649617c5cfab67bc2a5df8a26c6227aa4cf4ea998", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..164a2daa55 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..4aea58cafe --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "7d4eee9825f37b996ae066357854ac66af67d8ff733626eb7ee22f12d90c425a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..11a81135b7 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..b62a824706 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "950f33955df085252de58736ed2fa6ca548cd5920743ecf8df8ce38db3a27bf1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 19aa40e784..caf2d9663c 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 1e5bb1dfae..c3c9f54b2b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "cb12dc32b0ed1a5ac880a6dd3bee50fb59d11e1a8eeccc3ae8153c968e7f2c75", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 13056, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "d794139ad9c7a2aa2f2fb6efaef5771241cd987dd8be123f9af2ee45a41127fa", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0f16aad417 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..0c64b2b637 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "3e3805a7ab6809b0520f39281f72e918acfd2568d5b7d1852b7aa65ff6dede2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d216f4db45 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..1282fab52e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "b54b063171df37071e5d216a95968f9b0071bfc3f1dac8a6507d7d3412b3b2c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4b472f387c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..24580e0921 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "af947e9c60407171fac651b5e73064363155003f3949a1280322f28c0bc82174", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 8dcc5280de..1a354efcee 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index fca09fa225..b81c00af1a 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "347e0c55794ac0ca235e8b969a4b5a5268100a128f24dcce30fe2005b2bc21b1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 16896, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "5dccf75c4c6643db197699c190cf3be8883f25f5c1b82c171c1e1f9a5acf5a54", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index febfd8cf3b..4d9bd9f32e 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 84e66f815f..cf1de878a4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "a729967cd59e3c39a6f61dd259cc2b7cd9768909003d37d03d9dc7dae7280b9e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 52224, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..effc769ed2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..3e7d0dbdf7 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f2d1a87b6a5690047dc9744b4dcbf4e6ca2d231295a0eb21b4de4d2d57d5c452", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3a94d7f3f9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..2df2522e1c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "97cdf5b137cd798fc01173124f1fe7f434603233131809f90b9122692b5e0691", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5c9fa23455 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..f32f36bceb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "7a8df9f76c7249c0727020baab6fc5a45bff3f61821a017af0bcdcb31a158d51", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2b7333aab5 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..6e600e607e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "a098a54b9f5edd21bdbd84ba2ae5bc0f6493e3ae9e7fbe11ba4f755d7d33b2c8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9f9940f169 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..f0dd4491d2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ed379549185fb90aacc990a30e95b69837b80e2ba48fcbe8b2c328df34d911d2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 1620f0f01d..b2a2cfed56 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 56a2dfd70c..779fd69d94 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "a38121d8f5709315553f0016ca0e08c77bfd16fd57e336ed676b85615be00762", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 13056, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "3f103db1b06fbcc5665da2d706abfc42ab421a1e2136147b1fb729db1aa0c1de", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..07d026c9bc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..644acdd15d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c468b044688faad941ae6530c535e4dc5ccab9ec70b273112a98fe310e96fbe5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..517dd3009b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c18ad7066a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "00a3f26ad3db5e526e2ffc540824d99e1c060a1d6a6d27796dc7b6d5e2f28128", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 1, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8192, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..75c2c0f392 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..74fa020cf3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "1f80213aa7e03eafff54b07fffeecfbe5013f46fdaad2a5092c34cccd87c2115", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d96d7107d0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..343d8fdce0 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "06c044aea0bc437a4798835deb75891e2ca4f556f7d00f2f3139a895210cbb8b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index cfdd8d48cc..a69de9121f 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index fc4ab35831..d8d02b17f6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "4af84e6c0b5acb21f71e7f71ab43f43a465dd74734d7c6def0d9fc859c471c1f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "fddfd666ae3dc3d6b62572367cdeafad40d6e4e6bb921f30391f7428b1e1e338", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..786585e8e6 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..638a5fd66a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "674006f4c8ea7904eecc04cb91ea7fd771ffb64c1b070a084ec21c03ffe1f1c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..3f9b8bc0f8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..5cc7a5b2d1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "dc7719c21d6c20b721db205eed9d3b7e5b88c2259f331a26d538e2b9da4193f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..161cc7f778 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..0f45827489 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "40729b4e2ffb586195b561a7924416e04e15972dcf0e08b6be64b5979c49d7be", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index febfd8cf3b..4d9bd9f32e 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 84e66f815f..cf1de878a4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "a729967cd59e3c39a6f61dd259cc2b7cd9768909003d37d03d9dc7dae7280b9e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 52224, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index d5d5dde2c6..2a8f53e1ec 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 046da07114..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "41f673542e895bf56edb8e6a137febf789c28a9da5b4693a1065490a62336656", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 43520, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..94f8f302d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7235ff4fd4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..f2eef37457 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 6ce75e81b2..5f22b90628 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 7b2c5ab8de..aa851bad5e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1b29a972364a81e3844504157096f1a0ca2164836cee9758c885f562921d6f0c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..307d8cda4e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..65d5835012 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..83ae9f44b8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index a43f1de3b1..0703f1c3c5 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ac8df07cd5..441977fcda 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "96e931c980f9bd1b0d7ba209973d637dae985113c68e23d0476ea6a3789b77f4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 13056, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..146ca2148f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index bdcde11a55..625ebad0aa 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 3c5b94ec80..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3a4dedb1720cc3cc439e59bf26afb541bb9591e17d932fe1696e4ac46fe1c376", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 16896, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index ba18fd23df..6720a67345 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8dc6746eff..b94e9b5868 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "518af245b3686a62c8aae8b677a2e83177124a639e544e12c11c00b9797474fd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "69c533376c135f1466f40015aa8dbb2e47737901c0704ebae287d5a6c817625c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 4864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97be1f6400 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..b9d277c8a3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ed3a75371cec725aed630ea65b4fa508941952f5e0c9471fa93107b230a4f03b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9bde266a37 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..0c62a8849a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ee7c8bc727b05b5294121866e6263da1994dc1ee7734e4888045d513b10cb4d4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..59feb2de7f Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..41c58a031b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "98a5f489f33182cd98a113529f0648b30b5042f9318b52363b3e9c54368c2f79", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1d87be86c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..4a266b4bdb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a51b2294a3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..ee907f79de --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cd35a510fafc921b16596ea3787adfac00ce14bd5ca2f8194c08a2d8ce625c63", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index cf679ceb39..a70e0a6ed4 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 62c83cddea..70876a4dfe 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1737c0a38627fe5406a6244d0c66b46e3b98dfd8daf99c31b2c2ab219ffd8249", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ab1b8066b3ca873af3ef3ef52ab68a54be546b27071022826927f25df768ad7f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..0214033b0d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..afda0cc597 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "d5ae75cc2a5e451f4541f51892bebc6bade5f9e0fc50a14924d5df9d3e862ab1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..6da53f98e2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..80d0d783bd --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "dd02aea24f07a469be50cb48315080339bec4331fb29dc3ab324044e4fad83d9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7de88c2416 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..0cac508631 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f2988946eb27a029d039bab8743ccf128ede0f91ff0bdcabd59a3d0a8737b90a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..839ba3f892 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..47c0104cea --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "3d1105e6d7346400d4523456e30259ea586690de52c26e4bd7a2c6fdbd75d2c5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index b34288dac8..e58dac5e89 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 859c3bf7e2..2be5a94c79 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "84f327b5729d25ec4ad344f8a9b211f9c9786815df9873b33e1a44d2cdf8e580", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "7ece21689ec170d622f73f8dd019d6603006f843506aa999a4ba733398455007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..5a5927322e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..23034f50b9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "0d0e2c47e7ca82c5ca8e47b5b51e21ae0139be3b70bc174af2be8545770544e0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..677161f2e2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..e25967cdc2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f9bac69d0a515dec752a6cd3498979c6d2e5fa55f1f20c6c2e68de845e4c0709", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ab66b9aac0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..ba5641bae4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "487a67f0a0313c1afa4b0aa5dbeee4606311eaad808e0e7b69875fcb29b1edb9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index b7504c5898..06164a3817 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 8b0d6ebf34..52d755e6a7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "4f511f2573c219ee1928e586a5facd24ea5ddbd2f6314d14387f45c2ca36905b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 43008, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "491ea027fe915421c2c388460048bda1909bbb2fe234f26aa650cce8f2b1f5f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index d5d5dde2c6..2a8f53e1ec 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 046da07114..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "41f673542e895bf56edb8e6a137febf789c28a9da5b4693a1065490a62336656", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 43520, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..94f8f302d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7235ff4fd4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..f2eef37457 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 6ce75e81b2..5f22b90628 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 7b2c5ab8de..aa851bad5e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1b29a972364a81e3844504157096f1a0ca2164836cee9758c885f562921d6f0c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..307d8cda4e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..65d5835012 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..83ae9f44b8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index a43f1de3b1..0703f1c3c5 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ac8df07cd5..441977fcda 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "96e931c980f9bd1b0d7ba209973d637dae985113c68e23d0476ea6a3789b77f4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 13056, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..146ca2148f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index bdcde11a55..625ebad0aa 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 3c5b94ec80..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3a4dedb1720cc3cc439e59bf26afb541bb9591e17d932fe1696e4ac46fe1c376", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 16896, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 36a56dbea1..1cf1324d04 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index a6f0809dde..5f9f3d73c6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "f22922a7294924d71ca6c72a6b4ac34c07ff79ccf09d45e9fea0fcec2660ee0c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "8442b59cd54bfc72a3bed8dd9aacc04807eeff7f628d351975e80daeeb8c07b2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a582768be3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..227d75eb8e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "304af16ef6752d5164f9f17fd233db9fa50ab36dfb098ed207eeded7ff62fe2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8704, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1d87be86c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..4a266b4bdb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7ab0ef739d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..14bfdc0ba7 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "b01f43ede2d3f0f3d7058a795400af47f6d1e9602413dc8926d63d2c7056c74a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 36864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7b2dc1c3ec Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..447c6a6175 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "7526c8a1b2ea0dd2646354a0a3fc36c41ff4b1e46d1f24d4994f03ce10cbfd50", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4484a8d7ff Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..1a94f54cc4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "66c96ad140c5c0362b52113637538dabdb72593f77e280d7d9894f3e565863b8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index b5e58ce97c..29d777447c 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 9f7f4d9500..e720580e2d 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "7ee2132b54aabbbef6a1f5cc7a99ad94d8c6ee8420e5a6fb8702168c0df06a5d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "952567c0aad2f93638b1a9f6c7b73712b04bbcec87dcb455d62aba552ea88c23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97adaa1c6a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..741b678759 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "5cf1d8f50dfd5ca23bd5a74ef61af0b11ac8a7954942d1cf721029569da9db61", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..f7aff5c000 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c737064a11 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f7afa37c6f7c464bac61859ac2a2eb1845baf4a665c09d03dfb371135b442ed5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..89db920df9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..8264c424c9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ce178e99a4c7c1317d9d74002f85df6166b0ce8be2e81a0278015f94a98f8568", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 18432, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..7bca47bc78 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..8d6754c157 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "5f77dc9bedad300205059b17200cec0a92128347a2e1163f97475d1ac61f36d0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 4240460535..e05feb3d94 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 5373477c35..d9e06080a5 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1007fa9b77c1c41ab0d7f1875b4474e4c8e58481c2f80bfcfbe0ee0131caa0e5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "2ca8102db9b04514810267d567d9111a674640efa166eda7ac964f2c9b62e741", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..a68caf2419 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..129f6593aa --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "b8c2841a693dc9b73f1792c733bccb56ed63fbad8b5b0818edf2ab6ab963dadd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..2b1e91002c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..fa4dfc7269 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "762edd1695475aecc55e561d12419f63581b9f2ba898b37a2b1b23bf316fc823", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..b8395db679 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..a88a1a528c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "e22eb2008573c5f836861cb1efaf8f54828bffbb6f2c6f1e1e1f96374e36c9c7", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index ec1d731f5a..5d30e79cfc 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index b0be146e7f..a64d2bad03 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "e9a4e058b4c9508aa7b4c8c5c8ff9bba7f3a3c069f2492dbac912115e7a4108a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 77824, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f0ea00fc779e1822790bc4801a6bc5b8ca6c6859ecdeca7d368233cff1b06c66", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index d5d5dde2c6..2a8f53e1ec 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 046da07114..b13e41cf45 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "41f673542e895bf56edb8e6a137febf789c28a9da5b4693a1065490a62336656", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 43520, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d3f3b6944e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..94f8f302d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..d854a26d44 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..18518c7114 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..ddfa86a175 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..630d0cf7be --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..878752918d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7235ff4fd4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..db028776cd Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..f2eef37457 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index 6ce75e81b2..5f22b90628 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 7b2c5ab8de..aa851bad5e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "1b29a972364a81e3844504157096f1a0ca2164836cee9758c885f562921d6f0c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e1bb487e35 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..307d8cda4e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..43952c536a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..65d5835012 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..9e5e0d0b33 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..83ae9f44b8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..e06ba3b5e8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..dfcc4c6fe5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index a43f1de3b1..0703f1c3c5 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index ac8df07cd5..441977fcda 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "96e931c980f9bd1b0d7ba209973d637dae985113c68e23d0476ea6a3789b77f4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 13056, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..bf18531344 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..146ca2148f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..97fc49e9ef Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..7180f472b1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco new file mode 100644 index 0000000000..4474a4cbba Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json new file mode 100644 index 0000000000..c02d8df2eb --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -0,0 +1 @@ +{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco index bdcde11a55..625ebad0aa 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json index 3c5b94ec80..56bed44a0c 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json @@ -1 +1 @@ -{"hash": "3a4dedb1720cc3cc439e59bf26afb541bb9591e17d932fe1696e4ac46fe1c376", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "AMDGCN_USE_BUFFER_OPS": "true", "TRITON_HIP_USE_ASYNC_COPY": "true", "TRITON_HIP_USE_BLOCK_PINGPONG": "true", "triton_version": "3.5.0", "shared": 16896, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/fused_mxfp4_quant.py b/aiter/ops/triton/fused_mxfp4_quant.py index a2b0b5d6e3..141bf6d2fe 100644 --- a/aiter/ops/triton/fused_mxfp4_quant.py +++ b/aiter/ops/triton/fused_mxfp4_quant.py @@ -1,6 +1,7 @@ import torch import triton import triton.language as tl +from typing import Optional from aiter.ops.triton._triton_kernels.fused_mxfp4_quant import ( _rmsmorm_op, @@ -13,20 +14,22 @@ def fused_rms_mxfp4_quant( - inp1, - inp1_weight, - inp1_epsilon, - inp2=None, - inp2_weight=None, - inp2_epsilon=0.0, - res1=None, + x1: torch.Tensor, + x1_weight: torch.Tensor, + x1_epsilon: float, + x2: Optional[torch.Tensor] = None, + x2_weight: Optional[torch.Tensor] = None, + x2_epsilon: float = 0.0, + res1: Optional[torch.Tensor] = None, + shuffle: Optional[bool] = False, + scale_shuffle_padding: Optional[bool] = False, ): """ This op contains several steps: - 1. if res1 is not None, inp1 = inp1 + res1, and store inp1 to out_res1 - 2. perform RMS norm along the last dimenion for inp1 - 3. if inp2 is not None, perform RMS norm along the last dimenion for inp2 - 4. perform mxfp4 quantization for inp1 only + 1. if res1 is not None, x1 = x1 + res1, and store x1 to out_res1 + 2. perform RMS norm along the last dimenion for x1 + 3. if x2 is not None, perform RMS norm along the last dimenion for x2 + 4. perform mxfp4 quantization for x1 only Key parameters: - x: Matrix X with shape (M, N1, N2). @@ -37,84 +40,97 @@ def fused_rms_mxfp4_quant( - out2: The output matrix with shape (M, N2). - out_res1: The output matrix with shape (M, N1). - if both inp2 and res1 provided, return (out1_fp4, out1_bs), out2, out_res1 - if inp2 provided, return (out1_fp4, out1_bs), out2 + if both x2 and res1 provided, return (out1_fp4, out1_bs), out2, out_res1 + if x2 provided, return (out1_fp4, out1_bs), out2 if res1 provided, return (out1_fp4, out1_bs), out_res1 - if both inp2 and res1 not provided, return (out1_fp4, out1_bs) + if both x2 and res1 not provided, return (out1_fp4, out1_bs) """ - _LOGGER.info(f"FUSED_RMS_MXFP4_QUANT: inp1={tuple(inp1.shape)}") + _LOGGER.info(f"FUSED_RMS_MXFP4_QUANT: inp1={tuple(x1.shape)}") + MXFP4_QUANT_BLOCK_SIZE = 32 - M, N1 = inp1.shape - BLOCK_SIZE = max(triton.next_power_of_2(N1), MXFP4_QUANT_BLOCK_SIZE) - if inp2 is not None: - N2 = inp2.shape[1] - BLOCK_SIZE = max(triton.next_power_of_2(N2), BLOCK_SIZE) + M, N1 = x1.shape + BLOCK_SIZE_N = max(triton.next_power_of_2(N1), MXFP4_QUANT_BLOCK_SIZE) + BLOCK_SIZE_N2 = 1 + if x2 is not None: + N2 = x2.shape[1] + BLOCK_SIZE_N2 = triton.next_power_of_2(N2) else: N2 = 0 # as we merge 2 fp4s to 1 uint8 assert N1 % 2 == 0 - - BLOCK_SIZE = max(BLOCK_SIZE, MXFP4_QUANT_BLOCK_SIZE) - out1_fp4 = torch.empty((M, N1 // 2), dtype=torch.uint8, device=inp1.device) + BLOCK_SIZE_M = 1 + # BLOCK_SIZE_M = 32 + BLOCK_SIZE_N = max(BLOCK_SIZE_N, MXFP4_QUANT_BLOCK_SIZE) + out1_fp4 = torch.empty((M, N1 // 2), dtype=torch.uint8, device=x1.device) + SCALE_N_valid = triton.cdiv(N1, MXFP4_QUANT_BLOCK_SIZE) + use_scale_shuffle_padding = shuffle or scale_shuffle_padding + if use_scale_shuffle_padding: + SCALE_M = triton.cdiv(M, 256) * 256 + SCALE_N = triton.cdiv(SCALE_N_valid, 8) * 8 + # BLOCK_SIZE_M = triton.cdiv(BLOCK_SIZE_M, 32) * 32 + BLOCK_SIZE_N = triton.cdiv(BLOCK_SIZE_N, 32) * 32 + else: + SCALE_M = M + SCALE_N = SCALE_N_valid out1_bs = torch.empty( - ((N1 + MXFP4_QUANT_BLOCK_SIZE - 1) // MXFP4_QUANT_BLOCK_SIZE, M), + (SCALE_M, SCALE_N), dtype=torch.uint8, - device=inp1.device, - ).T + device=x1.device, + ) out_res1 = None - res1_row_stride = 0 - out_res1_row_stride = 0 + res1_stride_m = 0 + out_res1_stride_m = 0 if res1 is not None: - out_res1 = torch.empty((M, N1), dtype=inp1.dtype, device=inp1.device) - res1_row_stride = res1.stride(0) - out_res1_row_stride = out_res1.stride(0) + out_res1 = torch.empty((M, N1), dtype=x1.dtype, device=x1.device) + res1_stride_m = res1.stride(0) + out_res1_stride_m = out_res1.stride(0) out2 = None - out2_row_stride = 0 - inp2_row_stride = 0 - if inp2 is not None: - out2 = torch.empty((M, N2), dtype=inp1.dtype, device=inp1.device) - inp2_row_stride = inp2.stride(0) - out2_row_stride = out2.stride(0) - - _fused_rms_mxfp4_quant_kernel[(M,)]( - inp1, - inp1_weight, - inp2, - inp2_weight, + out2_stride_m = 0 + x2_stride_m = 0 + if x2 is not None: + out2 = torch.empty((M, N2), dtype=x1.dtype, device=x1.device) + x2_stride_m = x2.stride(0) + out2_stride_m = out2.stride(0) + + grid = (triton.cdiv(M, BLOCK_SIZE_M) * (2 if (x2 is not None) else 1),) + _fused_rms_mxfp4_quant_kernel[grid]( + x1, + x1_weight, + x2, + x2_weight, res1, out1_fp4, out1_bs, out2, out_res1, - inp1_epsilon, - inp2_epsilon, + x1_epsilon, + x2_epsilon, M, N1, N2, - inp1.stride(0), - inp2_row_stride, - res1_row_stride, + x1.stride(0), + x2_stride_m, + res1_stride_m, out1_fp4.stride(0), *out1_bs.stride(), - out2_row_stride, - out_res1_row_stride, - BLOCK_SIZE=BLOCK_SIZE, + out2_stride_m, + out_res1_stride_m, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_N2=BLOCK_SIZE_N2, MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE, - SKIP_SECOND_INPUT=(inp2 is None), + HAS_SECOND_INPUT=(x2 is not None), FIRST_INPUT_RES=(res1 is not None), + SCALE_N=SCALE_N_valid, + SCALE_M_PAD=(SCALE_M if use_scale_shuffle_padding else 1), + SCALE_N_PAD=SCALE_N, + SHUFFLE=shuffle, + SHUFFLE_PAD=use_scale_shuffle_padding, ) - if res1 is not None: - if inp2 is None: - return (out1_fp4, out1_bs), out_res1 - else: - return (out1_fp4, out1_bs), out2, out_res1 - else: - if inp2 is None: - return (out1_fp4, out1_bs) - else: - return (out1_fp4, out1_bs), out2 + + return (out1_fp4, out1_bs), out2, out_res1 def fused_flatten_mxfp4_quant( diff --git a/aiter/ops/triton/gemm_afp4wfp4.py b/aiter/ops/triton/gemm_afp4wfp4.py index 4011501965..ef5b97c615 100644 --- a/aiter/ops/triton/gemm_afp4wfp4.py +++ b/aiter/ops/triton/gemm_afp4wfp4.py @@ -329,6 +329,7 @@ def gemm_afp4wfp4_preshuffled_weight_scales( dtype: Optional[float] = torch.bfloat16, y: Optional[torch.Tensor] = None, config: Optional[dict] = None, + use_aot: Optional[bool] = True, ): """ Computes the matmul Y = X x W @@ -407,7 +408,7 @@ def gemm_afp4wfp4_preshuffled_weight_scales( if M < 32 and M_POW2 > 16: M_POW2 = 16 metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" - if os.path.exists(metadata_pth): + if use_aot and os.path.exists(metadata_pth): with AOTMetadataContext( _gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__, f"{metadata_pth}", diff --git a/op_tests/triton_tests/test_activation.py b/op_tests/triton_tests/test_activation.py index e011a0c7a7..be97c8a56d 100644 --- a/op_tests/triton_tests/test_activation.py +++ b/op_tests/triton_tests/test_activation.py @@ -2,8 +2,9 @@ import torch.nn.functional as F import pytest from .test_quant_mxfp4 import torch_dynamic_mxfp4_quant -from .test_gemm_afp4wfp4 import shuffle_scales +from .test_gemm_afp4wfp4 import shuffle_scales, un_shuffle_scales from aiter.ops.triton.activation import act_mul_and_mxfp4_quant +import aiter.ops.triton.utils._triton.arch_info as arch_info DEBUG_MODE = False @@ -20,7 +21,9 @@ def pad_tensor_2d(tensor, mult_m=256, mult_n=8): return padded_tensor -def torch_act_mul_and_mxfp4_quant(input: torch.Tensor, activation: str) -> torch.Tensor: +def torch_act_mul_and_mxfp4_quant( + input: torch.Tensor, activation: str, shuffle: bool +) -> torch.Tensor: """ The fused kernel casts the original input to float32 and does all the arithmetic and bit operations in float32. @@ -34,12 +37,31 @@ def torch_act_mul_and_mxfp4_quant(input: torch.Tensor, activation: str) -> torch out = F.gelu(x) * y else: out = F.gelu(x, approximate="tanh") * y - return torch_dynamic_mxfp4_quant(out) + out, out_scale = torch_dynamic_mxfp4_quant(out) + if shuffle: + # out_scale_pad = out_scale + M = out_scale.shape[0] + N = out.shape[1] * 2 + scaleM = (M + 255) // 256 * 256 + scaleN_valid = (N + 31) // 32 + scaleN = (scaleN_valid + 7) // 8 * 8 + out_scale_pad = torch.empty( + (scaleM, scaleN), dtype=out_scale.dtype, device=out_scale.device + ) + out_scale_pad[:M, :scaleN] = out_scale[:M, :scaleN] + out_scale = shuffle_scales(out_scale_pad) + out_scale = out_scale.view(out_scale.shape[0] * 32, -1) + return out, out_scale @pytest.mark.parametrize( "M, N", [ + (512, 57344), + (504, 57344), + (1, 57344), + (4, 57344), + (32, 8192), (1, 4), (1, 28), (1, 32), @@ -66,34 +88,52 @@ def torch_act_mul_and_mxfp4_quant(input: torch.Tensor, activation: str) -> torch @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @pytest.mark.parametrize("shuffle", [False, True]) -def test_act_mul_and_mxfp4_quant(M: int, N: int, dtype, activation: str, shuffle: bool): - # TODO: extend tests to different shapes with proper padding - if shuffle and (M % 256 != 0 or N % 512 != 0): - pytest.skip() +@pytest.mark.parametrize("scale_shuffle_padding", [False, True]) +def test_act_mul_and_mxfp4_quant( + M: int, N: int, dtype, activation: str, shuffle: bool, scale_shuffle_padding: bool +): - torch.manual_seed(20) + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") - torch.cuda.empty_cache() # Helps avoid hangs in large tests + if shuffle and N % 512 != 0: + pytest.skip() + torch.manual_seed(20) x = torch.randn((M, N), dtype=dtype, device="cuda") if DEBUG_MODE: print(f"x.shape={x.shape} x={x}") triton_out, triton_scale = act_mul_and_mxfp4_quant( - x, activation=activation, shuffle=shuffle + x, + activation=activation, + shuffle=shuffle, + scale_shuffle_padding=scale_shuffle_padding, ) if DEBUG_MODE: print(f"triton_out.shape={triton_out.shape} triton_out={triton_out}") print(f"triton_scale.shape={triton_scale.shape} triton_scale={triton_scale}") - torch_out, torch_scale = torch_act_mul_and_mxfp4_quant(x, activation=activation) + torch_out, torch_scale = torch_act_mul_and_mxfp4_quant( + x, activation=activation, shuffle=shuffle + ) + if shuffle: - torch_scale = shuffle_scales(torch_scale) - triton_scale = triton_scale.reshape(triton_scale.shape[0] // 32, -1) + triton_scale = un_shuffle_scales( + triton_scale.view(triton_scale.shape[0] // 32, -1) + ) + torch_scale = un_shuffle_scales( + torch_scale.view(torch_scale.shape[0] // 32, -1) + ) + if DEBUG_MODE: print(f"torch_out.shape={torch_out.shape} torch_out={torch_out}") print(f"torch_scale.shape={torch_scale.shape} torch_scale={torch_scale}") + scaleN_valid = (N // 2 + 31) // 32 + triton_scale = triton_scale[:M, :scaleN_valid] + torch_scale = torch_scale[:M, :scaleN_valid] + torch.testing.assert_close(triton_out, torch_out) torch.testing.assert_close(triton_scale, torch_scale) diff --git a/op_tests/triton_tests/test_fused_mxfp4_quant.py b/op_tests/triton_tests/test_fused_mxfp4_quant.py index fef6438bcf..8c66a8aa26 100644 --- a/op_tests/triton_tests/test_fused_mxfp4_quant.py +++ b/op_tests/triton_tests/test_fused_mxfp4_quant.py @@ -10,6 +10,7 @@ e8m0_to_f32, SCALE_GROUP_SIZE, ) +from op_tests.triton_tests.test_gemm_afp4wfp4 import shuffle_scales, un_shuffle_scales torch.manual_seed(0) @@ -22,21 +23,39 @@ def rmsnorm(input, weight, eps=1e-6): return rms_norm -def calculate_target_w_torch(mat1, rms1_w, resid1, mat2, rms2_w, eps=1e-6): - orig_dtype = mat1.dtype - mat1 = mat1.to(torch.float32) +def calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, eps=1e-6, shuffle=False): + orig_dtype = x1.dtype + x1 = x1.to(torch.float32) rms1_w = rms1_w.to(torch.float32) - mat2 = mat2.to(torch.float32) - rms2_w = rms2_w.to(torch.float32) res1_out = None if resid1 is not None: resid1 = resid1.to(torch.float32) - mat1 = res1_out = mat1 + resid1 + x1 = res1_out = x1 + resid1 res1_out = res1_out.to(orig_dtype) - mat1 = rmsnorm(mat1, rms1_w, eps) - mat2 = rmsnorm(mat2, rms2_w, eps).to(orig_dtype) - q_fp4, q_scales = torch_dynamic_mxfp4_quant(mat1) - return (q_fp4, q_scales), mat2, res1_out + x1 = rmsnorm(x1, rms1_w, eps) + out1_fp4, out1_scale = torch_dynamic_mxfp4_quant(x1) + + out2 = None + if x2 is not None: + x2 = x2.to(torch.float32) + rms2_w = rms2_w.to(torch.float32) + out2 = rmsnorm(x2, rms2_w, eps).to(orig_dtype) + + if shuffle: + out1_scale_pad = out1_scale + M = out1_scale.shape[0] + N = x1.shape[1] + scaleM = (M + 255) // 256 * 256 + scaleN_valid = (N + 31) // 32 + scaleN = (scaleN_valid + 7) // 8 * 8 + out1_scale_pad = torch.empty( + (scaleM, scaleN), dtype=out1_scale.dtype, device=out1_scale.device + ) + out1_scale_pad[:M, :scaleN_valid] = out1_scale[:M, :scaleN_valid] + out1_scale = shuffle_scales(out1_scale_pad) + out1_scale = out1_scale.view(out1_scale.shape[0] * 32, -1) + + return (out1_fp4, out1_scale), out2, res1_out def convert_mxfp4_to_fp32(x, x_scales): @@ -48,25 +67,28 @@ def convert_mxfp4_to_fp32(x, x_scales): def generate_fused_rms_quant_data( - mat1_shape=(32, 1536), - mat1_stride=(2112, 1), - mat2_shape=(32, 512), - mat2_stride=(2112, 1), - residual=False, + x1_shape=(32, 1536), + x1_stride=(2112, 1), + x2_shape=(32, 512), + x2_stride=(2112, 1), + inp2=False, + res1=False, dtype=torch.bfloat16, ): - mat1 = torch.randn((mat1_shape[0], mat1_stride[0]), dtype=dtype, device="cuda") - mat1 = mat1[:, : mat1_shape[1]] - - mat2 = torch.randn((mat2_shape[0], mat2_stride[0]), dtype=dtype, device="cuda") - mat2 = mat2[:, : mat2_shape[1]] - - rms1_w = torch.randn(mat1.shape[1], dtype=dtype, device="cuda") - rms2_w = torch.randn(mat2.shape[1], dtype=dtype, device="cuda") + x1 = torch.randn((x1_shape[0], x1_stride[0]), dtype=dtype, device="cuda") + x1 = x1[:, : x1_shape[1]] + x2 = None + rms2_w = None + if inp2: + x2 = torch.randn((x2_shape[0], x2_stride[0]), dtype=dtype, device="cuda") + x2 = x2[:, : x2_shape[1]] + rms2_w = torch.randn(x2.shape[1], dtype=dtype, device="cuda") + + rms1_w = torch.randn(x1.shape[1], dtype=dtype, device="cuda") resid1 = None - if residual: - resid1 = torch.randn_like(mat1, dtype=dtype, device="cuda") - return mat1, mat2, rms1_w, rms2_w, resid1 + if res1: + resid1 = torch.randn_like(x1, dtype=dtype, device="cuda") + return x1, x2, rms1_w, rms2_w, resid1 @pytest.mark.parametrize("B", [1, 4, 16, 32, 1000, 10000]) @@ -85,54 +107,81 @@ def test_flatten_quant(B: int, M: int, N: int, dtype): torch.testing.assert_close(triton_out, torch_out) -@pytest.mark.parametrize("B", [1, 32, 256]) -@pytest.mark.parametrize("M", [128, 132, 2112]) -@pytest.mark.parametrize("N", [32, 96]) -@pytest.mark.parametrize("stride", [2112]) -@pytest.mark.parametrize("skip_second", [True, False]) -@pytest.mark.parametrize("residual", [True, False]) +@pytest.mark.parametrize( + "M, N1, N2, stride", + [ + (M, N1, N2, stride) + for M in [1, 4, 33, 64, 132, 256] # TODO: debug for 131072 + for N1, N2, stride in [ + (200, 200, 200), + (256, 256, 256), + (256, 256, 2112), + ] + ], +) +@pytest.mark.parametrize("inp2", [True, False]) +@pytest.mark.parametrize("res1", [True, False]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("shuffle", [True, False]) +@pytest.mark.parametrize("scale_shuffle_padding", [True, False]) def test_fused_rms_quant( - B: int, M: int, N: int, stride: int, skip_second: bool, residual: bool, dtype + M: int, + N1: int, + N2: int, + stride: int, + inp2: bool, + res1: bool, + dtype, + shuffle: bool, + scale_shuffle_padding: bool, ): torch.cuda.empty_cache() # Helps avoid hangs in large tests - - mat1, mat2, rms1_w, rms2_w, resid1 = generate_fused_rms_quant_data( - mat1_shape=(B, M), - mat2_shape=(B, N), - mat1_stride=(stride, 1), - mat2_stride=(stride, 1), - residual=residual, + x1, x2, rms1_w, rms2_w, resid1 = generate_fused_rms_quant_data( + x1_shape=(M, N1), + x2_shape=(M, N2), + x1_stride=(stride, 1), + x2_stride=(stride, 1), + inp2=inp2, + res1=res1, dtype=dtype, ) - (mat1_fp4_torch, mat1_scales_torch), mat2_torch, res1_out_torch = ( - calculate_target_w_torch(mat1, rms1_w, resid1, mat2, rms2_w) + (x1_fp4_torch, x1_scales_torch), x2_torch, res1_out_torch = ( + calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, shuffle=shuffle) + ) + + (x1_fp4_triton, x1_scales_triton), x2_triton, res1_out_triton = ( + fused_rms_mxfp4_quant( + x1, + rms1_w, + 1e-6, + x2, + rms2_w, + 1e-6, + resid1, + shuffle=shuffle, + scale_shuffle_padding=scale_shuffle_padding, + ) ) - if not skip_second: - if not residual: - (mat1_fp4_triton, mat1_scales_triton), mat2_triton = fused_rms_mxfp4_quant( - mat1, rms1_w, 1e-6, mat2, rms2_w, 1e-6, resid1 - ) - else: - (mat1_fp4_triton, mat1_scales_triton), mat2_triton, res1_out_triton = ( - fused_rms_mxfp4_quant(mat1, rms1_w, 1e-6, mat2, rms2_w, 1e-6, resid1) - ) - else: - if not residual: - (mat1_fp4_triton, mat1_scales_triton) = fused_rms_mxfp4_quant( - mat1, rms1_w, 1e-6, None, None, None, None - ) - else: - (mat1_fp4_triton, mat1_scales_triton), res1_out_triton = ( - fused_rms_mxfp4_quant(mat1, rms1_w, 1e-6, None, None, None, resid1) - ) - if not skip_second: - torch.testing.assert_close(mat2_torch, mat2_triton) - - if residual: + + if shuffle: + x1_scales_triton = un_shuffle_scales( + x1_scales_triton.view(x1_scales_triton.shape[0] // 32, -1) + ) + x1_scales_torch = un_shuffle_scales( + x1_scales_torch.view(x1_scales_torch.shape[0] // 32, -1) + ) + + scaleN_valid = (N1 + 31) // 32 + x1_scales_triton = x1_scales_triton[:M, :scaleN_valid] + x1_scales_torch = x1_scales_torch[:M, :scaleN_valid] + + if x2_triton is not None: + torch.testing.assert_close(x2_torch, x2_triton) + + if res1_out_triton is not None: torch.testing.assert_close(res1_out_torch, res1_out_triton) - res_fp32_torch = convert_mxfp4_to_fp32(mat1_fp4_torch, mat1_scales_torch) - res_fp32_triton = convert_mxfp4_to_fp32(mat1_fp4_triton, mat1_scales_triton) + res_fp32_torch = convert_mxfp4_to_fp32(x1_fp4_torch, x1_scales_torch) + res_fp32_triton = convert_mxfp4_to_fp32(x1_fp4_triton, x1_scales_triton) torch.testing.assert_close(res_fp32_torch, res_fp32_triton) diff --git a/op_tests/triton_tests/test_gemm_afp4wfp4.py b/op_tests/triton_tests/test_gemm_afp4wfp4.py index 1ae27efbe8..7f79d2c536 100644 --- a/op_tests/triton_tests/test_gemm_afp4wfp4.py +++ b/op_tests/triton_tests/test_gemm_afp4wfp4.py @@ -22,6 +22,17 @@ def shuffle_scales(scales: torch.Tensor): return scales_shuffled +def un_shuffle_scales(scales_shuffled: torch.Tensor): + scales = scales_shuffled.clone() + sm, sn = scales.shape + scales = scales.view(sm * 32, sn // 32) + sm, sn = scales.shape + scales = scales.view(sm // 32, sn // 8, 4, 16, 2, 2, 1) + scales = scales.permute(0, 5, 3, 1, 4, 2, 6).contiguous() + scales = scales.view(sm, sn) + return scales + + # Note this is specified by the HW and cannot be changed. SCALE_GROUP_SIZE = 32 @@ -67,10 +78,7 @@ def generate_gemm_afp4wfp4_inputs( w = w_low | w_high << 4 # Scale of 1.0 in e8m0, bias 127. - if M >= 32 and shuffle_scales_fg: - M_pad = (M + 255) // 256 * 256 - else: - M_pad = M + M_pad = (M + 255) // 256 * 256 x_scales = torch.randint( 124, 128, (K // SCALE_GROUP_SIZE, M_pad), dtype=torch.uint8, device="cuda" ) @@ -162,6 +170,10 @@ def get_x_vals(): x_vals += [(v, 16384, 53248) for v in [1, 8, 16, 32, 64, 128, 256]] x_vals += [(v, 18432, 16384) for v in [1, 8, 16, 32, 64, 128, 256]] x_vals += [(v, 16384, 16384) for v in [1, 8, 16, 32, 64, 128, 256]] + x_vals += [(v, 10240, 8192) for v in [1, 2, 4, 8, 16, 32, 64]] + x_vals += [(v, 8192, 8192) for v in [1, 2, 4, 8, 16, 32, 64]] + x_vals += [(v, 57344, 8192) for v in [1, 2, 4, 8, 16, 32, 64]] + x_vals += [(v, 8192, 28672) for v in [1, 2, 4, 8, 16, 32, 64]] x_vals += [(1, 1, 32)] # minimal case return x_vals @@ -269,11 +281,22 @@ def test_gemm_afp4_wfp4( if shuffle_scales_fg and shuffle_weight_fg: if output: triton_out = gemm_afp4wfp4_preshuffled_weight_scales( - x, w_triton, x_scales_triton, w_scales_triton, dtype, y + x, + w_triton, + x_scales_triton, + w_scales_triton, + dtype, + y, + use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) else: triton_out = gemm_afp4wfp4_preshuffled_weight_scales( - x, w_triton, x_scales_triton, w_scales_triton, dtype + x, + w_triton, + x_scales_triton, + w_scales_triton, + dtype, + use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) elif shuffle_scales_fg and not shuffle_weight_fg: if output: