Skip to content
Closed
4 changes: 4 additions & 0 deletions python/test/unit/language/test_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ def test_mxfp(BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim, NUM_WARPS, device)
pytest.skip("Scaled mxfp8 matmul is only natively supported on CDNA4")
if (nonKDim == 16 and BLOCK_K < 128) or (nonKDim == 32 and BLOCK_K < 64):
pytest.skip(f"CDNA4 does not support {BLOCK_K=} for scaled mfma {nonKDim=} variants")
if (BLOCK_M == 256 or BLOCK_N == 256) and BLOCK_K == 256:
pytest.skip("Config requires too much shared memory")

if BLOCK_N == 256 and BLOCK_K == 256:
NUM_STAGES = min(NUM_STAGES, 2)
Expand Down Expand Up @@ -1156,6 +1158,8 @@ def test_mxfp8_mxfp4_matmul(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, B_TR
pytest.skip(f"CDNA4 does not support {BLOCK_K=} for scaled mfma {nonKDim=} variants")
if (A_DATA_TYPE == 'float4' and not WITH_A_SCALE) or (B_DATA_TYPE == 'float4' and not WITH_B_SCALE):
pytest.skip("Float4 without scale is tested in test_block_scale_fp4")
if (BLOCK_M == 256 or BLOCK_N == 256) and BLOCK_K == 256:
pytest.skip("Config requires too much shared memory")
if not PACK_B_ALONG_K and B_DATA_TYPE != "float4":
pytest.skip("Pack along K can only be False for float4")
if BLOCK_N == 256 and BLOCK_K == 256:
Expand Down
2 changes: 1 addition & 1 deletion python/triton/knobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,8 +513,8 @@ class amd_knobs(base_knobs):
# We use strs so that we can have a default value based on other runtime info
use_block_pingpong: env_opt_bool = env_opt_bool("TRITON_HIP_USE_BLOCK_PINGPONG")
use_in_thread_transpose: env_opt_bool = env_opt_bool("TRITON_HIP_USE_IN_THREAD_TRANSPOSE")
use_async_copy: env_opt_bool = env_opt_bool("TRITON_HIP_USE_ASYNC_COPY")

use_async_copy: env_bool = env_bool("TRITON_HIP_USE_ASYNC_COPY")
scalarize_packed_fops: env_bool = env_bool("AMDGCN_SCALARIZE_PACKED_FOPS")


Expand Down
28 changes: 28 additions & 0 deletions test/TritonGPU/amd/amd-update-async-wait-count.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -487,3 +487,31 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
tt.return
}
}

// -----

// Test an scf.if that has no else region in the def chain.

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: scf_if_without_else
tt.func public @scf_if_without_else(%arg1: !ttg.memdesc<128x16xf16, #shared, #smem, mutable>, %arg3: tensor<128x16x!tt.ptr<f16>, #blocked> {tt.divisibility = dense<[16, 16]> : tensor<2xi32>, tt.contiguity = dense<[16, 16]> : tensor<2xi32>}, %cond: i1) {
// Emits 1 direct-to-LDS instruction; this is the commit group the final wait refers to.
%0 = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
%1 = ttg.async_commit_group tokens %0

// For an scf.if without an else region, the missing else path contributes
// 0 instructions, so the minimum across both paths is min(1, 0) = 0.
scf.if %cond {
// Emits 1 direct-to-LDS instruction inside the then region.
%inner = ttg.async_copy_global_to_local %arg3, %arg1 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
%inner_commit = ttg.async_commit_group tokens %inner
}

// CHECK: amdg.async_wait {{.*}} {num_inst = 0
%10 = ttg.async_wait %1 {num = 0 : i32}
tt.return
}
}
10 changes: 7 additions & 3 deletions third_party/amd/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@ def get_min_dot_size(target: GPUTarget):


def is_pingpong_schedule_enabled(arch, use_async_copy):
    """Decide whether the block ping-pong schedule is used for ``arch``.

    An explicit ``knobs.amd.use_block_pingpong`` setting (any value other
    than ``None``) is returned unchanged. When the knob is ``None`` the
    schedule is enabled for ``gfx942`` unconditionally, and for ``gfx950``
    only when ``use_async_copy`` is ``True``.
    """
    if knobs.amd.use_block_pingpong is not None:
        return knobs.amd.use_block_pingpong
    # The identity test against True is intentional: any other value of
    # use_async_copy (including None) leaves gfx950 disabled.
    return arch == "gfx942" or (arch == "gfx950" and use_async_copy is True)


def is_in_thread_transpose_enabled(arch):
    """Decide whether in-thread transpose is used for ``arch``.

    An explicit ``knobs.amd.use_in_thread_transpose`` setting (any value
    other than ``None``) is returned unchanged; otherwise the feature is
    enabled only for ``gfx942``.
    """
    if knobs.amd.use_in_thread_transpose is not None:
        return knobs.amd.use_in_thread_transpose
    return arch == "gfx942"


def is_async_copy_enabled(arch):
    """Decide whether async copy is used for ``arch``.

    An explicit ``knobs.amd.use_async_copy`` setting (any value other than
    ``None``) is returned unchanged; otherwise the feature is enabled for
    ``gfx950`` and ``gfx1250``.
    """
    if knobs.amd.use_async_copy is not None:
        return knobs.amd.use_async_copy
    return arch in ("gfx950", "gfx1250")


@dataclass(frozen=True)
class HIPOptions:
num_warps: int = 4
Expand Down Expand Up @@ -227,7 +231,7 @@ def make_ttgir(mod, metadata, options):
passes.ttir.add_triton_licm(pm)
passes.common.add_canonicalizer(pm)

use_async_copy = knobs.amd.use_async_copy
use_async_copy = is_async_copy_enabled(options.arch)
use_block_pingpong = is_pingpong_schedule_enabled(options.arch, use_async_copy)

amd.passes.ttgpuir.add_schedule_loops(pm, options.num_stages)
Expand Down
8 changes: 5 additions & 3 deletions third_party/amd/lib/TritonAMDGPUTransforms/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ int deduceMinCountBetweeOps(Operation *beginOp, Operation *endOp,
int count = 0;
for (auto op = beginOp; op != endOp; op = op->getNextNode()) {
if (auto ifOp = llvm::dyn_cast<scf::IfOp>(op)) {
assert(!ifOp.getThenRegion().empty() && !ifOp.getElseRegion().empty());
assert(!ifOp.getThenRegion().empty());
auto minThen =
deduceMinCountInBlock(ifOp.getThenRegion().front(), countFunc);
auto minElse =
deduceMinCountInBlock(ifOp.getElseRegion().front(), countFunc);
int minElse = 0;
if (!ifOp.getElseRegion().empty())
minElse =
deduceMinCountInBlock(ifOp.getElseRegion().front(), countFunc);
count += std::min(minThen, minElse);
} else if (auto forOp = llvm::dyn_cast<scf::ForOp>(op)) {
if (std::optional<APInt> tripCount = forOp.getStaticTripCount()) {
Expand Down
Loading