Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
dda7cfd
support implicit broadcast
borontion Aug 15, 2025
fef6e35
expose async copy
borontion Aug 15, 2025
70872f5
add async load example
borontion Aug 15, 2025
8648892
add buffer load example
borontion Aug 15, 2025
bf42f72
update comments
borontion Aug 15, 2025
c7dc02c
fix tensor shape
borontion Aug 15, 2025
684dfae
expose other value for async copy
borontion Aug 15, 2025
50216d1
add async annotation
borontion Aug 15, 2025
775f90b
set random seed
borontion Aug 15, 2025
e706668
add comments
borontion Aug 15, 2025
8b04206
exclude async operations to cdna4
borontion Aug 16, 2025
3a2a10c
rename builtins
borontion Aug 16, 2025
8d32602
update constraints load to shared
borontion Aug 16, 2025
1197945
remove 2d constraint
borontion Aug 16, 2025
4293ae3
introduce relaxed shared memory descriptor
borontion Aug 16, 2025
670885b
update comments
borontion Aug 16, 2025
c62c073
update comments
borontion Aug 16, 2025
ca7fd35
update tests
borontion Aug 16, 2025
9f8cb11
update
borontion Aug 16, 2025
e014f6e
clean up
borontion Aug 16, 2025
140c1e0
address comments
borontion Aug 17, 2025
2db0ff1
fmt
borontion Aug 17, 2025
fb0a794
update comments
borontion Aug 17, 2025
816b780
address comments
borontion Aug 17, 2025
7501534
update description
borontion Aug 17, 2025
c306a94
update description
borontion Aug 17, 2025
0bcc33b
update description
borontion Aug 17, 2025
142ff86
address comments
borontion Aug 18, 2025
88090e0
update tests
borontion Aug 18, 2025
5ebd479
fix
borontion Aug 18, 2025
ac510fd
update description
borontion Aug 18, 2025
e7a6ba3
update description
borontion Aug 18, 2025
483ca39
update description
borontion Aug 18, 2025
41887b4
update description
borontion Aug 18, 2025
6e2aee8
update description
borontion Aug 18, 2025
ea3b653
Merge branch 'main' into amd-gluon-to-shared
borontion Aug 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions python/src/gluon_ir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,11 @@ void init_gluon_ir(py::module &&m) {
})
.def("create_async_copy_global_to_local",
[](GluonOpBuilder &self, Value smem, Value pointer, Value mask,
tt::CacheModifier cacheModifier,
Value other, tt::CacheModifier cacheModifier,
tt::EvictionPolicy evictionPolicy, bool isVolatile) {
self.create<ttg::AsyncCopyGlobalToLocalOp>(
pointer, smem, mask,
/*other*/ Value{}, cacheModifier, evictionPolicy, isVolatile);
pointer, smem, mask, other, cacheModifier, evictionPolicy,
isVolatile);
})
.def("create_async_copy_mbarrier_arrive",
[](GluonOpBuilder &self, Value mbarrier, bool incrementCount) {
Expand Down
38 changes: 38 additions & 0 deletions python/test/gluon/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import torch
import pytest
import re

import triton
import triton.language as tl
Expand All @@ -10,6 +11,7 @@
from triton.experimental.gluon.language.nvidia.ampere import async_copy, mbarrier
from triton.experimental.gluon.language.nvidia.hopper import tma, fence_async_shared
from triton.experimental.gluon.language.nvidia import hopper
from triton.experimental.gluon.language.amd.cdna4 import async_copy as cdna4_async_copy
from triton.experimental.gluon.language.extra import libdevice


Expand Down Expand Up @@ -149,6 +151,42 @@ def test_warpgroup_mma(ASYNC):
torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)


@pytest.mark.skipif(not is_hip_cdna4(), reason="Requires CDNA4")
@pytest.mark.parametrize("use_buffer_load", [True, False])
def test_amd_direct_load_to_shared(use_buffer_load):

@gluon.jit
def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

smem = ttgl.allocate_shared_memory(a_ptr.dtype.element_ty, [128, 16], shared)
offsets = ttgl.arange(0, 128, layout=ttgl.SliceLayout(1, blocked))[:, None] * 16 + \
ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, blocked))[None, :]
if use_buffer_load:
cdna4_async_copy.buffer_load_to_shared(smem, a_ptr, offsets)
else:
cdna4_async_copy.global_load_to_shared(smem, a_ptr + offsets)

cdna4_async_copy.async_wait(0)
a = cdna4_async_copy.load_shared_relaxed(smem, blocked)

ttgl.store(b_ptr + offsets, a)

torch.manual_seed(0)
a = torch.randn((128, 16), dtype=torch.float16, device='cuda')
b = torch.empty_like(a)
pgm = kernel[(1, )](a, b, use_buffer_load)

torch.testing.assert_close(a, b)
assert re.search(r'ttg\.local_load .* \{ttg\.amdgpu\.syncedViaAsyncWait = true\}', pgm.asm['ttgir'], re.MULTILINE)
if use_buffer_load:
assert re.search(r"buffer_load.*lds$", pgm.asm['amdgcn'], re.MULTILINE)
else:
assert re.search(r"global_load_lds", pgm.asm['amdgcn'], re.MULTILINE)
assert 'vmcnt(0)' in pgm.asm['amdgcn']


@pytest.mark.parametrize("M, N, K", [(32, 32, 16), (16, 16, 32)])
@pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
@pytest.mark.parametrize("num_warps", [4, 8])
Expand Down
239 changes: 231 additions & 8 deletions python/test/gluon/test_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from triton.experimental.gluon.language.nvidia.blackwell import mbarrier, tma, TensorMemoryLayout, async_copy
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
from triton.experimental.gluon.language.amd import _layouts as amd_layouts
from triton.experimental.gluon.language.amd.cdna4 import async_copy as cdna4_async_copy
from triton.experimental.gluon.language.extra import libdevice

from triton._filecheck import filecheck_test, run_parser
Expand Down Expand Up @@ -1590,7 +1591,175 @@ def test_infer_layout_for_amd_mfma(target):
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_amd_load_shared_relaxed(target):

@gluon.jit
def kernel():
blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

smem = ttgl.allocate_shared_memory(ttgl.float16, [128, 16], shared)
cdna4_async_copy.load_shared_relaxed(smem, blocked)

mod = run_parser(kernel, target=target)
expecttest.assert_expected_inline(
anonymize_ir(mod.str_nodebug()), """\
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @kernel() attributes {noinline = false} {
%0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
%1 = ttg.local_load %0 {ttg.amdgpu.syncedViaAsyncWait = true} : !ttg.memdesc<128x16xf16, #shared, #smem, mutable> -> tensor<128x16xf16, #blocked>
tt.return
}
}
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_amd_load_shared_relaxed_in_loop(target):

@gluon.jit
def kernel():
blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

smem = ttgl.allocate_shared_memory(ttgl.float16, [128, 16], shared)
for i in range(10):
cdna4_async_copy.load_shared_relaxed(smem, blocked)

mod = run_parser(kernel, target=target)
expecttest.assert_expected_inline(
anonymize_ir(mod.str_nodebug()), """\
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @kernel() attributes {noinline = false} {
%0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
%c0_i32 = arith.constant 0 : i32
%c10_i32 = arith.constant 10 : i32
%c1_i32 = arith.constant 1 : i32
%1 = arith.bitcast %c0_i32 : i32 to i32
%2 = arith.bitcast %c10_i32 : i32 to i32
%3 = arith.bitcast %c1_i32 : i32 to i32
%4 = ub.poison : i32
scf.for %arg0 = %1 to %2 step %3 : i32 {
%5 = ttg.local_load %0 {ttg.amdgpu.syncedViaAsyncWait = true} : !ttg.memdesc<128x16xf16, #shared, #smem, mutable> -> tensor<128x16xf16, #blocked>
}
tt.return
}
}
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_amd_global_load_to_shared(target):

@gluon.jit
def kernel(ptr):
blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

smem = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [128, 16], shared)
offsets = ttgl.arange(0, 128, layout=ttgl.SliceLayout(1, blocked))[:, None] * 16 + \
ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, blocked))[None, :]

cdna4_async_copy.global_load_to_shared(smem, ptr + offsets)
cdna4_async_copy.async_wait(0)

ptr = MockTensor(ttgl.float16)
mod = run_parser(kernel, *make_args(ptr), target=target)
expecttest.assert_expected_inline(
anonymize_ir(mod.str_nodebug()), """\
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
%1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked>
%c16_i32 = arith.constant 16 : i32
%c16_i32_0 = arith.constant 16 : i32
%cst = arith.constant dense<16> : tensor<128x1xi32, #blocked>
%3 = arith.muli %2, %cst : tensor<128x1xi32, #blocked>
%4 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
%5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
%6 = tt.broadcast %3 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked>
%7 = tt.broadcast %5 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked>
%8 = arith.addi %6, %7 : tensor<128x16xi32, #blocked>
%9 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x16x!tt.ptr<f16>, #blocked>
%10 = tt.addptr %9, %8 : tensor<128x16x!tt.ptr<f16>, #blocked>, tensor<128x16xi32, #blocked>
%11 = ttg.async_copy_global_to_local %10, %0 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
%12 = ttg.async_wait {num = 0 : i32}
tt.return
}
}
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_amd_global_load_to_shared_with_broadcast(target):

@gluon.jit
def kernel(ptr):
blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

smem = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [128, 16], shared)
y_offset = ttgl.arange(0, 128, layout=ttgl.SliceLayout(1, blocked))
x_offset = ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, blocked))
offsets = y_offset[:, None] * 16 + x_offset[None, :]

mask = (y_offset < 64)[:, None]
other = tl.cast(0.0, ptr.dtype.element_ty)

cdna4_async_copy.global_load_to_shared(smem, ptr + offsets, mask, other)
cdna4_async_copy.async_wait(0)

ptr = MockTensor(ttgl.float16)
mod = run_parser(kernel, *make_args(ptr), target=target)
expecttest.assert_expected_inline(
anonymize_ir(mod.str_nodebug()), """\
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
%1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
%3 = tt.expand_dims %1 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked>
%c16_i32 = arith.constant 16 : i32
%c16_i32_0 = arith.constant 16 : i32
%cst = arith.constant dense<16> : tensor<128x1xi32, #blocked>
%4 = arith.muli %3, %cst : tensor<128x1xi32, #blocked>
%5 = tt.expand_dims %2 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
%6 = tt.broadcast %4 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked>
%7 = tt.broadcast %5 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked>
%8 = arith.addi %6, %7 : tensor<128x16xi32, #blocked>
%c64_i32 = arith.constant 64 : i32
%cst_1 = arith.constant dense<64> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%9 = arith.cmpi slt, %1, %cst_1 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi1, #blocked>
%cst_2 = arith.constant 0.000000e+00 : f32
%11 = arith.truncf %cst_2 : f32 to f16
%12 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x16x!tt.ptr<f16>, #blocked>
%13 = tt.addptr %12, %8 : tensor<128x16x!tt.ptr<f16>, #blocked>, tensor<128x16xi32, #blocked>
%14 = tt.broadcast %10 : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked>
%15 = tt.splat %11 : f16 -> tensor<128x16xf16, #blocked>
%16 = ttg.async_copy_global_to_local %13, %0 mask %14 other %15 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
%17 = ttg.async_wait {num = 0 : i32}
tt.return
}
}
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_buffer_load_to_shared(target):

@gluon.jit
Expand All @@ -1601,7 +1770,7 @@ def kernel(ptr):
dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [256], shared)
offsets = ttgl.arange(0, 256, layout=blocked)

ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets)
cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets)

ptr = MockTensor(ttgl.float32)
mod = run_parser(kernel, *make_args(ptr), target=target)
Expand All @@ -1621,7 +1790,61 @@ def kernel(ptr):
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_buffer_load_to_shared_with_broadcast(target):

@gluon.jit
def kernel(ptr):
blocked1: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 64], [4, 1], [1, 0])
shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])

dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [4, 64], shared)

y_index = ttgl.arange(0, 4, layout=ttgl.SliceLayout(1, blocked1))
x_index = ttgl.arange(0, 64, layout=ttgl.SliceLayout(0, blocked1))
offsets = y_index[:, None] * 64 + x_index[None, :]

mask = (y_index < 2)[:, None]
other = 0.0

cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, mask, other)

ptr = MockTensor(ttgl.float32)
mod = run_parser(kernel, *make_args(ptr), target=target)
expecttest.assert_expected_inline(
anonymize_ir(mod.str_nodebug()), """\
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%0 = ttg.local_alloc : () -> !ttg.memdesc<4x64xf32, #shared, #smem, mutable>
%1 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
%3 = tt.expand_dims %1 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked>
%c64_i32 = arith.constant 64 : i32
%c64_i32_0 = arith.constant 64 : i32
%cst = arith.constant dense<64> : tensor<4x1xi32, #blocked>
%4 = arith.muli %3, %cst : tensor<4x1xi32, #blocked>
%5 = tt.expand_dims %2 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
%6 = tt.broadcast %4 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked>
%7 = tt.broadcast %5 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked>
%8 = arith.addi %6, %7 : tensor<4x64xi32, #blocked>
%c2_i32 = arith.constant 2 : i32
%cst_1 = arith.constant dense<2> : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%9 = arith.cmpi slt, %1, %cst_1 : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
%10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<4xi1, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi1, #blocked>
%cst_2 = arith.constant 0.000000e+00 : f32
%11 = tt.broadcast %10 : tensor<4x1xi1, #blocked> -> tensor<4x64xi1, #blocked>
%cst_3 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked>
%12 = amdgpu.buffer_load_to_local %arg0[%8] mask = %11 other = %cst_3 into %0 : <f32>[tensor<4x64xi32, #blocked>] tensor<4x64xf32, #blocked> -> <4x64xf32, #shared, #smem, mutable>
tt.return
}
}
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_buffer_load_to_shared_mask_other(target):

@gluon.jit
Expand All @@ -1634,7 +1857,7 @@ def kernel(ptr):

mask = ttgl.full([256], 1, ttgl.int1, layout=blocked)
other = ttgl.full([256], 0, ptr.dtype.element_ty, layout=blocked)
ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, mask, other)
cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, mask, other)

ptr = MockTensor(ttgl.float32)
mod = run_parser(kernel, *make_args(ptr), target=target)
Expand All @@ -1658,7 +1881,7 @@ def kernel(ptr):
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
def test_buffer_load_to_shared_cache_mods(target):

@gluon.jit
Expand All @@ -1669,9 +1892,9 @@ def kernel(ptr):
dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [256], shared)
offsets = ttgl.arange(0, 256, layout=blocked)

ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".ca")
ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cg")
ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cv")
cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".ca")
cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cg")
cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cv")

ptr = MockTensor(ttgl.float32)
mod = run_parser(kernel, *make_args(ptr), target=target)
Expand Down
Loading
Loading