Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions python/test/unit/language/test_compile_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,57 @@ def fp8_convert(src, dst):
src = ASTSource(fn=fp8_convert, signature={"src": "*fp32", "dst": "*fp8e5"}, constexprs={})
triton.compile(src, target=GPUTarget("cuda", 90, 32))
triton.compile(src, target=GPUTarget("cuda", 80, 32))


def test_sm_arch_from_capability():
    """sm_arch_from_capability must emit the right arch string per capability.

    Consumer Blackwell (sm_120, e.g. RTX 5070 Ti) must NOT get the "a"
    suffix. Using sm_120a causes LLVM/ptxas to generate tensor memory
    instructions that don't exist on consumer hardware, producing runtime
    segfaults.
    """
    from triton.backends.nvidia.compiler import sm_arch_from_capability
    expected = {
        80: "sm_80",  # pre-Hopper: no suffix
        89: "sm_89",  # pre-Hopper: no suffix
        90: "sm_90a",  # Hopper datacenter: "a" suffix
        100: "sm_100a",  # Blackwell datacenter: "a" suffix
        120: "sm_120",  # consumer Blackwell: NO suffix (RTX 5070 Ti/5080/5090)
    }
    for capability, arch in expected.items():
        assert sm_arch_from_capability(capability) == arch


def test_compile_only_sm120() -> None:
    """Compile a tl.dot kernel for sm_120 (consumer Blackwell).

    Uses a matmul kernel (not just elementwise code) to exercise the dot
    pipeline and confirm that tensor memory / tcgen05 instructions are
    NOT generated for consumer Blackwell, which lacks tensor memory.
    """

    @triton.jit
    def simple_dot(a_base, b_base, out):
        SIZE: tl.constexpr = 64
        rows = tl.arange(0, SIZE)[:, None] * SIZE
        cols = tl.arange(0, SIZE)[None, :]
        a = tl.load(a_base + rows + cols)
        b = tl.load(b_base + rows + cols)
        acc = tl.dot(a, b)
        tl.store(out + rows + cols, acc)

    src = triton.compiler.ASTSource(
        fn=simple_dot,
        signature={"a_base": "*fp16", "b_base": "*fp16", "out": "*fp16"},
        constexprs={},
    )
    compiled = triton.compile(src, target=GPUTarget("cuda", 120, 32))

    ptx = compiled.asm["ptx"]
    # Must target sm_120 (no "a" suffix).
    assert ".target sm_120" in ptx
    assert ".target sm_120a" not in ptx
    # Key assertion — sm_100 dot lowers to tcgen05/tmem; sm_120 must not,
    # because consumer Blackwell has no tensor memory.
    assert "tcgen05" not in ptx
    ttgir = str(compiled.asm["ttgir"])
    assert "ttng.tmem_alloc" not in ttgir
    assert "ttng.tc_gen5_mma" not in ttgir
    assert compiled.asm["cubin"] != b""
11 changes: 9 additions & 2 deletions third_party/nvidia/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,15 @@ def file_hash(path):


def sm_arch_from_capability(capability: int):
    """Map an integer SM capability (e.g. 90, 120) to a ptxas arch string.

    The "a" suffix enables arch-accelerated features only available on
    specific GPU implementations:
        sm_90a  — Hopper datacenter (H100, H200)
        sm_100a — Blackwell datacenter (B100, B200)
    Consumer Blackwell (sm_120, e.g. RTX 5070 Ti/5080/5090) does NOT
    have tensor memory — using sm_120a causes invalid codegen (tensor
    memory instructions that don't exist on consumer hardware), leading
    to runtime segfaults, so it must get no suffix.
    """
    # Exclude the whole consumer Blackwell family (sm_120 and sm_121),
    # not just 120, so sm_121 devices don't hit the same invalid-codegen
    # path via an "a"-suffixed target.
    is_consumer_blackwell = capability in (120, 121)
    suffix = "a" if capability >= 90 and not is_consumer_blackwell else ""
    return f"sm_{capability}{suffix}"


Expand Down
Loading