Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions python/test/unit/intel/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,60 @@ def test_has_opencl_extension_error(device):
# Pass an invalid device_id (out of range) to trigger error
with pytest.raises(RuntimeError, match="Device is not found"):
driver.active.utils.has_opencl_extension(device_count, b"cl_khr_fp16")


@pytest.mark.parametrize("grf_mode, expect_retry, expect_fail",
[("default", True, False), # Should auto-retry with large GRF and succeed
("256", False, False), # Explicit large GRF — compiles on first attempt
("128", False, True), # Explicit small GRF — should fail, no retry
])
@pytest.mark.parametrize("generate_native_code", [False, True], ids=["load_binary", "make_zebin"])
def test_auto_grf_on_build_failure(device, monkeypatch, capfd, grf_mode, expect_retry, expect_fail,
generate_native_code):
"""Test GRF mode behavior for register-heavy kernels on both compilation paths:
- load_binary (generate_native_code=False): L0 runtime compilation via zeModuleCreate
- make_zebin (generate_native_code=True): offline compilation via ocloc
"""
monkeypatch.setenv("TRITON_DEBUG", "1")

@triton.jit
def _register_heavy_kernel(
output_ptr,
input_ptr,
q_ptr,
size,
BLOCK: tl.constexpr,
):
off = tl.arange(0, BLOCK)
mask = off < size
x = tl.load(input_ptr + off, mask=mask, other=0.0)
q = tl.load(q_ptr + off, mask=mask, other=float("-inf"))
result = tl.argmax(x / q, axis=-1)
tl.store(output_ptr, result)

BLOCK = 131072 # Large enough to exceed PTSS with default/small GRF
size = 128000

x = torch.randn(size, dtype=torch.float32, device=device)
q = torch.rand(size, dtype=torch.float32, device=device)
out = torch.empty(1, dtype=torch.int32, device=device)

if expect_fail:
with pytest.raises(RuntimeError):
_register_heavy_kernel[(1, )](out, x, q, size, BLOCK=BLOCK, grf_mode=grf_mode,
generate_native_code=generate_native_code)
else:
_register_heavy_kernel[(1, )](out, x, q, size, BLOCK=BLOCK, grf_mode=grf_mode,
generate_native_code=generate_native_code)

outs = capfd.readouterr().out
if expect_retry and not generate_native_code:
# load_binary path prints a retry message to stdout.
assert "retrying with large GRF mode" in outs or "recompiling the kernel using large GRF mode" in outs
elif expect_retry and generate_native_code:
# make_zebin path retries silently via ocloc — no stdout message.
# Success without exception is sufficient verification.
pass
else:
assert "retrying with large GRF mode" not in outs
assert "Build failed" not in outs
37 changes: 27 additions & 10 deletions third_party/intel/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,16 +495,33 @@ def make_zebin(cls, src, metadata, options):
ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
except subprocess.CalledProcessError as e:
if e.returncode == 255:
error = 'Internal Triton ZEBIN codegen error'
elif e.returncode == 128 + signal.SIGSEGV:
error = '`ocloc` raised SIGSEGV'
else:
error = f'`ocloc` failed with error code {e.returncode}'

raise RuntimeError(f'{error}\n'
f'`ocloc` stderr:\n{e.output}\n'
f'Repro command: {ocloc_cmd}\n') from e
# If GRF mode was not explicitly set, retry with large GRF mode
# before giving up. This handles cases where the default GRF mode
# doesn't provide enough registers (e.g., scratch space exceeds
# HW PTSS limit).
retry_succeeded = False
if options.grf_mode == 'default' and \
"-cl-intel-256-GRF-per-thread" not in metadata.get("build_flags", ""):
metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
try:
subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
retry_succeeded = True
except subprocess.CalledProcessError:
# Retry also failed — raise the original error below.
pass

if not retry_succeeded:
if e.returncode == 255:
error = 'Internal Triton ZEBIN codegen error'
elif e.returncode == 128 + signal.SIGSEGV:
error = '`ocloc` raised SIGSEGV'
else:
error = f'`ocloc` failed with error code {e.returncode}'

raise RuntimeError(f'{error}\n'
f'`ocloc` stderr:\n{e.output}\n'
f'Repro command: {ocloc_cmd}\n') from e

with open(fbin, 'rb') as f:
zebin = f.read()
Expand Down
49 changes: 46 additions & 3 deletions third_party/intel/backend/driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,12 +272,55 @@ extern "C" EXPORT_FUNC PyObject *load_binary(PyObject *args) {
auto [l0_module, l0_kernel, n_spills] =
compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device,
l0_context, build_flags(), is_spv);
if (PyErr_Occurred()) {
return NULL;
}

const bool debugEnabled = getBoolEnv("TRITON_DEBUG");

// If the initial compilation failed entirely (e.g., scratch space exceeds
// HW limit), and GRF mode was not explicitly set, retry with large GRF mode.
// This handles cases where the default GRF mode doesn't provide enough
// registers, causing the backend compiler to fail.
if (PyErr_Occurred() && is_spv && !build_flags.hasGRFSizeFlag()) {
// Save the original error before clearing it for the retry attempt.
PyObject *orig_type, *orig_value, *orig_tb;
PyErr_Fetch(&orig_type, &orig_value, &orig_tb);

if (debugEnabled)
std::cout << "(I): Build failed for \"" << kernel_name
<< "\", retrying with large GRF mode" << std::endl;

build_flags.addLargeGRFSizeFlag();

auto [l0_module_retry, l0_kernel_retry, n_spills_retry] =
compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device,
l0_context, build_flags(), is_spv);
if (PyErr_Occurred()) {
// Retry also failed — propagate the original error.
PyErr_Restore(orig_type, orig_value, orig_tb);
return NULL;
}

// Retry succeeded — discard the saved original error.
Py_XDECREF(orig_type);
Py_XDECREF(orig_value);
Py_XDECREF(orig_tb);

l0_module = l0_module_retry;
l0_kernel = l0_kernel_retry;
n_spills = n_spills_retry;

// Always print recovery message to stderr to follow up on the
// "L0 build module failed" error that was already printed.
std::cerr << "(I): Build failure recovered by retrying with large GRF "
Copy link
Copy Markdown
Contributor Author

@Egor-Krivov Egor-Krivov Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I write to stderr because stderr at this point already contains error message from IGC and we need to tell the user that issue is probably fixed

"mode for \""
<< kernel_name << "\"" << std::endl;

if (debugEnabled)
std::cout << "(I): Retry with large GRF succeeded, kernel has "
<< n_spills << " spills" << std::endl;
} else if (PyErr_Occurred()) {
return NULL;
}

if (is_spv) {
constexpr int32_t max_reg_spill = 1000;
const bool is_GRF_mode_specified = build_flags.hasGRFSizeFlag();
Expand Down