
Commit 7442326

[GPU][AMD] Add support for address space reads (#67)
This commit enables the AMD GPU process to read from different address spaces. We support the following address spaces:

* generic - flat address space
* region - GDS address space (not supported on MI300/MI350)
* local - threadgroup shared memory address space
* private_lane - thread local address space
* private_wave - wave local address space

These address spaces correspond to the address spaces exposed for DWARF expressions. In order to read from the private_lane address space we need to keep the lane as part of the ThreadAMDGPU object. This is a straightforward modification to the existing code that creates the per-lane threads. Tests were added to exercise each of the exposed address spaces.
1 parent 70617ca commit 7442326
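For reference, a minimal sketch of the read pattern the new tests exercise: an SBAddressSpec combines an address, an address space name, and (for the per-lane spaces) a thread, and SBProcess.ReadMemoryFromSpec performs the read. The helper function below is illustrative only, not part of this commit; it assumes the GPU process is already stopped at a breakpoint.

import lldb

def read_from_address_space(gpu_process, thread, address, address_space, size):
    # Illustrative helper (assumed, not from this commit): read `size` bytes at
    # `address` within `address_space` ("generic", "region", "local",
    # "private_lane", or "private_wave") for the given stopped thread, using
    # the SBAddressSpec / ReadMemoryFromSpec API exercised by the tests below.
    addr_spec = lldb.SBAddressSpec(address, address_space, thread)
    error = lldb.SBError()
    data = gpu_process.ReadMemoryFromSpec(addr_spec, size, error)
    if not error.Success():
        raise RuntimeError(f"read from '{address_space}' failed: {str(error)}")
    return data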

File tree

10 files changed: +423 -13 lines changed


lldb/packages/Python/lldbsuite/test/tools/gpu/amdgpu_testcase.py

Lines changed: 39 additions & 7 deletions
@@ -2,17 +2,19 @@
 import lldb
 from lldbsuite.test import lldbutil
 from lldbsuite.test.lldbtest import line_number
+from typing import List
 
 
 class AmdGpuTestCaseBase(GpuTestCaseBase):
     """
     Class that should be used by all python AMDGPU tests.
     """
+
     NO_DEBUG_INFO_TESTCASE = True
 
     def run_to_gpu_breakpoint(
-        self, source, gpu_bkpt_pattern, cpu_bkpt_pattern
-    ):
+        self, source: str, gpu_bkpt_pattern: str, cpu_bkpt_pattern: str
+    ) -> List[lldb.SBThread]:
         """Run the test executable until it hits the provided GPU breakpoint.
         The CPU breakpoint is used as a stopping point to switch to the GPU target
         and set the GPU breakpoint on that target.
@@ -24,6 +26,12 @@ def run_to_gpu_breakpoint(
         )
         self.assertEqual(cpu_target, self.cpu_target)
 
+        gpu_bkpt_id = self.set_gpu_source_breakpoint(source, gpu_bkpt_pattern)
+
+        return self.continue_to_gpu_breakpoint(gpu_bkpt_id)
+
+    def set_gpu_source_breakpoint(self, source: str, gpu_bkpt_pattern: str) -> int:
+        """Set a breakpoint on the gpu target. Returns the breakpoint id."""
         # Switch to the GPU target so we can set a breakpoint.
         self.assertTrue(self.gpu_target.IsValid())
         self.select_gpu()
@@ -32,23 +40,47 @@ def run_to_gpu_breakpoint(
         # This might not yet resolve to a location so use -2 to not check
         # for the number of locations.
         line = line_number(source, gpu_bkpt_pattern)
-        gpu_bkpt = lldbutil.run_break_set_by_file_and_line(
+        return lldbutil.run_break_set_by_file_and_line(
             self, source, line, num_expected_locations=-2, loc_exact=False
         )
 
+    def continue_to_gpu_breakpoint(self, gpu_bkpt_id: int) -> List[lldb.SBThread]:
+        """Continues execution on the cpu and gpu until we hit the gpu breakpoint."""
         # Need to run these commands asynchronously to be able to switch targets.
         self.setAsync(True)
         listener = self.dbg.GetListener()
 
         # Continue the GPU process.
         self.runCmd("c")
-        lldbutil.expect_state_changes(self, listener, self.gpu_process, [lldb.eStateRunning])
+        lldbutil.expect_state_changes(
+            self, listener, self.gpu_process, [lldb.eStateRunning]
+        )
 
         # Continue the CPU process.
         self.select_cpu()
         self.runCmd("c")
-        lldbutil.expect_state_changes(self, listener, self.cpu_process, [lldb.eStateRunning])
+        lldbutil.expect_state_changes(
+            self, listener, self.cpu_process, [lldb.eStateRunning]
+        )
 
         # GPU breakpoint should get hit.
-        lldbutil.expect_state_changes(self, listener, self.gpu_process, [lldb.eStateStopped])
-        return lldbutil.get_threads_stopped_at_breakpoint_id(self.gpu_process, gpu_bkpt)
+        lldbutil.expect_state_changes(
+            self, listener, self.gpu_process, [lldb.eStateStopped]
+        )
+        return lldbutil.get_threads_stopped_at_breakpoint_id(
+            self.gpu_process, gpu_bkpt_id
+        )
+
+    def continue_to_gpu_source_breakpoint(
+        self, source: str, gpu_bkpt_pattern: str
+    ) -> List[lldb.SBThread]:
+        """
+        Sets a gpu breakpoint set by source regex gpu_bkpt_pattern, continues the process, and deletes the breakpoint again.
+        Otherwise the same as `continue_to_gpu_breakpoint`.
+        Inspired by lldbutil.continue_to_source_breakpoint.
+        """
+        gpu_bkpt_id = self.set_gpu_source_breakpoint(source, gpu_bkpt_pattern)
+        gpu_threads = self.continue_to_gpu_breakpoint(gpu_bkpt_id)
+        self.gpu_target.BreakpointDelete(gpu_bkpt_id)
+
+        return gpu_threads
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+HIP_SOURCES := aspace.hip
+
+include Makefile.rules
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
+"""
+Address space tests for the AMDGPU plugin.
+"""
+
+import lldb
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+from amdgpu_testcase import *
+
+SOURCE = "aspace.hip"
+
+# The following addresses are offsets from the start of the lane private stack
+# for the location where these kernel local variables are stored. Note that
+# these addresses depend on the stack layout by the compiler. They seem to be
+# pretty consistent across rocm versions, but be aware that these private stack
+# addresses could change.
+SIZE_STACK_OFFSET = 0x20
+IDX_STACK_OFFSET = 0x24
+
+# The wave size is the number of lanes in a wave.
+WAVE_SIZE = 64
+
+
+class Location:
+    """Helper class to describe a location in memory with an expected value"""
+
+    def __init__(self, name, expected_value, size_in_bytes, address):
+        self.name = name
+        self.address = address
+        if isinstance(expected_value, list):
+            self.expected_bytes = b"".join(
+                [n.to_bytes(size_in_bytes, "little") for n in expected_value]
+            )
+            self.size_in_bytes = size_in_bytes * len(expected_value)
+        else:
+            self.expected_bytes = expected_value.to_bytes(size_in_bytes, "little")
+            self.size_in_bytes = size_in_bytes
+
+
+class AddressSpaceAmdGpuTestCase(AmdGpuTestCaseBase):
+    def validate_memory_read(
+        self, address_space: str, loc: Location, thread: lldb.SBThread
+    ):
+        """Helper function to validate memory read from an address space"""
+        addr_spec = lldb.SBAddressSpec(loc.address, address_space, thread)
+        error = lldb.SBError()
+        data = self.gpu_process.ReadMemoryFromSpec(addr_spec, loc.size_in_bytes, error)
+        self.assertTrue(
+            error.Success(),
+            f"{loc.name} reading from address space '{address_space}' failed: {str(error)}",
+        )
+        self.assertEqual(
+            data,
+            loc.expected_bytes,
+            f"Data for Location {loc.name} does not match expected value",
+        )
+
+    def validate_read_address_from_global_variable(
+        self, address_space: str, location: Location
+    ):
+        """Helper function to check we can read from address space using an address from a global variable.
+        The global variable lookup is done on the cpu side and its value is added to the current location
+        address to allow easy offsetting from a global variable that represents an array.
+        """
+        self.build()
+
+        lldbutil.run_to_source_breakpoint(
+            self, "// CPU BREAKPOINT - BEFORE LAUNCH", lldb.SBFileSpec(SOURCE)
+        )
+
+        # Find the variable in the CPU target.
+        var = self.cpu_target.FindFirstGlobalVariable(location.name)
+        self.assertTrue(
+            var.IsValid(),
+            f"{location.name} variable should be valid in the CPU target",
+        )
+
+        # Get the address stored in the variable (it's a pointer to device memory).
+        addr = var.GetValueAsUnsigned()
+        self.assertNotEqual(addr, 0, f"{location.name} address should not be null")
+        location.address += addr
+
+        # Continue executing to the gpu breakpoint.
+        gpu_threads = self.continue_to_gpu_source_breakpoint(
+            SOURCE,
+            "// GPU BREAKPOINT",
+        )
+        self.assertNotEqual(None, gpu_threads, "GPU should be stopped at breakpoint")
+
+        # Switch back to GPU to read from generic address space
+        self.select_gpu()
+
+        self.validate_memory_read(address_space, location, gpu_threads[0])
+
+    def run_to_first_gpu_breakpoint(self):
+        """Helper to run to common gpu breakpoint"""
+        self.build()
+
+        gpu_threads = self.run_to_gpu_breakpoint(
+            SOURCE, "// GPU BREAKPOINT", "// CPU BREAKPOINT - BEFORE LAUNCH"
+        )
+        self.assertNotEqual(None, gpu_threads, "GPU should be stopped at breakpoint")
+        return gpu_threads
+
+    def test_generic(self):
+        """Test reading from the generic address space."""
+        # Read from generic address space using the address stored in the device_output pointer.
+        # The device_output variable is a pointer to device visible memory.
+        # We expect the element at index 1 to be 2 (output[1] = shared_mem[1] = 1 * 2 = 2).
+        location = Location(
+            "device_output",
+            expected_value=2,
+            size_in_bytes=4,
+            address=4,  # Offset address by 4 bytes to get index 1.
+        )
+        self.validate_read_address_from_global_variable("generic", location)
+
+    def test_region(self):
+        """Test that we fail to read from the region address space. It is not supported on this architecture (MI300/MI350)."""
+        self.run_to_first_gpu_breakpoint()
+
+        addr_spec = lldb.SBAddressSpec(0, "region")
+        error = lldb.SBError()
+        self.gpu_process.ReadMemoryFromSpec(addr_spec, 1, error)
+
+        self.assertFalse(error.Success(), "Read from region address space should fail")
+        self.assertEqual(
+            "AMD_DBGAPI_STATUS_ERROR: AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT_COMPATIBILITY",
+            error.GetCString(),
+        )
+
+    def test_local(self):
+        """Test that we can read from local memory."""
+        gpu_threads = self.run_to_first_gpu_breakpoint()
+
+        # Check that we can read local memory locations.
+        # We expect the element at index 3 to be 6 (shared_mem[3] = 3 * 2 = 6).
+        location = Location("shared_mem", expected_value=6, size_in_bytes=4, address=12)
+        self.validate_memory_read("local", location, gpu_threads[0])
+
+    def test_private_lane(self):
+        """Test that we can read from the private_lane address space."""
+        gpu_threads = self.run_to_first_gpu_breakpoint()
+
+        # Check that we can read private_lane memory Locations.
+        # These locations map to local variables on the private stack memory for
+        # each lane. The `size` value is the input parameter to the kernel and
+        # the `idx` value is the local variable in the kernel that stores the
+        # threadIdx.x value. We check the `idx` variable twice since it should
+        # have different values for lane 0 and lane 1.
+        checks = [
+            (
+                Location(
+                    "size",
+                    expected_value=WAVE_SIZE,
+                    size_in_bytes=4,
+                    address=SIZE_STACK_OFFSET,
+                ),
+                gpu_threads[0],
+            ),
+            (
+                Location(
+                    "idx", expected_value=0, size_in_bytes=4, address=IDX_STACK_OFFSET
+                ),
+                gpu_threads[0],
+            ),
+            (
+                Location(
+                    "idx", expected_value=1, size_in_bytes=4, address=IDX_STACK_OFFSET
+                ),
+                gpu_threads[1],
+            ),
+        ]
+
+        for location, thread in checks:
+            self.validate_memory_read("private_lane", location, thread)
+
+    def test_private_wave(self):
+        """Test that we can read from the private_wave address space."""
+
+        # The private_wave address space has the unswizzled values for each lane.
+        # This makes it easy for the debugger to read the value of one variable
+        # for each lane as consecutive memory locations.
+        #
+        # The location below describes the first 3 lane values for the idx variable.
+        # The memory offset is calculated by finding the offset of the idx variable
+        # for a lane and then multiplying by the wave size, since the unswizzled
+        # memory is laid out with the values for each lane in consecutive memory.
+        addr = IDX_STACK_OFFSET * WAVE_SIZE
+
+        gpu_threads = self.run_to_first_gpu_breakpoint()
+        location = Location(
+            "idx[0:3]", expected_value=[0, 1, 2], size_in_bytes=4, address=addr
+        )
+        self.validate_memory_read("private_wave", location, gpu_threads[0])
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+#include <cstdio>
+#include <hip/hip_runtime.h>
+
+constexpr int error_exit_code = -1;
+#define HIP_CHECK(condition)                                                   \
+  {                                                                            \
+    const hipError_t error = condition;                                        \
+    if (error != hipSuccess) {                                                 \
+      printf("An error encountered: \"%s\""                                    \
+             " at %s:%d",                                                      \
+             hipGetErrorString(error), __FILE__, __LINE__);                    \
+      std::exit(error_exit_code);                                              \
+    }                                                                          \
+  }
+
+constexpr int NUM_THREADS = 64;
+constexpr int SHARED_MEM_SIZE = NUM_THREADS;
+__global__ void address_space_demo_kernel(int *output, int size) {
+  // Private variable (per-thread)
+  int idx = threadIdx.x;
+
+  // Shared memory (local to thread block)
+  // Initialize shared memory by first thread in block
+  __shared__ int shared_mem[SHARED_MEM_SIZE];
+  if (threadIdx.x == 0) {
+    for (int i = 0; i < SHARED_MEM_SIZE; i++) {
+      shared_mem[i] = i * 2;
+    }
+  }
+
+  __syncthreads();
+
+  if (idx < size) {
+    output[idx] = shared_mem[idx];
+  } // GPU BREAKPOINT
+}
+
+int *host_output = nullptr;
+int *device_output = nullptr;
+
+int main() {
+  // Allocate and initialize host buffer.
+  size_t output_size_in_bytes = NUM_THREADS * sizeof(int);
+  host_output = (int*)malloc(output_size_in_bytes);
+  memset(host_output, 0, output_size_in_bytes);
+
+  HIP_CHECK(
+      hipMalloc(&device_output, NUM_THREADS * sizeof(int))); // CPU BREAKPOINT - BEFORE LAUNCH
+  // Launch kernel with 1 block of NUM_THREADS threads
+  hipLaunchKernelGGL(address_space_demo_kernel, dim3(1), dim3(NUM_THREADS), 0,
+                     0, device_output, NUM_THREADS);
+  HIP_CHECK(hipDeviceSynchronize());
+  HIP_CHECK(hipMemcpy(host_output, device_output, NUM_THREADS * sizeof(int),
+                      hipMemcpyDeviceToHost));
+  // Print some results
+  for (int i = 0; i < 10; i++) {
+    printf("output[%d] = %d\n", i, host_output[i]);
+  }
+  HIP_CHECK(hipFree(device_output));
+  free(host_output);
+  return 0;
+}
