Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 39 additions & 7 deletions lldb/packages/Python/lldbsuite/test/tools/gpu/amdgpu_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
import lldb
from lldbsuite.test import lldbutil
from lldbsuite.test.lldbtest import line_number
from typing import List


class AmdGpuTestCaseBase(GpuTestCaseBase):
"""
Class that should be used by all python AMDGPU tests.
"""

NO_DEBUG_INFO_TESTCASE = True

def run_to_gpu_breakpoint(
self, source, gpu_bkpt_pattern, cpu_bkpt_pattern
):
self, source: str, gpu_bkpt_pattern: str, cpu_bkpt_pattern: str
) -> List[lldb.SBThread]:
"""Run the test executable unit it hits the provided GPU breakpoint.
The CPU breakpoint is used as a stopping point to switch to the GPU target
and set the GPU breakpoint on that target.
Expand All @@ -24,6 +26,12 @@ def run_to_gpu_breakpoint(
)
self.assertEqual(cpu_target, self.cpu_target)

gpu_bkpt_id = self.set_gpu_source_breakpoint(source, gpu_bkpt_pattern)

return self.continue_to_gpu_breakpoint(gpu_bkpt_id)

def set_gpu_source_breakpoint(self, source: str, gpu_bkpt_pattern: str) -> int:
"""Set a breakpoint on the gpu target. Returns the breakpoint id."""
# Switch to the GPU target so we can set a breakpoint.
self.assertTrue(self.gpu_target.IsValid())
self.select_gpu()
Expand All @@ -32,23 +40,47 @@ def run_to_gpu_breakpoint(
# This might not yet resolve to a location so use -2 to not check
# for the number of locations.
line = line_number(source, gpu_bkpt_pattern)
gpu_bkpt = lldbutil.run_break_set_by_file_and_line(
return lldbutil.run_break_set_by_file_and_line(
self, source, line, num_expected_locations=-2, loc_exact=False
)

def continue_to_gpu_breakpoint(self, gpu_bkpt_id: int) -> List[lldb.SBThread]:
"""Continues execution on the cpu and gpu until we hit the gpu breakpoint"""
# Need to run these commands asynchronously to be able to switch targets.
self.setAsync(True)
listener = self.dbg.GetListener()

# Continue the GPU process.
self.runCmd("c")
lldbutil.expect_state_changes(self, listener, self.gpu_process, [lldb.eStateRunning])
lldbutil.expect_state_changes(
self, listener, self.gpu_process, [lldb.eStateRunning]
)

# Continue the CPU process.
self.select_cpu()
self.runCmd("c")
lldbutil.expect_state_changes(self, listener, self.cpu_process, [lldb.eStateRunning])
lldbutil.expect_state_changes(
self, listener, self.cpu_process, [lldb.eStateRunning]
)

# GPU breakpoint should get hit.
lldbutil.expect_state_changes(self, listener, self.gpu_process, [lldb.eStateStopped])
return lldbutil.get_threads_stopped_at_breakpoint_id(self.gpu_process, gpu_bkpt)
lldbutil.expect_state_changes(
self, listener, self.gpu_process, [lldb.eStateStopped]
)
return lldbutil.get_threads_stopped_at_breakpoint_id(
self.gpu_process, gpu_bkpt_id
)

def continue_to_gpu_source_breakpoint(
self, source: str, gpu_bkpt_pattern: str
) -> List[lldb.SBThread]:
"""
Sets a gpu breakpoint set by source regex gpu_bkpt_pattern, continues the process, and deletes the breakpoint again.
Otherwise the same as `continue_to_gpu_breakpoint`.
Inspired by lldbutil.continue_to_source_breakpoint.
"""
gpu_bkpt_id = self.set_gpu_source_breakpoint(source, gpu_bkpt_pattern)
gpu_threads = self.continue_to_gpu_breakpoint(gpu_bkpt_id)
self.gpu_target.BreakpointDelete(gpu_bkpt_id)

return gpu_threads
3 changes: 3 additions & 0 deletions lldb/test/API/gpu/amd/address-space/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
HIP_SOURCES := aspace.hip

include Makefile.rules
209 changes: 209 additions & 0 deletions lldb/test/API/gpu/amd/address-space/TestAddressSpaceAmdGpuPlugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
"""
Address space tests for the AMDGPU plugin.
"""

import lldb
import lldbsuite.test.lldbutil as lldbutil
from lldbsuite.test.lldbtest import *
from amdgpu_testcase import *

SOURCE = "aspace.hip"

# The following addresses are offsets from the start of the lane private stack
# for the location where these kernel local variables are stored. Note that
# these addresses depend on the stack layout by the compiler. They seem to be
# pretty consistent across rocm versions, but be aware that these private stack
# addresses could change.
SIZE_STACK_OFFSET = 0x20
IDX_STACK_OFFSET = 0x24

# The wave size is the number of lanes in a wave.
WAVE_SIZE = 64


class Location:
"""Helper class to describe a location in memory with an expected value"""

def __init__(self, name, expected_value, size_in_bytes, address):
self.name = name
self.address = address
if isinstance(expected_value, list):
self.expected_bytes = b"".join(
[n.to_bytes(size_in_bytes, "little") for n in expected_value]
)
self.size_in_bytes = size_in_bytes * len(expected_value)
else:
self.expected_bytes = expected_value.to_bytes(size_in_bytes, "little")
self.size_in_bytes = size_in_bytes


class AddressSpaceAmdGpuTestCase(AmdGpuTestCaseBase):
def validate_memory_read(
self, address_space: str, loc: Location, thread: lldb.SBThread
):
"""Helper function to validate memory read from an address space"""
addr_spec = lldb.SBAddressSpec(loc.address, address_space, thread)
error = lldb.SBError()
data = self.gpu_process.ReadMemoryFromSpec(addr_spec, loc.size_in_bytes, error)
self.assertTrue(
error.Success(),
f"{loc.name} reading from address space '{address_space}' failed: {str(error)}",
)
self.assertEqual(
data,
loc.expected_bytes,
f"Data for Location {loc.name} does not match expected value",
)

def validate_read_address_from_global_variable(
self, address_space: str, location: Location
):
"""Helper function to check we can read from address space using an address from a global variable.
The global variable lookup is done on the cpu side and its value is added to the current location
address to allow easy offsetting from a global variable that represents an array.
"""
self.build()

lldbutil.run_to_source_breakpoint(
self, "// CPU BREAKPOINT - BEFORE LAUNCH", lldb.SBFileSpec(SOURCE)
)

# Find the variable in the CPU target.
var = self.cpu_target.FindFirstGlobalVariable(location.name)
self.assertTrue(
var.IsValid(),
f"{location.name} variable should be valid in the CPU target",
)

# Get the address stored in the variable (it's a pointer memory).
addr = var.GetValueAsUnsigned()
self.assertNotEqual(addr, 0, f"{location.name} address should not be null")
location.address += addr

# Continue executing to the gpu breakpoint.
gpu_threads = self.continue_to_gpu_source_breakpoint(
SOURCE,
"// GPU BREAKPOINT",
)
self.assertNotEqual(None, gpu_threads, "GPU should be stopped at breakpoint")

# Switch back to GPU to read from generic address space
self.select_gpu()

self.validate_memory_read(address_space, location, gpu_threads[0])

def run_to_first_gpu_breakpoint(self):
"""Helper to run to common gpu breakpoint"""
self.build()

gpu_threads = self.run_to_gpu_breakpoint(
SOURCE, "// GPU BREAKPOINT", "// CPU BREAKPOINT - BEFORE LAUNCH"
)
self.assertNotEqual(None, gpu_threads, "GPU should be stopped at breakpoint")
return gpu_threads

def test_global(self):
"""Test reading from the global address space."""
# Read from global address space using the address stored in the host_output pointer.
# The host_output variable is a pointer to global memory.
# We expect the element at index 1 to be 0 since the host_output buffer
# has not been updated at the point where we hit the gpu breakpoint.
location = Location(
"host_output",
expected_value=0,
size_in_bytes=4,
address=4, # Offset address by 4 bytes to get index 1.
)
self.validate_read_address_from_global_variable("global", location)

def test_generic(self):
"""Test reading from the generic address space."""
# Read from generic address space using the address stored in the device_output pointer.
# The device_output variable is a pointer to device visible memory.
# We expect the element at index 1 to be 2 (output[1] = shared_mem[1] = 1 * 2 = 2).
location = Location(
"device_output",
expected_value=2,
size_in_bytes=4,
address=4, # Offset address by 4 bytes to get index 1.
)
self.validate_read_address_from_global_variable("generic", location)

def test_region(self):
"""Test that we fail to read from the region address space. It is not supported on this architecture (MI300/MI350)."""
self.run_to_first_gpu_breakpoint()

addr_spec = lldb.SBAddressSpec(0, "region")
error = lldb.SBError()
self.gpu_process.ReadMemoryFromSpec(addr_spec, 1, error)

self.assertFalse(error.Success(), "Read from region address space should fail")
self.assertEqual(
"AMD_DBGAPI_STATUS_ERROR: AMD_DBGAPI_STATUS_ERROR_INVALID_ARGUMENT_COMPATIBILITY",
error.GetCString(),
)

def test_local(self):
"""Test that we can read from local memory."""
gpu_threads = self.run_to_first_gpu_breakpoint()

# Check that we can read local memory locations.
# We expect the element at index 3 to be 6 (shared_mem[3] = 3 * 2 = 6).
location = Location("shared_mem", expected_value=6, size_in_bytes=4, address=12)
self.validate_memory_read("local", location, gpu_threads[0])

def test_private_lane(self):
"""Test that we can read from the private_lane address space."""
gpu_threads = self.run_to_first_gpu_breakpoint()

# Check that we can read private_lane memory Locations.
# These locations map to local variables on the private stack memory for
# each lane. The `size` value is the input parameter to the kernel and
# the `idx` value is the local variable in the kernel that stores the
# threadIdx.x value. We check the `idx` variable twice since it should
# have different values for lane 0 and lane 1.
checks = [
(
Location(
"size",
expected_value=WAVE_SIZE,
size_in_bytes=4,
address=SIZE_STACK_OFFSET,
),
gpu_threads[0],
),
(
Location(
"idx", expected_value=0, size_in_bytes=4, address=IDX_STACK_OFFSET
),
gpu_threads[0],
),
(
Location(
"idx", expected_value=1, size_in_bytes=4, address=IDX_STACK_OFFSET
),
gpu_threads[1],
),
]

for location, thread in checks:
self.validate_memory_read("private_lane", location, thread)

def test_private_wave(self):
"""Test that we can read from the private_wave address space."""

# The private_wave address space has the unswizzled values for each lane.
# This makes it easy for the debugger to read the value of one variable
# for each lane as consecutive memory locations.
#
# The location below describes the first 3 lane values for the idx variable.
# The memory offset is calculated finding the offset of the idx variable
# for a lane and then multiplying by the wave size since the unswizzled
# memory is laid out with the values for each lane in consecutive memory.
addr = IDX_STACK_OFFSET * WAVE_SIZE

gpu_threads = self.run_to_first_gpu_breakpoint()
location = Location(
"idx[0:3]", expected_value=[0, 1, 2], size_in_bytes=4, address=addr
)
self.validate_memory_read("private_wave", location, gpu_threads[0])
62 changes: 62 additions & 0 deletions lldb/test/API/gpu/amd/address-space/aspace.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <cstdio>
#include <hip/hip_runtime.h>

constexpr int error_exit_code = -1;
#define HIP_CHECK(condition) \
{ \
const hipError_t error = condition; \
if (error != hipSuccess) { \
printf("An error encountered: \"%s\"" \
" at %s:%d", \
hipGetErrorString(error), __FILE__, __LINE__); \
std::exit(error_exit_code); \
} \
}

constexpr int NUM_THREADS = 64;
constexpr int SHARED_MEM_SIZE = NUM_THREADS;
__global__ void address_space_demo_kernel(int *output, int size) {
// Private variable (per-thread)
int idx = threadIdx.x;

// Shared memory (local to thread block)
// Initialize shared memory by first thread in block
__shared__ int shared_mem[SHARED_MEM_SIZE];
if (threadIdx.x == 0) {
for (int i = 0; i < SHARED_MEM_SIZE; i++) {
shared_mem[i] = i * 2;
}
}

__syncthreads();

if (idx < size) {
output[idx] = shared_mem[idx];
} // GPU BREAKPOINT
}

int *host_output = nullptr;
int *device_output = nullptr;

int main() {
// Allocate and initialize host buffer.
size_t output_size_in_bytes = NUM_THREADS * sizeof(int);
host_output = (int*)malloc(output_size_in_bytes);
memset(host_output, 0, output_size_in_bytes);

HIP_CHECK(
hipMalloc(&device_output, NUM_THREADS * sizeof(int))); // CPU BREAKPOINT - BEFORE LAUNCH
// Launch kernel with 1 block of 256 threads
hipLaunchKernelGGL(address_space_demo_kernel, dim3(1), dim3(NUM_THREADS), 0,
0, device_output, NUM_THREADS);
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemcpy(host_output, device_output, NUM_THREADS * sizeof(int),
hipMemcpyDeviceToHost));
// Print some results
for (int i = 0; i < 10; i++) {
printf("output[%d] = %d\n", i, host_output[i]);
}
HIP_CHECK(hipFree(device_output));
free(host_output);
return 0;
}
Loading