Skip to content

Commit 2262feb

Browse files
committed
Merge branch 'integ-buff' of github.com:mateuszpn/llvm into integ-buff
Signed-off-by: Mateusz P. Nowak <[email protected]>
2 parents c8b075a + dac9ccd commit 2262feb

File tree

3 files changed

+92
-83
lines changed

3 files changed

+92
-83
lines changed

sycl/test-e2e/format.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,14 @@ def get_extra_env(sycl_devices):
334334
expanded = "env"
335335

336336
extra_env = get_extra_env([parsed_dev_name])
337+
backend, device = parsed_dev_name.split(":", 1)
338+
device_selector = parsed_dev_name
339+
if backend == "level_zero" and device.isdigit():
340+
# some local cfgs set ZE_AFFINITY_MASK to 0
341+
# override it if using a different device.
342+
extra_env.append(f"ZE_AFFINITY_MASK={device}")
343+
device_selector = f"{backend}:0"
344+
337345
if extra_env:
338346
expanded += " {}".format(" ".join(extra_env))
339347

@@ -343,7 +351,7 @@ def get_extra_env(sycl_devices):
343351
expanded += " env UR_LOADER_USE_LEVEL_ZERO_V2=0"
344352

345353
expanded += " ONEAPI_DEVICE_SELECTOR={} {}".format(
346-
parsed_dev_name, test.config.run_launcher
354+
device_selector, test.config.run_launcher
347355
)
348356
cmd = directive.command.replace("%{run}", expanded)
349357
# Expand device-specific conditions (%if ... %{ ... %}).

sycl/test-e2e/lit.cfg.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -917,12 +917,14 @@ def get_sycl_ls_verbose(sycl_device, env):
917917

918918
env = copy.copy(llvm_config.config.environment)
919919

920+
backend_for_selector = backend.replace("_v2", "").replace("_v1", "")
921+
920922
# Find all available devices under the backend
921-
env["ONEAPI_DEVICE_SELECTOR"] = backend + ":*"
923+
env["ONEAPI_DEVICE_SELECTOR"] = backend_for_selector + ":*"
922924

923925
detected_architectures = []
924926

925-
platform_devices = remove_level_zero_suffix(backend + ":*")
927+
platform_devices = backend_for_selector + ":*"
926928

927929
for line in get_sycl_ls_verbose(platform_devices, env).stdout.splitlines():
928930
if re.match(r" *Architecture:", line):
@@ -1112,6 +1114,7 @@ def get_sycl_ls_verbose(sycl_device, env):
11121114
features.update(sg_size_features)
11131115
features.update(architecture_feature)
11141116
features.update(device_family)
1117+
features.update(aspects)
11151118

11161119
be, dev = sycl_device.split(":")
11171120
features.add(dev.replace("fpga", "accelerator"))

unified-runtime/source/adapters/level_zero/v2/memory.cpp

Lines changed: 78 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
//
99
//===----------------------------------------------------------------------===//
1010

11-
#include "memory.hpp"
1211
#include "../ur_interface_loader.hpp"
1312
#include "context.hpp"
13+
#include "memory.hpp"
1414

1515
#include "../helpers/memory_helpers.hpp"
1616
#include "../image_common.hpp"
@@ -53,57 +53,69 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
5353
/* nop */
5454
}
5555

56+
static v2::raii::command_list_unique_handle
57+
getSyncCommandListForCopy(ur_context_handle_t hContext,
58+
ur_device_handle_t hDevice) {
59+
v2::command_list_desc_t listDesc;
60+
listDesc.IsInOrder = true;
61+
listDesc.Ordinal =
62+
hDevice
63+
->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
64+
.ZeOrdinal;
65+
listDesc.CopyOffloadEnable = true;
66+
return hContext->getCommandListCache().getImmediateCommandList(
67+
hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
68+
ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
69+
}
70+
71+
static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
72+
ur_device_handle_t hDevice, void *dst,
73+
const void *src, size_t size) try {
74+
auto commandList = getSyncCommandListForCopy(hContext, hDevice);
75+
76+
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
77+
(commandList.get(), dst, src, size, nullptr, 0, nullptr));
78+
79+
return UR_RESULT_SUCCESS;
80+
} catch (...) {
81+
return exceptionToResult(std::current_exception());
82+
}
83+
5684
ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
5785
ur_context_handle_t hContext, void *hostPtr, size_t size,
5886
device_access_mode_t accessMode)
5987
: ur_mem_buffer_t(hContext, size, accessMode) {
60-
if (hostPtr) {
61-
// Host pointer provided - check if it's already USM or needs import
62-
ZeStruct<ze_memory_allocation_properties_t> memProps;
63-
auto ret =
64-
getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps);
65-
66-
if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) {
67-
// Already a USM allocation - just use it directly without import
68-
this->ptr = usm_unique_ptr_t(hostPtr, [](void *) {});
69-
return;
70-
}
71-
72-
// Not USM - try to import it
73-
bool hostPtrImported =
74-
maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated,
75-
hContext->getZeHandle(), hostPtr, size);
76-
77-
if (hostPtrImported) {
78-
// Successfully imported - use it with release
79-
this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) {
80-
ZeUSMImport.doZeUSMRelease(
81-
hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr);
82-
});
83-
// No copy-back needed for imported pointers
84-
return;
85-
}
86-
87-
// Import failed - allocate backing buffer and set up copy-back
88-
}
88+
bool hostPtrImported =
89+
maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated,
90+
hContext->getZeHandle(), hostPtr, size);
91+
92+
if (hostPtrImported) {
93+
this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) {
94+
ZeUSMImport.doZeUSMRelease(
95+
hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr);
96+
});
97+
} else {
98+
void *rawPtr;
99+
// Use HOST memory for integrated GPUs to enable zero-copy device access
100+
UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
101+
hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
89102

90-
// No host pointer, or import failed - allocate new USM host memory
91-
void *rawPtr;
92-
UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
93-
hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
103+
this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) {
104+
auto ret = hContext->getDefaultUSMPool()->free(ptr);
105+
if (ret != UR_RESULT_SUCCESS) {
106+
UR_LOG(ERR, "Failed to free host memory: {}", ret);
107+
}
108+
});
94109

95-
this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) {
96-
auto ret = hContext->getDefaultUSMPool()->free(ptr);
97-
if (ret != UR_RESULT_SUCCESS) {
98-
UR_LOG(ERR, "Failed to free host memory: {}", ret);
110+
if (hostPtr) {
111+
// Initial copy using Level Zero for USM HOST memory
112+
auto hDevice = hContext->getDevices()[0];
113+
UR_CALL_THROWS(
114+
synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
115+
// Set writeBackPtr to enable map/unmap copy-back (but NOT destructor
116+
// copy-back)
117+
writeBackPtr = hostPtr;
99118
}
100-
});
101-
102-
if (hostPtr) {
103-
// Copy data from user pointer to our backing buffer
104-
std::memcpy(this->ptr.get(), hostPtr, size);
105-
// Remember to copy back on destruction
106-
writeBackPtr = hostPtr;
107119
}
108120
}
109121

@@ -119,12 +131,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
119131
});
120132
}
121133

122-
ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
123-
if (writeBackPtr) {
124-
std::memcpy(writeBackPtr, this->ptr.get(), size);
125-
}
126-
}
127-
128134
void *ur_integrated_buffer_handle_t::getDevicePtr(
129135
ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
130136
size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -140,7 +146,11 @@ void *ur_integrated_buffer_handle_t::mapHostPtr(
140146
void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;
141147

142148
if (flags & UR_MAP_FLAG_READ) {
143-
std::memcpy(mappedPtr, ur_cast<char *>(ptr.get()) + offset, mapSize);
149+
// Use Level Zero copy for USM HOST memory to ensure GPU visibility
150+
auto hDevice = hContext->getDevices()[0];
151+
UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
152+
ur_cast<char *>(ptr.get()) + offset,
153+
mapSize));
144154
}
145155

146156
// Track this mapping for unmap
@@ -172,8 +182,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr(
172182

173183
if (mappedRegion->flags &
174184
(UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
175-
std::memcpy(ur_cast<char *>(ptr.get()) + mappedRegion->offset,
176-
mappedRegion->ptr.get(), mappedRegion->size);
185+
// Use Level Zero copy for USM HOST memory to ensure GPU visibility
186+
auto hDevice = hContext->getDevices()[0];
187+
UR_CALL_THROWS(synchronousZeCopy(
188+
hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
189+
mappedRegion->ptr.get(), mappedRegion->size));
177190
}
178191

179192
mappedRegions.erase(mappedRegion);
@@ -182,32 +195,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr(
182195
// No op for zero-copy path, memory is synced
183196
}
184197

185-
static v2::raii::command_list_unique_handle
186-
getSyncCommandListForCopy(ur_context_handle_t hContext,
187-
ur_device_handle_t hDevice) {
188-
v2::command_list_desc_t listDesc;
189-
listDesc.IsInOrder = true;
190-
listDesc.Ordinal =
191-
hDevice
192-
->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
193-
.ZeOrdinal;
194-
listDesc.CopyOffloadEnable = true;
195-
return hContext->getCommandListCache().getImmediateCommandList(
196-
hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
197-
ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
198-
}
199-
200-
static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
201-
ur_device_handle_t hDevice, void *dst,
202-
const void *src, size_t size) try {
203-
auto commandList = getSyncCommandListForCopy(hContext, hDevice);
204-
205-
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
206-
(commandList.get(), dst, src, size, nullptr, 0, nullptr));
207-
208-
return UR_RESULT_SUCCESS;
209-
} catch (...) {
210-
return exceptionToResult(std::current_exception());
198+
ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
199+
// Do NOT do automatic copy-back in destructor - it causes heap corruption
200+
// because writeBackPtr may be freed by SYCL runtime before buffer destructor
201+
// runs. Copy-back happens via explicit map/unmap operations (see
202+
// mapHostPtr/unmapHostPtr).
211203
}
212204

213205
void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
@@ -618,6 +610,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
618610
void *hostPtr = pProperties ? pProperties->pHost : nullptr;
619611
auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);
620612

613+
// For integrated devices, use zero-copy host buffers. The integrated buffer
614+
// constructor will handle all cases:
615+
// 1. No host pointer - allocate USM host memory
616+
// 2. Host pointer is already USM - use directly
617+
// 3. Host pointer can be imported - import it
618+
// 4. Otherwise - allocate USM and copy-back on destruction
621619
if (useHostBuffer(hContext)) {
622620
*phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
623621
hContext, hostPtr, size, accessMode);

0 commit comments

Comments
 (0)