Commit f439401

Merge branch 'integ-buff' of github.com:mateuszpn/llvm into integ-buff
Signed-off-by: Mateusz P. Nowak <[email protected]>
2 parents c8b075a + dac9ccd

File tree: 3 files changed, +92 -82 lines

sycl/test-e2e/format.py

Lines changed: 9 additions & 1 deletion
@@ -334,6 +334,14 @@ def get_extra_env(sycl_devices):
     expanded = "env"
     extra_env = get_extra_env([parsed_dev_name])
+    backend, device = parsed_dev_name.split(":", 1)
+    device_selector = parsed_dev_name
+    if backend == "level_zero" and device.isdigit():
+        # Some local configs set ZE_AFFINITY_MASK to 0;
+        # override it when a different device is requested.
+        extra_env.append(f"ZE_AFFINITY_MASK={device}")
+        device_selector = f"{backend}:0"
     if extra_env:
         expanded += " {}".format(" ".join(extra_env))

@@ -343,7 +351,7 @@ def get_extra_env(sycl_devices):
     expanded += " env UR_LOADER_USE_LEVEL_ZERO_V2=0"
     expanded += " ONEAPI_DEVICE_SELECTOR={} {}".format(
-        parsed_dev_name, test.config.run_launcher
+        device_selector, test.config.run_launcher
     )
     cmd = directive.command.replace("%{run}", expanded)
     # Expand device-specific conditions (%if ... %{ ... %}).
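For illustration, a self-contained sketch of the remapping this hunk introduces (remap_level_zero_selector is a hypothetical helper, not part of format.py). The assumption, per the hunk, is that masking a Level Zero device with ZE_AFFINITY_MASK makes it enumerate as device 0, so the selector has to be rewritten to match:

def remap_level_zero_selector(parsed_dev_name):
    extra_env = []
    device_selector = parsed_dev_name
    backend, device = parsed_dev_name.split(":", 1)
    if backend == "level_zero" and device.isdigit():
        # Pin the requested physical device via the affinity mask; the
        # masked device is then enumerated by the driver as device 0.
        extra_env.append(f"ZE_AFFINITY_MASK={device}")
        device_selector = f"{backend}:0"
    return extra_env, device_selector

assert remap_level_zero_selector("level_zero:2") == (["ZE_AFFINITY_MASK=2"], "level_zero:0")
assert remap_level_zero_selector("opencl:gpu") == ([], "opencl:gpu")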

sycl/test-e2e/lit.cfg.py

Lines changed: 5 additions & 2 deletions
@@ -917,12 +917,14 @@ def get_sycl_ls_verbose(sycl_device, env):

     env = copy.copy(llvm_config.config.environment)

+    backend_for_selector = backend.replace("_v2", "").replace("_v1", "")
+
     # Find all available devices under the backend
-    env["ONEAPI_DEVICE_SELECTOR"] = backend + ":*"
+    env["ONEAPI_DEVICE_SELECTOR"] = backend_for_selector + ":*"

     detected_architectures = []

-    platform_devices = remove_level_zero_suffix(backend + ":*")
+    platform_devices = backend_for_selector + ":*"

     for line in get_sycl_ls_verbose(platform_devices, env).stdout.splitlines():
         if re.match(r" *Architecture:", line):
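The normalization matters because sycl-ls is driven through ONEAPI_DEVICE_SELECTOR, which only understands plain backend names, not the internal "_v1"/"_v2" adapter-variant suffixes used by the test config. A minimal sketch (normalize_backend is a hypothetical helper name):

def normalize_backend(backend):
    # ONEAPI_DEVICE_SELECTOR accepts "level_zero", not "level_zero_v2".
    return backend.replace("_v2", "").replace("_v1", "")

assert normalize_backend("level_zero_v2") == "level_zero"
assert normalize_backend("level_zero") == "level_zero"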
@@ -1112,6 +1114,7 @@ def get_sycl_ls_verbose(sycl_device, env):
     features.update(sg_size_features)
     features.update(architecture_feature)
     features.update(device_family)
+    features.update(aspects)

     be, dev = sycl_device.split(":")
     features.add(dev.replace("fpga", "accelerator"))

unified-runtime/source/adapters/level_zero/v2/memory.cpp

Lines changed: 78 additions & 79 deletions
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//

 #include "memory.hpp"
+
 #include "../ur_interface_loader.hpp"
 #include "context.hpp"

@@ -53,57 +54,69 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
   /* nop */
 }

+static v2::raii::command_list_unique_handle
+getSyncCommandListForCopy(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice) {
+  v2::command_list_desc_t listDesc;
+  listDesc.IsInOrder = true;
+  listDesc.Ordinal =
+      hDevice
+          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
+          .ZeOrdinal;
+  listDesc.CopyOffloadEnable = true;
+  return hContext->getCommandListCache().getImmediateCommandList(
+      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
+}
+
+static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
+                                     ur_device_handle_t hDevice, void *dst,
+                                     const void *src, size_t size) try {
+  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
+
+  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
+
+  return UR_RESULT_SUCCESS;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     ur_context_handle_t hContext, void *hostPtr, size_t size,
     device_access_mode_t accessMode)
     : ur_mem_buffer_t(hContext, size, accessMode) {
-  if (hostPtr) {
-    // Host pointer provided - check if it's already USM or needs import
-    ZeStruct<ze_memory_allocation_properties_t> memProps;
-    auto ret =
-        getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps);
-
-    if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) {
-      // Already a USM allocation - just use it directly without import
-      this->ptr = usm_unique_ptr_t(hostPtr, [](void *) {});
-      return;
-    }
-
-    // Not USM - try to import it
-    bool hostPtrImported =
-        maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated,
-                       hContext->getZeHandle(), hostPtr, size);
-
-    if (hostPtrImported) {
-      // Successfully imported - use it with release
-      this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) {
-        ZeUSMImport.doZeUSMRelease(
-            hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr);
-      });
-      // No copy-back needed for imported pointers
-      return;
-    }
-
-    // Import failed - allocate backing buffer and set up copy-back
-  }
-
-  // No host pointer, or import failed - allocate new USM host memory
-  void *rawPtr;
-  UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
-      hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
-
-  this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) {
-    auto ret = hContext->getDefaultUSMPool()->free(ptr);
-    if (ret != UR_RESULT_SUCCESS) {
-      UR_LOG(ERR, "Failed to free host memory: {}", ret);
-    }
-  });
-
-  if (hostPtr) {
-    // Copy data from user pointer to our backing buffer
-    std::memcpy(this->ptr.get(), hostPtr, size);
-    // Remember to copy back on destruction
-    writeBackPtr = hostPtr;
-  }
+  bool hostPtrImported =
+      maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated,
+                     hContext->getZeHandle(), hostPtr, size);
+
+  if (hostPtrImported) {
+    this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) {
+      ZeUSMImport.doZeUSMRelease(
+          hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr);
+    });
+  } else {
+    void *rawPtr;
+    // Use HOST memory for integrated GPUs to enable zero-copy device access
+    UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
+        hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
+
+    this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) {
+      auto ret = hContext->getDefaultUSMPool()->free(ptr);
+      if (ret != UR_RESULT_SUCCESS) {
+        UR_LOG(ERR, "Failed to free host memory: {}", ret);
+      }
+    });

+    if (hostPtr) {
+      // Initial copy using Level Zero for USM HOST memory
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(
+          synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
+      // Set writeBackPtr to enable map/unmap copy-back (but NOT destructor
+      // copy-back)
+      writeBackPtr = hostPtr;
+    }
+  }
 }

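For context on why the helper is synchronous: a SYNCHRONOUS immediate command list makes every appended command block until it completes, which is what synchronousZeCopy relies on. A minimal raw Level Zero sketch of the same pattern (error handling abbreviated; the UR helper above additionally selects the compute queue ordinal and enables copy offload via the adapter's command-list cache):

#include <level_zero/ze_api.h>

// Each append on a SYNCHRONOUS immediate command list blocks until the
// command completes, so no explicit fence or event wait is needed.
ze_result_t syncCopy(ze_context_handle_t ctx, ze_device_handle_t dev,
                     void *dst, const void *src, size_t size) {
  ze_command_queue_desc_t qDesc = {};
  qDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
  qDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
  ze_command_list_handle_t list = nullptr;
  ze_result_t res = zeCommandListCreateImmediate(ctx, dev, &qDesc, &list);
  if (res != ZE_RESULT_SUCCESS)
    return res;
  res = zeCommandListAppendMemoryCopy(list, dst, src, size,
                                      /*hSignalEvent=*/nullptr,
                                      /*numWaitEvents=*/0,
                                      /*phWaitEvents=*/nullptr);
  zeCommandListDestroy(list);
  return res;
}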
@@ -119,12 +132,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
   });
 }

-ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
-  if (writeBackPtr) {
-    std::memcpy(writeBackPtr, this->ptr.get(), size);
-  }
-}
-
 void *ur_integrated_buffer_handle_t::getDevicePtr(
     ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
     size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -140,7 +147,11 @@ void *ur_integrated_buffer_handle_t::mapHostPtr(
   void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;

   if (flags & UR_MAP_FLAG_READ) {
-    std::memcpy(mappedPtr, ur_cast<char *>(ptr.get()) + offset, mapSize);
+    // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+    auto hDevice = hContext->getDevices()[0];
+    UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
+                                     ur_cast<char *>(ptr.get()) + offset,
+                                     mapSize));
   }

   // Track this mapping for unmap
@@ -172,8 +183,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr(

   if (mappedRegion->flags &
       (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
-    std::memcpy(ur_cast<char *>(ptr.get()) + mappedRegion->offset,
-                mappedRegion->ptr.get(), mappedRegion->size);
+    // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+    auto hDevice = hContext->getDevices()[0];
+    UR_CALL_THROWS(synchronousZeCopy(
+        hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
+        mappedRegion->ptr.get(), mappedRegion->size));
   }

   mappedRegions.erase(mappedRegion);
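A hedged usage sketch of how these two copies are reached from the public UR API (queue and buffer creation assumed; updateBuffer is illustrative and error checks are elided; signatures per ur_api.h): mapping with UR_MAP_FLAG_READ drives the copy in mapHostPtr, and unmapping a UR_MAP_FLAG_WRITE mapping drives the copy in unmapHostPtr.

#include <ur_api.h>

void updateBuffer(ur_queue_handle_t q, ur_mem_handle_t buf, size_t bytes) {
  void *host = nullptr;
  // Blocking map for read+write: mapHostPtr copies the current buffer
  // contents out to the write-back region before returning.
  urEnqueueMemBufferMap(q, buf, /*blockingMap=*/true,
                        UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE,
                        /*offset=*/0, bytes, 0, nullptr, nullptr, &host);
  static_cast<char *>(host)[0] ^= 1; // mutate the mapped bytes
  // Unmapping a WRITE mapping: unmapHostPtr copies the region back into
  // the USM backing allocation via synchronousZeCopy.
  urEnqueueMemBufferUnmap(q, buf, host, 0, nullptr, nullptr);
}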
@@ -182,32 +196,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr(
   // No op for zero-copy path, memory is synced
 }

-static v2::raii::command_list_unique_handle
-getSyncCommandListForCopy(ur_context_handle_t hContext,
-                          ur_device_handle_t hDevice) {
-  v2::command_list_desc_t listDesc;
-  listDesc.IsInOrder = true;
-  listDesc.Ordinal =
-      hDevice
-          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
-          .ZeOrdinal;
-  listDesc.CopyOffloadEnable = true;
-  return hContext->getCommandListCache().getImmediateCommandList(
-      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
-      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
-}
-
-static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
-                                     ur_device_handle_t hDevice, void *dst,
-                                     const void *src, size_t size) try {
-  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
-
-  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
-             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
-
-  return UR_RESULT_SUCCESS;
-} catch (...) {
-  return exceptionToResult(std::current_exception());
+ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
+  // Do NOT copy back automatically in the destructor - it causes heap
+  // corruption, because writeBackPtr may be freed by the SYCL runtime before
+  // the buffer destructor runs. Copy-back happens via explicit map/unmap
+  // operations (see mapHostPtr/unmapHostPtr).
 }

 void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
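The lifetime hazard that motivates the now-empty destructor, as an illustrative sketch (simplified, not UR code): a destructor-time copy-back writes through a pointer the buffer does not own, which is unsafe once the owner has freed it.

struct StagedBuffer {
  void *staging;      // USM-host backing allocation, owned by the buffer
  void *writeBackPtr; // user-owned host pointer captured at creation
  size_t size;
  ~StagedBuffer() {
    // UNSAFE if the owner already freed writeBackPtr - the write lands in
    // freed memory and corrupts the heap. Hence copy-back is done only in
    // unmapHostPtr, while the mapping is known to be alive:
    // std::memcpy(writeBackPtr, staging, size);
  }
};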
@@ -618,6 +611,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
   void *hostPtr = pProperties ? pProperties->pHost : nullptr;
   auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);

+  // For integrated devices, use zero-copy host buffers. The integrated buffer
+  // constructor handles all cases:
+  //   1. No host pointer - allocate USM host memory
+  //   2. Host pointer is already USM - use it directly
+  //   3. Host pointer can be imported - import it
+  //   4. Otherwise - allocate USM and copy back via map/unmap
   if (useHostBuffer(hContext)) {
     *phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
         hContext, hostPtr, size, accessMode);
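A hedged usage sketch of this entry point (error handling elided; makeHostBackedBuffer is illustrative, with struct and flag names per ur_api.h): passing a user pointer through ur_buffer_properties_t::pHost with UR_MEM_FLAG_USE_HOST_POINTER exercises cases 2-4 above on devices where useHostBuffer(hContext) is true.

#include <ur_api.h>
#include <vector>

ur_mem_handle_t makeHostBackedBuffer(ur_context_handle_t ctx,
                                     std::vector<float> &data) {
  ur_buffer_properties_t props = {};
  props.stype = UR_STRUCTURE_TYPE_BUFFER_PROPERTIES;
  props.pHost = data.data(); // import if possible, otherwise stage + copy
  ur_mem_handle_t buf = nullptr;
  urMemBufferCreate(ctx, UR_MEM_FLAG_USE_HOST_POINTER,
                    data.size() * sizeof(float), &props, &buf);
  return buf;
}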
