Skip to content
Closed
Changes from 2 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
80e793a
vulkan: avoid preferring transfer queue on AMD UMA devices
winstonma Apr 28, 2026
da5e315
Optimize Vulkan buffer transfers on UMA (Unified Memory Architecture)…
winstonma Apr 28, 2026
fe1eb03
vulkan: centralize UMA read heuristic and configurable threshold
winstonma Apr 28, 2026
bd5db36
fix incorrect async/event ordering on Vulkan, where a host read could…
winstonma May 1, 2026
91176d3
implement UMA write threshold to avoid non-cached memory penalty
winstonma May 2, 2026
dfcc950
implement an automatic calibration system for UMA (Unified Memory Arc…
winstonma May 2, 2026
8819a4f
refactoring and cleanup pass for this PR
winstonma May 2, 2026
f1c0532
reverses the decision criteria for using direct memory access when th…
winstonma May 2, 2026
4ecad4f
Merge commit 'refs/pull/22455/head' of https://github.com/ggml-org/ll…
winstonma May 2, 2026
3139dcf
fixes measurement bias that was causing suboptimal transfer strategy …
winstonma May 4, 2026
6133345
removes dead code structure in ggml_vk_buffer_read_2d_asyn
winstonma May 4, 2026
9309d72
Fixed the indentation inconsistency
winstonma May 4, 2026
bd7701b
added UMA write thresholding for ggml_vk_buffer_memset_async and ggml…
winstonma May 4, 2026
6f85dc0
revert prefers_transfer_queue definition and comments
winstonma May 4, 2026
8f2fb72
use read/write barrier to address potential race conditions
winstonma May 4, 2026
d38def3
cleanup and deduplication
winstonma May 4, 2026
e7db9e1
cleanup and deduplication #2
winstonma May 4, 2026
630716e
follows best practices for handling platform-specific size differences
winstonma May 4, 2026
5c78cdd
added a two-line comment explaining the contract
winstonma May 4, 2026
ab18b5a
Revert "fix incorrect async/event ordering on Vulkan"
winstonma May 4, 2026
b58976a
making the barrier a silent no-op
winstonma May 4, 2026
bd0a0ff
removed the premature UMA direct transfer check in ggml_vk_buffer_rea…
winstonma May 4, 2026
2aad038
fixing slow read speed
winstonma May 4, 2026
c52584d
fix calibration
winstonma May 5, 2026
e176a81
remove flush cache
winstonma May 5, 2026
8c67e77
refactor and optimize the calibration process
winstonma May 5, 2026
0ae5d6a
adjusted the read/write logic
winstonma May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 114 additions & 21 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
#endif

#include <algorithm>
#include <cerrno>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <tuple>
Expand Down Expand Up @@ -106,6 +108,8 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }

#define GGML_VK_MAX_NODES 8192

// Default value, in bytes (512 KiB), of the UMA non-cached direct read threshold.
// It is returned by ggml_vk_uma_non_cached_direct_read_threshold() when the
// GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD environment variable is unset, empty or invalid.
static constexpr size_t GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT = 512 * 1024;

#define VK_CHECK(err, msg) \
do { \
vk::Result err_ = (err); \
Expand Down Expand Up @@ -2804,8 +2808,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
// Prefer host-visible device-local memory on UMA to maximize direct host access.
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else if (device->disable_host_visible_vidmem) {
if (device->allow_sysmem_fallback) {
Expand Down Expand Up @@ -6741,6 +6746,44 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si
}
}

// Returns the size threshold (in bytes) that ggml_vk_use_uma_direct_read() compares a copy size
// against for host-visible UMA memory that is not host-cached.
//
// The value comes from the GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD environment variable,
// parsed as a base-10 unsigned integer. GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT is
// returned instead when the variable is:
//   - unset or empty (silently), or
//   - malformed, negative, out of range for unsigned long long, or larger than size_t can hold
//     (with a warning).
//
// The environment is only consulted on the first call; the result is stored in a function-local
// static and reused afterwards.
static size_t ggml_vk_uma_non_cached_direct_read_threshold() {
    static const size_t threshold = []() -> size_t {
        const char * threshold_env = getenv("GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD");
        if (threshold_env == nullptr || threshold_env[0] == '\0') {
            return GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT;
        }

        char * end = nullptr;
        errno = 0;
        const unsigned long long parsed = strtoull(threshold_env, &end, 10);

        // strtoull() accepts a leading '-' and returns the negated value wrapped into the unsigned
        // range without setting errno, so "-1" would silently become ULLONG_MAX. A valid base-10
        // unsigned number never contains '-', so any occurrence marks the input as invalid.
        const bool has_minus_sign = strchr(threshold_env, '-') != nullptr;

        if (errno != 0 || end == threshold_env || *end != '\0' || has_minus_sign) {
            GGML_LOG_WARN("ggml_vulkan: invalid GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD='%s', using default %zu\n",
                          threshold_env,
                          GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT);
            return GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT;
        }

        // Only reachable on platforms where size_t is narrower than unsigned long long.
        if (parsed > std::numeric_limits<size_t>::max()) {
            GGML_LOG_WARN("ggml_vulkan: GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD='%s' exceeds size_t max (%zu), using default %zu\n",
                          threshold_env,
                          std::numeric_limits<size_t>::max(),
                          GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT);
            return GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT;
        }

        return (size_t) parsed;
    }();

    return threshold;
}

// Decides whether a read of copy_size bytes from src should go directly through the mapped
// host pointer.
//
// Asserts that src is host-coherent. Returns true when the memory is host-cached; otherwise
// returns true only when copy_size is strictly greater than
// ggml_vk_uma_non_cached_direct_read_threshold().
static bool ggml_vk_use_uma_direct_read(vk_buffer & src, size_t copy_size) {
    GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);

    // Host-cached memory always qualifies; the threshold is not consulted in that case.
    if ((src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCached) != vk::MemoryPropertyFlags{}) {
        return true;
    }

    // Non-cached memory: only copies above the threshold are read directly.
    return copy_size > ggml_vk_uma_non_cached_direct_read_threshold();
}

static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
GGML_ASSERT(!ggml_is_contiguous(tensor));
Expand Down Expand Up @@ -6840,6 +6883,19 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont

static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");

if (dst->device->uma && (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
if (width == spitch) {
deferred_memcpy((uint8_t *) dst->ptr + offset, src, width * height, &subctx->in_memcpys);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is correct for the same reasons I commented in #20018. The async copies need to run on the queue to stay in order with other commands.

@winstonma winstonma Apr 29, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. I am not familiar with these. I asked Codex to write a test case to verify the async copies, and the code passes that test case. And here is the follow-up question that I asked:

Yes, the code is implemented to stay ordered with other backend work.

  1. In the UMA host-visible branch at if (dst->device->uma && (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)), the copy is not executed immediately. It is queued via deferred_memcpy into subctx in_memcpys.
  2. Those queued host writes are flushed only when the context is submitted, in ggml_vk_run_deferred_uploads and ggml_vk_submit_transfer_ctx.
  3. For compute-path submission, deferred uploads are run right before submit in ggml_vk_run_deferred_uploads(compute_ctx);. For transfer-path submission, same behavior is in ggml_vk_run_deferred_uploads(cpy_ctx);.
  4. The async tensor API routes into this path from ggml_backend_vk_set_tensor_async, so these copies participate in the same submission/sync chain as other backend commands.
  5. If transfer queue is enabled, cross-queue ordering is linked by timeline semaphore signal/wait in ctx->transfer_semaphore.value++;, and result->s->wait_semaphores.push_back(ctx->transfer_semaphore);.

So for the code specifically, ordering is preserved because writes are deferred and then flushed at queue-submit boundaries, not applied out-of-band.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to be familiar with it. Copy-pasting AI responses into maintainer questions is not allowed because we do not have time or patience to debate an AI that can make up wrong claims way faster than any human could debunk them.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Frankly, I'm not quite sure I follow the question, but I added some logging to see whether it answers the question. This is the debug log:

❯ ./build-vk-debug/bin/llama-cli -m ~/model/gemma-4-E4B-it-UD-Q4_K_XL.gguf -p "Hello" -n 16 2>&1 | grep VK_TIMELINE_HANDSHAKE

Loading model... |VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=1 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=1 last_waited=0 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=2 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=2 last_waited=1 source=ggml_vk_synchronize                                                                        \VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=3 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=3 last_waited=2 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=4 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=4 last_waited=3 source=ggml_vk_synchronize                                                                         


▄▄ ▄▄
██ ██
██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
                                    ██    ██
                                    ▀▀    ▀▀

build      : b8960-fe1eb0302
model      : gemma-4-E4B-it-UD-Q4_K_XL.gguf
modalities : text

available commands:
  /exit or Ctrl+C     stop or exit
  /regen              regenerate the last response
  /clear              clear the chat history
  /read <file>        add a text file
  /glob <pattern>     add text files using globbing pattern


> Hello

|VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=5 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=5 last_waited=4 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=6 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=6 last_waited=5 source=ggml_vk_synchronize                                                                        -VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=7 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=7 last_waited=6 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=8 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=8 last_waited=7 source=ggml_vk_synchronize                                                                        HelloVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=9 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=9 last_waited=8 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=10 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=10 last_waited=9 source=ggml_vk_synchronize
!VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=11 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=11 last_waited=10 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=12 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=12 last_waited=11 source=ggml_vk_synchronize
 HowVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=13 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=13 last_waited=12 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=14 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=14 last_waited=13 source=ggml_vk_synchronize
 canVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=15 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=15 last_waited=14 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=16 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=16 last_waited=15 source=ggml_vk_synchronize
 IVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=17 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=17 last_waited=16 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=18 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=18 last_waited=17 source=ggml_vk_synchronize
 helpVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=19 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=19 last_waited=18 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=20 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=20 last_waited=19 source=ggml_vk_synchronize
 youVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=21 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=21 last_waited=20 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=22 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=22 last_waited=21 source=ggml_vk_synchronize
 todayVK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=23 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=23 last_waited=22 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=24 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=24 last_waited=23 source=ggml_vk_synchronize
?VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=25 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=25 last_waited=24 source=ggml_vk_synchronize
VK_TIMELINE_HANDSHAKE SIGNAL TQ->CQ: signal_value=26 source=ggml_vk_submit_transfer_ctx
VK_TIMELINE_HANDSHAKE WAIT_SUBMIT CQ<-TQ: wait_value=26 last_waited=25 source=ggml_vk_synchronize


[ Prompt: 71.1 t/s | Generation: 18.2 t/s ]

According to the log, the Vulkan timeline semaphore has created a system where the Compute Queue is physically incapable of outrunning the data being moved by the Transfer Queue. Thus the ordering is maintained. Also, because the Compute Queue is hardware-blocked (bound by a Vulkan timeline semaphore wait operation) until the Transfer Queue signals completion, there is no risk of the GPU reading "stale" or partially written memory.

Disabling Transfer Queue on AMD UMA

I also submitted another PR to disable the transfer queue on AMD UMA. If the Transfer Queue is disabled, the code path would naturally fall back to a single-queue model where all operations are submitted to the Compute Queue. In this scenario, ordering is maintained by default due to the sequential nature of command submission within a single Vulkan queue.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regardless of the transfer queue or compute queue, ordering is maintained for commands you submit to the queue. That does not apply to deferred memcpys. in_memcpys run on queue submission. out_memcpys run (in specific cases) after a fence wait that makes sure all queue commands are done. This will not work with the backend async read/write functions because those assume that the commands run in the right order in the queue.

It may work in your tests because you get lucky and the order works out, but this is not guaranteed. This change is fundamentally unsafe.

@winstonma winstonma May 1, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the detailed explanation. I made a commit based on the previous comment.

The commit defers the execution of out_memcpys from ggml_vk_compute_forward to ggml_vk_synchronize, which ensures that the memcpy only occurs after the GPU fence has signaled completion. It also prevents dropped copies that might occur if a tensor's weak context reference was unset before the synchronization happened. Thus the ordering is enforced by the fence.

in_memcpys is consumed before GPU work submission. It is also cleared right after submission (in ggml_vk_submit_transfer_ctx) instead of being cleared later during synchronization, which prevents it from being re-executed. This ensures the transfer is complete before GPU work begins.

Conclusion

in_memcpys is executed before submit

In the UMA write path ggml_vk_buffer_write_2d_async, instead of going through a staging buffer + vkCmdCopyBuffer, the data is deferred into subctx->in_memcpys. These are then memcpy'd before ggml_vk_submit is called, at the following sites:

  • ggml_vk_compute_forward: loops over in_memcpys → memcpy → then submits the command buffer.
  • ggml_vk_synchronize: same pattern for any remaining in_memcpys on the compute context.
  • ggml_vk_submit_transfer_ctx: same for the transfer queue context.

out_memcpys executed after fence

In the UMA read path ggml_vk_buffer_read_2d_async, the read is deferred into subctx->out_memcpys. These are consumed only after waitForFences succeeds in ggml_vk_synchronize:

fence signals GPU done
  → loop ctx->gc.contexts
    → for each tensor_ctx->out_memcpys: memcpy(dst, src, n)

} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *) dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
}
}
return true;
}

// Check if src is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset = 0;
Expand Down Expand Up @@ -6946,6 +7002,27 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
GGML_ASSERT(height > 0);
GGML_ASSERT(src != nullptr);

if (src->device->uma && (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
const size_t copy_size = width * height;

if (ggml_vk_use_uma_direct_read(src, copy_size)) {
if (width == spitch && width == dpitch) {
deferred_memcpy(dst, (uint8_t *) src->ptr + offset, width * height, &subctx->out_memcpys);
} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *) dst + i * dpitch, (uint8_t *) src->ptr + offset + i * spitch, width, &subctx->out_memcpys);
}
}
return true;
}

// For small non-cached UMA reads, skip direct mapped reads and force the GPU copy path.
// When async staging is not available, signal the caller to fall back to a sync path.
if (!sync_staging) {
return false;
}
}

// TODO: staging_offset is not used

// Check if dst is pinned memory
Expand Down Expand Up @@ -6983,15 +7060,15 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
}

// Fall back to staging buffer
const size_t copy_size = dpitch * height;
ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
const size_t staging_copy_size = dpitch * height;
ggml_vk_ensure_sync_staging_buffer(src->device, staging_copy_size);

vk_buffer& staging_buffer = src->device->sync_staging;

ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);

deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
deferred_memcpy(dst, staging_buffer->ptr, staging_copy_size, &subctx->out_memcpys);
return true;
}

Expand All @@ -7006,26 +7083,27 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
// through PCIe is sufficient fast reading back data from PCIe is slower than going through
// the HW device to host copy path.
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
if (ggml_vk_use_uma_direct_read(src, size)) {
memcpy(dst, (uint8_t *) src->ptr + offset, size);
return;
}
}

memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);

vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
GGML_ASSERT(ret);
ggml_vk_ctx_end(subctx);
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
GGML_ASSERT(ret);
ggml_vk_ctx_end(subctx);

ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);

for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
}

Expand Down Expand Up @@ -13844,6 +13922,13 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);

if (!ret) {
if (ctx->device->uma && (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
GGML_ASSERT(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
ggml_vk_synchronize(ctx);
memcpy((uint8_t *) buf->ptr + dst_offset, data, size);
return;
}

ggml_vk_ensure_sync_staging_buffer(ctx, size);
ggml_vk_sync_buffers(nullptr, cpy_ctx);

Expand Down Expand Up @@ -13878,6 +13963,14 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_

// If that failed, copy synchronously through a staging buffer
if (!ret) {
if (ctx->device->uma && (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
if (ggml_vk_use_uma_direct_read(buf, size)) {
ggml_vk_synchronize(ctx);
memcpy(data, (uint8_t *) buf->ptr + src_offset, size);
return;
}
}

ggml_vk_ensure_sync_staging_buffer(ctx, size);
ggml_vk_sync_buffers(nullptr, compute_ctx);

Expand Down