Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6647,6 +6647,21 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
return true;
}

// UMA zero-copy: destination is directly mapped, skip staging buffer

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is correct. When called from set_tensor_async, I think the operation needs to be ordered against other work submitted to the same backend. When called from set_tensor, we already have a CPU copy path, so this would be redundant.

if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
dst->device->uma) {
if (width == spitch) {
deferred_memcpy((uint8_t *)dst->ptr + offset, src, width * height, &subctx->in_memcpys);
} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *)dst->ptr + offset + i * width,
(const uint8_t *)src + i * spitch, width, &subctx->in_memcpys);
}
}
return true;
}

VK_LOG_DEBUG("STAGING");

if (!sync_staging) {
Expand Down Expand Up @@ -6756,6 +6771,21 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size

return true;
}

// UMA zero-copy: source is directly mapped, skip staging buffer
if (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
src->device->uma) {
if (width == spitch && width == dpitch) {
deferred_memcpy(dst, (const uint8_t *)src->ptr + offset, width * height, &subctx->out_memcpys);
} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *)dst + i * dpitch,
(const uint8_t *)src->ptr + offset + i * spitch, width, &subctx->out_memcpys);
}
}
return true;
}

VK_LOG_DEBUG("STAGING");

if (!sync_staging) {
Expand Down Expand Up @@ -14685,9 +14715,28 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev

ggml_vk_ctx_end(compute_ctx);

// Drain deferred H2D copies before submit (UMA zero-copy queues them here).
for (auto& cpy : compute_ctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
compute_ctx->in_memcpys.clear();

// Preserve D2H copies across context reset (drained by synchronize after fence).
auto preserved_out_memcpys = std::move(compute_ctx->out_memcpys);
const size_t preserved_count = preserved_out_memcpys.size();

ggml_vk_submit(compute_ctx, {vkev->fence});
ctx->submit_pending = true;
ctx->compute_ctx.reset();

// Restore D2H copies into fresh context for synchronize to drain.
if (!preserved_out_memcpys.empty()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->out_memcpys = std::move(preserved_out_memcpys);
GGML_ASSERT(compute_ctx->out_memcpys.size() == preserved_count);
}
}

static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
endif()
llama_build_and_test(test-gguf.cpp)
llama_build_and_test(test-backend-ops.cpp)
llama_build_and_test(test-vulkan-uma.cpp)

llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model")
Expand Down
Loading