diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index d4acee8b1df..d0495f6b375 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -29,7 +29,9 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #endif #include +#include #include +#include #include #include #include @@ -37,6 +39,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #include #include #include +#include #include #include #include @@ -106,6 +109,8 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define GGML_VK_MAX_NODES 8192 +static constexpr size_t GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT = 16 * 1024; + #define VK_CHECK(err, msg) \ do { \ vk::Result err_ = (err); \ @@ -186,6 +191,7 @@ typedef std::shared_ptr vk_device; typedef std::weak_ptr vk_device_ref; struct vk_buffer_struct; +static void ggml_vk_calibrate_uma_thresholds(vk_device& device); typedef std::shared_ptr vk_buffer; typedef std::weak_ptr vk_buffer_ref; @@ -615,6 +621,7 @@ struct vk_device_struct { uint32_t subgroup_size_log2; uint32_t shader_core_count; bool uma; + size_t uma_read_threshold; bool prefer_host_memory; bool float_controls_rte_fp16; bool subgroup_basic; @@ -2804,8 +2811,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal}); } else if (device->uma) { - // Fall back to host memory type - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, + // Prefer host-visible device-local memory on UMA to maximize direct host access. 
+ buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); } else if (device->disable_host_visible_vidmem) { if (device->allow_sysmem_fallback) { @@ -6099,6 +6107,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR"); vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor)); #endif + ggml_vk_calibrate_uma_thresholds(ctx->device); } static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { @@ -6638,12 +6647,61 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector w s.signal_semaphores = std::move(signal_semaphores); } +static void ggml_vk_record_host_barrier(vk_context& ctx, bool is_write) { + if (ctx->s == nullptr) { + return; + } + + const bool transfer_queue = ctx->p->q->transfer_only; + + if (is_write) { + if (ctx->in_memcpys.empty() && ctx->memsets.empty()) { + return; + } + ctx->s->buffer->buf.pipelineBarrier( + vk::PipelineStageFlagBits::eHost, + ctx->p->q->stage_flags, + {}, + { { + { vk::AccessFlagBits::eHostWrite }, + { transfer_queue ? vk::AccessFlagBits::eTransferRead : (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead) } + } }, + {}, + {} + ); + } else { + if (ctx->out_memcpys.empty()) { + return; + } + ctx->s->buffer->buf.pipelineBarrier( + ctx->p->q->stage_flags, + vk::PipelineStageFlagBits::eHost, + {}, + { { + { transfer_queue ? 
vk::AccessFlagBits::eTransferWrite : (vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite) }, + { vk::AccessFlagBits::eHostRead } + } }, + {}, + {} + ); + } +} + +static void ggml_vk_record_host_write_barrier(vk_context& ctx) { + ggml_vk_record_host_barrier(ctx, true); +} + +static void ggml_vk_record_host_read_barrier(vk_context& ctx) { + ggml_vk_record_host_barrier(ctx, false); +} + static void ggml_vk_ctx_end(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); if (ctx->s == nullptr) { return; } + ggml_vk_record_host_read_barrier(ctx); ctx->s->buffer->buf.end(); ctx->s = nullptr; } @@ -6686,11 +6744,19 @@ static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) { } vk_context cpy_ctx = ctx->transfer_ctx.lock(); - ggml_vk_ctx_end(cpy_ctx); for (auto& cpy : cpy_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } + for (auto& mset : cpy_ctx->memsets) { + memset(mset.dst, mset.val, mset.n); + } + ggml_vk_record_host_write_barrier(cpy_ctx); + ggml_vk_ctx_end(cpy_ctx); + + cpy_ctx->in_memcpys.clear(); + cpy_ctx->memsets.clear(); + cpy_ctx->out_memcpys.clear(); ctx->transfer_semaphore.value++; cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore); @@ -6721,26 +6787,226 @@ static void deferred_memset(void * dst, uint32_t val, size_t size, std::vectorsync_staging == nullptr || device->sync_staging->size < size) { +static void ggml_vk_ensure_sync_staging_buffer_internal(vk_device& device, vk_buffer* staging_ptr, size_t size) { + if (*staging_ptr == nullptr || (*staging_ptr)->size < size) { VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")"); - ggml_vk_destroy_buffer(device->sync_staging); - device->sync_staging = ggml_vk_create_buffer_check(device, size, + ggml_vk_destroy_buffer(*staging_ptr); + *staging_ptr = ggml_vk_create_buffer_check(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | 
vk::MemoryPropertyFlagBits::eHostCached,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
+static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
+    ggml_vk_ensure_sync_staging_buffer_internal(device, &device->sync_staging, size);
+}
+
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
-    if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
-        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
-        ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+    ggml_vk_ensure_sync_staging_buffer_internal(ctx->device, &ctx->sync_staging, size);
+}
+
+// Reads a byte-count threshold from the environment variable `env_var_name`.
+//
+// Returns std::nullopt when the variable is unset or empty. Also returns
+// std::nullopt, after logging a warning, when the value is not a plain
+// non-negative base-10 integer or does not fit in size_t. Otherwise returns
+// the parsed value.
+static std::optional<size_t> ggml_vk_parse_uma_threshold(const char* env_var_name) {
+    const char * threshold_env = getenv(env_var_name);
+    if (threshold_env == nullptr || threshold_env[0] == '\0') {
+        return std::nullopt;
+    }
+
+    // strtoull() accepts a leading '-' and returns the negated value converted
+    // to unsigned without setting errno, so "-1" would silently turn into
+    // ULLONG_MAX. Locate the first character strtoull() will actually look at
+    // (it skips the C-locale whitespace set ' ', \t, \n, \v, \f, \r) so that a
+    // minus sign can be rejected explicitly.
+    const char * first = threshold_env;
+    while (*first == ' ' || (*first >= '\t' && *first <= '\r')) {
+        ++first;
+    }
+    const bool negative = (*first == '-');
+
+    char * end = nullptr;
+    errno = 0;
+    const unsigned long long parsed = strtoull(threshold_env, &end, 10);
+    if (negative || errno != 0 || end == threshold_env || *end != '\0') {
+        GGML_LOG_WARN("ggml_vulkan: invalid %s='%s', falling back to benchmarked threshold\n",
+            env_var_name, threshold_env);
+        return std::nullopt;
+    }
+
+    // Only relevant on targets where size_t is narrower than unsigned long long.
+    if constexpr (sizeof(size_t) < sizeof(unsigned long long)) {
+        if (parsed > std::numeric_limits<size_t>::max()) {
+            GGML_LOG_WARN("ggml_vulkan: %s='%s' exceeds size_t max (%zu), falling back to benchmarked threshold\n",
+                env_var_name, threshold_env, std::numeric_limits<size_t>::max());
+            return std::nullopt;
+        }
+    }
+
+    return static_cast<size_t>(parsed);
+}
+
+static size_t ggml_vk_uma_non_cached_direct_read_threshold(vk_device& device) {
+    static const std::optional<size_t> cache =
ggml_vk_parse_uma_threshold("GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD");
+    return cache.value_or(device->uma_read_threshold);
+}
+
+
+// Finds the largest entry of `sizes` for which the CPU path is at least as fast
+// as the GPU path (threshold semantics: copy_size <= threshold selects the CPU
+// direct path).
+//
+//   sizes             - candidate copy sizes in bytes; must be sorted ascending,
+//                       because the search assumes that once the GPU path wins
+//                       at some size it also wins at every larger size.
+//   default_threshold - returned only when `sizes` is empty.
+//   cpu_op, gpu_op    - callables invoked as op(size) that perform one transfer
+//                       of `size` bytes; each is timed on the host.
+//
+// Returns 0 when the CPU path is slower even at the smallest candidate size.
+template <typename CPUOp, typename GPUOp>
+static size_t ggml_vk_benchmark_uma_threshold(
+    const std::vector<size_t>& sizes,
+    size_t default_threshold,
+    CPUOp cpu_op,
+    GPUOp gpu_op)
+{
+    if (sizes.empty()) {
+        return default_threshold;
+    }
+
+    // steady_clock is monotonic; high_resolution_clock may alias the wall clock.
+    using bench_clock = std::chrono::steady_clock;
+
+    constexpr int iterations = 20;
+    std::vector<double> cpu_times(iterations);
+    std::vector<double> gpu_times(iterations);
+
+    // Binary search over the half-open index range [low, high). The half-open
+    // form never computes `mid - 1`, so the unsigned indices cannot wrap around
+    // when the CPU path loses at index 0 (the closed-interval form would set
+    // high = SIZE_MAX there and then index far outside `sizes`).
+    size_t low       = 0;
+    size_t high      = sizes.size();
+    size_t threshold = 0;
+
+    while (low < high) {
+        const size_t mid  = low + (high - low) / 2;
+        const size_t size = sizes[mid];
+
+        const auto time_op = [size](auto& op) {
+            const auto start = bench_clock::now();
+            op(size);
+            return std::chrono::duration<double>(bench_clock::now() - start).count();
+        };
+
+        // Interleave CPU and GPU samples so both see similar system conditions.
+        for (int i = 0; i < iterations; ++i) {
+            cpu_times[i] = time_op(cpu_op);
+            gpu_times[i] = time_op(gpu_op);
+        }
+
+        // Compare medians to reduce the influence of outlier samples.
+        std::sort(cpu_times.begin(), cpu_times.end());
+        std::sort(gpu_times.begin(), gpu_times.end());
+        const double cpu_time = cpu_times[iterations / 2];
+        const double gpu_time = gpu_times[iterations / 2];
+
+        if (cpu_time <= gpu_time) {
+            // CPU still wins at this size: remember it and try larger sizes.
+            threshold = size;
+            low = mid + 1;
+        } else {
+            // GPU wins: only strictly smaller sizes remain candidates.
+            high = mid;
+        }
+    }
+
+    return threshold;
+}
+
+static void ggml_vk_run_uma_benchmarks(vk_device& device) {
+    const std::vector<size_t> benchmark_sizes = {
+        4 * 1024,
+        8 * 1024,
+        16 * 1024,
+        32 * 1024,
+        64 * 1024,
+        128 * 1024,
+        256 * 1024,
+        512 * 1024,
+        1024 * 1024,
+        2 * 1024 * 1024,
+        4 * 1024 * 1024
+    };
+
+    // Use a dedicated command pool for benchmarks to avoid interfering with the global transfer pool
+    vk_command_pool benchmark_pool;
+    benchmark_pool.init(device, &device->transfer_queue);
+ + // uma_direct: use the same allocation path as runtime tensor buffers so that + // calibration measures the same memory type (DeviceLocal|HostVisible|HostCoherent on UMA). + vk_buffer uma_direct = ggml_vk_create_buffer_device(device, benchmark_sizes.back()); + GGML_ASSERT(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible); + GGML_ASSERT(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + // staging: host-visible buffer used as the intermediate in the GPU staging path, + // mirroring ggml_vk_ensure_sync_staging_buffer which requests HostCached. + vk_buffer staging = ggml_vk_create_buffer(device, benchmark_sizes.back(), + {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + GGML_ASSERT(staging->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible); + GGML_ASSERT(staging->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + std::vector host_data(benchmark_sizes.back(), 0); + + // Warmup: exercise both the GPU copy path and the CPU memcpy path so that + // caches and driver state are in a representative steady state before timing. + // warmup_ctx is reused across iterations; ggml_vk_submit clears seqs after + // each submission, so ggml_vk_ctx_begin always starts with an empty context. + vk_context warmup_ctx = ggml_vk_create_temporary_context(benchmark_pool); + for (int i = 0; i < 5; ++i) { + // Warmup writes (host -> uma_direct). 
+ memcpy(staging->ptr, host_data.data(), benchmark_sizes.back()); + ggml_vk_ctx_begin(device, warmup_ctx); + GGML_ASSERT(warmup_ctx->seqs.size() == 1); // seqs must be cleared by prior submit + vk::BufferCopy copy{ 0, 0, benchmark_sizes.back() }; + warmup_ctx->s->buffer->buf.copyBuffer(staging->buffer, uma_direct->buffer, {copy}); + ggml_vk_ctx_end(warmup_ctx); + ggml_vk_submit(warmup_ctx, device->fence); + (void)device->device.waitForFences({device->fence}, true, UINT64_MAX); + device->device.resetFences({device->fence}); + memcpy(uma_direct->ptr, host_data.data(), benchmark_sizes.back()); + + // Warmup reads (uma_direct -> host). + memcpy(host_data.data(), uma_direct->ptr, benchmark_sizes.back()); + ggml_vk_ctx_begin(device, warmup_ctx); + GGML_ASSERT(warmup_ctx->seqs.size() == 1); + vk::BufferCopy copy_read{ 0, 0, benchmark_sizes.back() }; + warmup_ctx->s->buffer->buf.copyBuffer(uma_direct->buffer, staging->buffer, {copy_read}); + ggml_vk_ctx_end(warmup_ctx); + ggml_vk_submit(warmup_ctx, device->fence); + (void)device->device.waitForFences({device->fence}, true, UINT64_MAX); + device->device.resetFences({device->fence}); + memcpy(host_data.data(), staging->ptr, benchmark_sizes.back()); + } + + + // Read threshold: compare CPU direct read (uma_direct -> host) against the + // real GPU staging path (GPU copyBuffer uma_direct -> staging + memcpy staging -> host). + if (!(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCached)) { + // Uncached MMIO reads: direct path only wins below the GPU staging fixed overhead crossover. + // On all tested hardware this is at or below 4KB. Hardcode rather than benchmark + // since memcpy destination cache warmth makes calibration unreliable for uncached sources. 
+ device->uma_read_threshold = GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT; + } else { + device->uma_read_threshold = ggml_vk_benchmark_uma_threshold( + benchmark_sizes, + GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT, + [&](size_t size) { + // CPU direct path: read straight from the mapped UMA buffer. + memcpy(host_data.data(), uma_direct->ptr, size); + }, + [&](size_t size) { + // GPU staging path: GPU copies uma_direct -> staging, then memcpy out. + ggml_vk_ctx_begin(device, warmup_ctx); + vk::BufferCopy copy{ 0, 0, size }; + warmup_ctx->s->buffer->buf.copyBuffer(uma_direct->buffer, staging->buffer, {copy}); + ggml_vk_ctx_end(warmup_ctx); + ggml_vk_submit(warmup_ctx, device->fence); + (void)device->device.waitForFences({device->fence}, true, UINT64_MAX); + device->device.resetFences({device->fence}); + memcpy(host_data.data(), staging->ptr, size); + } + ); } + + ggml_vk_destroy_buffer(staging); + ggml_vk_destroy_buffer(uma_direct); + ggml_vk_command_pool_cleanup(device, benchmark_pool); + benchmark_pool.destroy(device->device); +} + +static void ggml_vk_calibrate_uma_thresholds(vk_device& device) { + if (!device->uma) return; + + ggml_vk_run_uma_benchmarks(device); + VK_LOG_DEBUG("ggml_vulkan: calibrated UMA read threshold: " << device->uma_read_threshold); +} + +static bool ggml_vk_use_uma_direct_read(vk_buffer & src, size_t copy_size) { + GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + const bool host_cached = (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCached) != vk::MemoryPropertyFlags{}; + return host_cached || copy_size <= ggml_vk_uma_non_cached_direct_read_threshold(src->device); } + static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); GGML_ASSERT(!ggml_is_contiguous(tensor)); @@ -6838,8 
+7104,29 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } } +static bool ggml_vk_should_use_uma_direct_transfer(vk_buffer& buf, size_t size, bool is_write) { + if (!(buf->device->uma && (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible))) return false; + return is_write ? true : ggml_vk_use_uma_direct_read(buf, size); +} + +static void ggml_vk_deferred_memcpy_2d(void * dst, const void * src, size_t width, size_t height, size_t spitch, size_t dpitch, std::vector * list) { + if (width == spitch && width == dpitch) { + deferred_memcpy(dst, src, width * height, list); + } else { + for (size_t i = 0; i < height; i++) { + deferred_memcpy((uint8_t *)dst + i * dpitch, (const uint8_t *)src + i * spitch, width, list); + } + } +} + static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); + + if (ggml_vk_should_use_uma_direct_transfer(dst, width * height, true)) { + ggml_vk_deferred_memcpy_2d((uint8_t *) dst->ptr + offset, src, width, height, spitch, width, &subctx->in_memcpys); + return true; + } + // Check if src is pinned memory vk_buffer buf = nullptr; size_t buf_offset = 0; @@ -6928,6 +7215,8 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memset(mset.dst, mset.val, mset.n); } + ggml_vk_record_host_write_barrier(subctx); + ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); @@ -6946,6 +7235,12 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size GGML_ASSERT(height > 0); GGML_ASSERT(src != nullptr); + const size_t copy_size = width * height; + if 
(ggml_vk_should_use_uma_direct_transfer(src, copy_size, false)) { + ggml_vk_deferred_memcpy_2d(dst, (uint8_t *) src->ptr + offset, width, height, spitch, dpitch, &subctx->out_memcpys); + return true; + } + // TODO: staging_offset is not used // Check if dst is pinned memory @@ -6983,15 +7278,15 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size } // Fall back to staging buffer - const size_t copy_size = dpitch * height; - ggml_vk_ensure_sync_staging_buffer(src->device, copy_size); + const size_t staging_copy_size = dpitch * height; + ggml_vk_ensure_sync_staging_buffer(src->device, staging_copy_size); vk_buffer& staging_buffer = src->device->sync_staging; ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices); - deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys); + deferred_memcpy(dst, staging_buffer->ptr, staging_copy_size, &subctx->out_memcpys); return true; } @@ -7002,30 +7297,28 @@ static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")"); - // If the device is not an UMA device the memory is host-accessible through rebar. While writing - // through PCIe is sufficient fast reading back data from PCIe is slower than going through - // the HW device to host copy path. 
- if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { - GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); - + if (src->device->uma && + (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) && + ggml_vk_use_uma_direct_read(src, size)) { memcpy(dst, (uint8_t *) src->ptr + offset, size); - } else { - std::lock_guard guard(src->device->mutex); + return; + } - vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); - ggml_vk_ctx_begin(src->device, subctx); - bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true); - GGML_ASSERT(ret); - ggml_vk_ctx_end(subctx); + std::lock_guard guard(src->device->mutex); - ggml_vk_submit(subctx, src->device->fence); - VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); - src->device->device.resetFences({ src->device->fence }); - ggml_vk_queue_command_pools_cleanup(src->device); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); + ggml_vk_ctx_begin(src->device, subctx); + bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true); + GGML_ASSERT(ret); + ggml_vk_ctx_end(subctx); - for (auto& cpy : subctx->out_memcpys) { - memcpy(cpy.dst, cpy.src, cpy.n); - } + ggml_vk_submit(subctx, src->device->fence); + VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); + src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); + + for (auto& cpy : subctx->out_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); } } @@ -7067,8 +7360,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { 
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")"); - if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && - dst->device->uma) { + if (ggml_vk_should_use_uma_direct_transfer(dst, size, true)) { deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets); return; } @@ -7080,8 +7372,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && - dst->device->uma) { + if (ggml_vk_should_use_uma_direct_transfer(dst, size, true)) { memset((uint8_t*)dst->ptr + offset, c, size); return; } @@ -13425,6 +13716,8 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * memset(mset.dst, mset.val, mset.n); } + ggml_vk_record_host_write_barrier(subctx); + if (almost_ready && !ctx->almost_ready_fence_pending) { ggml_vk_submit(subctx, ctx->almost_ready_fence); ctx->almost_ready_fence_pending = true; @@ -13844,6 +14137,13 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size); if (!ret) { + if (ggml_vk_should_use_uma_direct_transfer(buf, size, true)) { + GGML_ASSERT(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + ggml_vk_synchronize(ctx); + memcpy((uint8_t *) buf->ptr + dst_offset, data, size); + return; + } + ggml_vk_ensure_sync_staging_buffer(ctx, size); ggml_vk_sync_buffers(nullptr, cpy_ctx); @@ -13878,6 +14178,12 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ // If that failed, copy synchronously through a staging buffer if (!ret) { + if (ggml_vk_should_use_uma_direct_transfer(buf, size, false)) { + ggml_vk_synchronize(ctx); + memcpy(data, 
(uint8_t *) buf->ptr + src_offset, size); + return; + } + ggml_vk_ensure_sync_staging_buffer(ctx, size); ggml_vk_sync_buffers(nullptr, compute_ctx); @@ -13971,11 +14277,14 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { cmd_buf = compute_ctx->s->buffer; } - ggml_vk_ctx_end(compute_ctx); - for (auto& cpy : compute_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } + for (auto& mset : compute_ctx->memsets) { + memset(mset.dst, mset.val, mset.n); + } + ggml_vk_record_host_write_barrier(compute_ctx); + ggml_vk_ctx_end(compute_ctx); ggml_vk_submit(compute_ctx, {}); ctx->submit_pending = true;