diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d4acee8b1df..d0495f6b375 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -29,7 +29,9 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #endif
 
 #include <algorithm>
+#include <cerrno>
 #include <cmath>
+#include <cstdlib>
 #include <iomanip>
 #include <iostream>
 #include <tuple>
@@ -37,6 +39,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include <deque>
 #include <sstream>
 #include <utility>
+#include <optional>
 #include <memory>
 #include <limits>
 #include <map>
@@ -106,6 +109,8 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
 
 #define GGML_VK_MAX_NODES 8192
 
+static constexpr size_t GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT = 16 * 1024; // bytes (16 KiB): default size cutoff for direct host reads from non-host-cached UMA buffers
+
 #define VK_CHECK(err, msg)                                          \
     do {                                                            \
         vk::Result err_ = (err);                                    \
@@ -186,6 +191,7 @@ typedef std::shared_ptr<vk_device_struct> vk_device;
 typedef std::weak_ptr<vk_device_struct> vk_device_ref;
 
 struct vk_buffer_struct;
+static void ggml_vk_calibrate_uma_thresholds(vk_device& device);
 typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
 typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;
 
@@ -615,6 +621,7 @@ struct vk_device_struct {
     uint32_t subgroup_size_log2;
     uint32_t shader_core_count;
     bool uma;
+    size_t uma_read_threshold;
     bool prefer_host_memory;
     bool float_controls_rte_fp16;
     bool subgroup_basic;
@@ -2804,8 +2811,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
             buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
                                                        vk::MemoryPropertyFlagBits::eDeviceLocal});
         } else if (device->uma) {
-            // Fall back to host memory type
-            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
+            // Prefer host-visible device-local memory on UMA to maximize direct host access.
+            buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
+                                                       vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
         } else if (device->disable_host_visible_vidmem) {
             if (device->allow_sysmem_fallback) {
@@ -6099,6 +6107,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     const char* output_tensor = getenv("GGML_VULKAN_OUTPUT_TENSOR");
     vk_output_tensor = (output_tensor == NULL ? 0 : atoi(output_tensor));
 #endif
+    ggml_vk_calibrate_uma_thresholds(ctx->device);
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
@@ -6638,12 +6647,61 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
     s.signal_semaphores = std::move(signal_semaphores);
 }
 
+static void ggml_vk_record_host_barrier(vk_context& ctx, bool is_write) {
+    // Records a global memory barrier between the host and this context's queue, at the current
+    // position of the open command buffer. is_write selects the direction:
+    //   true  -> queued host writes (in_memcpys / memsets) before device reads
+    //   false -> device writes before queued host reads (out_memcpys)
+    if (ctx->s == nullptr) {
+        return; // no command buffer is open on this context
+    }
+
+    const bool xfer_only = ctx->p->q->transfer_only;
+
+    // Nothing to order when no host-side copy of the requested direction is queued.
+    const bool has_pending_host_io = is_write ? !(ctx->in_memcpys.empty() && ctx->memsets.empty())
+                                              : !ctx->out_memcpys.empty();
+    if (!has_pending_host_io) {
+        return;
+    }
+
+    // Device-side access mask; shader access is only included when the queue is not transfer-only.
+    vk::AccessFlags device_access;
+    if (is_write) {
+        device_access = vk::AccessFlagBits::eTransferRead;
+        if (!xfer_only) {
+            device_access |= vk::AccessFlagBits::eShaderRead;
+        }
+    } else {
+        device_access = vk::AccessFlagBits::eTransferWrite;
+        if (!xfer_only) {
+            device_access |= vk::AccessFlagBits::eShaderWrite;
+        }
+    }
+
+    const vk::PipelineStageFlags host_stage  = vk::PipelineStageFlagBits::eHost;
+    const vk::PipelineStageFlags queue_stage = ctx->p->q->stage_flags;
+    const vk::MemoryBarrier barrier = is_write ? vk::MemoryBarrier(vk::AccessFlagBits::eHostWrite, device_access)
+                                               : vk::MemoryBarrier(device_access, vk::AccessFlagBits::eHostRead);
+
+    ctx->s->buffer->buf.pipelineBarrier(is_write ? host_stage : queue_stage, is_write ? queue_stage : host_stage, {}, barrier, {}, {});
+}
+
+static void ggml_vk_record_host_write_barrier(vk_context& ctx) {
+    ggml_vk_record_host_barrier(ctx, true); // is_write = true: orders queued host writes before device reads
+}
+
+static void ggml_vk_record_host_read_barrier(vk_context& ctx) {
+    ggml_vk_record_host_barrier(ctx, false); // is_write = false: orders device writes before queued host reads
+}
+
 static void ggml_vk_ctx_end(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
 
+    ggml_vk_record_host_read_barrier(ctx); // records nothing unless out_memcpys is non-empty; must precede end()
     ctx->s->buffer->buf.end();
     ctx->s = nullptr;
 }
@@ -6686,11 +6744,19 @@ static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) {
     }
 
     vk_context cpy_ctx = ctx->transfer_ctx.lock();
-    ggml_vk_ctx_end(cpy_ctx);
 
     for (auto& cpy : cpy_ctx->in_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
+    for (auto& mset : cpy_ctx->memsets) {
+        memset(mset.dst, mset.val, mset.n);
+    }
+    ggml_vk_record_host_write_barrier(cpy_ctx);
+    ggml_vk_ctx_end(cpy_ctx);
+
+    cpy_ctx->in_memcpys.clear();
+    cpy_ctx->memsets.clear();
+    cpy_ctx->out_memcpys.clear();
 
     ctx->transfer_semaphore.value++;
     cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore);
@@ -6721,26 +6787,226 @@ static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<v
     }
 }
 
-static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
-    if (device->sync_staging == nullptr || device->sync_staging->size < size) {
+static void ggml_vk_ensure_sync_staging_buffer_internal(vk_device& device, vk_buffer* staging_ptr, size_t size) { // staging_ptr: slot that owns the staging buffer; updated in place
+    if (*staging_ptr == nullptr || (*staging_ptr)->size < size) { // (re)allocate only when missing or too small
         VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
-        ggml_vk_destroy_buffer(device->sync_staging);
-        device->sync_staging = ggml_vk_create_buffer_check(device, size,
+        ggml_vk_destroy_buffer(*staging_ptr); // release the old buffer before storing its replacement
+        *staging_ptr = ggml_vk_create_buffer_check(device, size,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
+static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
+    ggml_vk_ensure_sync_staging_buffer_internal(device, &device->sync_staging, size); // grows the staging buffer stored on the device
+}
+
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
-    if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
-        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
-        ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+    ggml_vk_ensure_sync_staging_buffer_internal(ctx->device, &ctx->sync_staging, size); // grows the staging buffer stored on the backend context
+}
+
+// Parses a base-10 byte count from environment variable `env_var_name`. Returns std::nullopt if it is unset/empty, or (with a warning) malformed or out of range.
+static std::optional<size_t> ggml_vk_parse_uma_threshold(const char* env_var_name) {
+    const char * threshold_env = getenv(env_var_name);
+    if (threshold_env == nullptr || threshold_env[0] == '\0') {
+        return std::nullopt;
+    }
+    // strtoull() skips leading whitespace and silently negates "-N" (so "-1" would parse as ULLONG_MAX); insist on a leading digit.
+    const bool leading_digit = threshold_env[0] >= '0' && threshold_env[0] <= '9';
+    char * end = nullptr;
+    errno = 0;
+    const unsigned long long parsed = strtoull(threshold_env, &end, 10);
+    if (!leading_digit || errno != 0 || end == threshold_env || *end != '\0') {
+        GGML_LOG_WARN("ggml_vulkan: invalid %s='%s', falling back to benchmarked threshold\n",
+            env_var_name, threshold_env);
+        return std::nullopt;
+    }
+    if constexpr (sizeof(size_t) < sizeof(unsigned long long)) {
+        if (parsed > std::numeric_limits<size_t>::max()) {
+            GGML_LOG_WARN("ggml_vulkan: %s='%s' exceeds size_t max (%zu), falling back to benchmarked threshold\n",
+                env_var_name, threshold_env, std::numeric_limits<size_t>::max());
+            return std::nullopt;
+        }
+    }
+    return (size_t) parsed;
+}
+
+static size_t ggml_vk_uma_non_cached_direct_read_threshold(vk_device& device) {
+    static const std::optional<size_t> env_override = ggml_vk_parse_uma_threshold("GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD"); // parsed once, on the first call
+    const size_t calibrated = device->uma_read_threshold;
+    return env_override.has_value() ? *env_override : calibrated; // a valid environment override wins over the per-device value
+}
+
+// Returns the largest entry of ascending `sizes` whose median cpu_op(size) time (20 runs) is <= the median
+// gpu_op(size) time, or 0 when cpu_op loses at every probed size. Empty `sizes` yields `default_threshold`.
+template<typename CPUOp, typename GPUOp>
+static size_t ggml_vk_benchmark_uma_threshold(
+    const std::vector<size_t>& sizes,
+    size_t default_threshold,
+    CPUOp cpu_op,
+    GPUOp gpu_op)
+{
+    if (sizes.empty()) {
+        return default_threshold;
+    }
+
+    // Threshold semantics: copy_size <= threshold -> use CPU direct path.
+    // Binary search for the largest size where CPU is at least as fast as GPU (assumes a single crossover).
+    size_t low = 0;
+    size_t high = sizes.size() - 1;
+    size_t threshold = 0;
+
+    while (low <= high) {
+        size_t mid = low + (high - low) / 2;
+        size_t size = sizes[mid];
+        const int iterations = 20;
+        std::vector<double> cpu_times(iterations);
+        std::vector<double> gpu_times(iterations);
+
+        for (int i = 0; i < iterations; ++i) {
+            auto s_cpu = std::chrono::high_resolution_clock::now();
+            cpu_op(size);
+            auto e_cpu = std::chrono::high_resolution_clock::now();
+            cpu_times[i] = std::chrono::duration<double, std::micro>(e_cpu - s_cpu).count();
+            auto s_gpu = std::chrono::high_resolution_clock::now();
+            gpu_op(size);
+            auto e_gpu = std::chrono::high_resolution_clock::now();
+            gpu_times[i] = std::chrono::duration<double, std::micro>(e_gpu - s_gpu).count();
+        }
+        // Compare medians of the sorted samples so that isolated outliers do not decide the result.
+        std::sort(cpu_times.begin(), cpu_times.end());
+        std::sort(gpu_times.begin(), gpu_times.end());
+        double cpu_time = cpu_times[iterations / 2];
+        double gpu_time = gpu_times[iterations / 2];
+        if (cpu_time <= gpu_time) {
+            threshold = size;
+            low = mid + 1;
+        } else {
+            if (mid == 0) { break; } // CPU loses even at the smallest size; `mid - 1` would wrap around (size_t)
+            high = mid - 1;
+        }
+    }
+
+    return threshold;
+}
+
+// Sets device->uma_read_threshold: the fixed default when the test buffer is not host-cached, otherwise a value benchmarked below.
+static void ggml_vk_run_uma_benchmarks(vk_device& device) {
+    const std::vector<size_t> benchmark_sizes = {
+        4 * 1024,
+        8 * 1024,
+        16 * 1024,
+        32 * 1024,
+        64 * 1024,
+        128 * 1024,
+        256 * 1024,
+        512 * 1024,
+        1024 * 1024,
+        2 * 1024 * 1024,
+        4 * 1024 * 1024
+    };
+
+    // Use a dedicated command pool for benchmarks to avoid interfering with the global transfer pool
+    vk_command_pool benchmark_pool;
+    benchmark_pool.init(device, &device->transfer_queue);
+
+    // uma_direct: use the same allocation path as runtime tensor buffers so that
+    // calibration measures the same memory type (DeviceLocal|HostVisible|HostCoherent on UMA).
+    vk_buffer uma_direct = ggml_vk_create_buffer_device(device, benchmark_sizes.back());
+    GGML_ASSERT(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible);
+    GGML_ASSERT(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // staging: host-visible buffer used as the intermediate in the GPU staging path,
+    // mirroring ggml_vk_ensure_sync_staging_buffer which requests HostCached.
+    vk_buffer staging = ggml_vk_create_buffer(device, benchmark_sizes.back(),
+        {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
+    GGML_ASSERT(staging->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible);
+    GGML_ASSERT(staging->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    std::vector<uint8_t> host_data(benchmark_sizes.back(), 0); // host-side scratch buffer, sized for the largest benchmark copy
+
+    // Warmup: exercise both the GPU copy path and the CPU memcpy path so that
+    // caches and driver state are in a representative steady state before timing.
+    // warmup_ctx is reused across iterations; ggml_vk_submit clears seqs after
+    // each submission, so ggml_vk_ctx_begin always starts with an empty context.
+    vk_context warmup_ctx = ggml_vk_create_temporary_context(benchmark_pool);
+    for (int i = 0; i < 5; ++i) {
+        // Warmup writes (host -> uma_direct).
+        memcpy(staging->ptr, host_data.data(), benchmark_sizes.back());
+        ggml_vk_ctx_begin(device, warmup_ctx);
+        GGML_ASSERT(warmup_ctx->seqs.size() == 1); // seqs must be cleared by prior submit
+        vk::BufferCopy copy{ 0, 0, benchmark_sizes.back() };
+        warmup_ctx->s->buffer->buf.copyBuffer(staging->buffer, uma_direct->buffer, {copy});
+        ggml_vk_ctx_end(warmup_ctx);
+        ggml_vk_submit(warmup_ctx, device->fence);
+        (void)device->device.waitForFences({device->fence}, true, UINT64_MAX); // wait result is explicitly discarded here
+        device->device.resetFences({device->fence});
+        memcpy(uma_direct->ptr, host_data.data(), benchmark_sizes.back());
+
+        // Warmup reads (uma_direct -> host).
+        memcpy(host_data.data(), uma_direct->ptr, benchmark_sizes.back());
+        ggml_vk_ctx_begin(device, warmup_ctx);
+        GGML_ASSERT(warmup_ctx->seqs.size() == 1);
+        vk::BufferCopy copy_read{ 0, 0, benchmark_sizes.back() };
+        warmup_ctx->s->buffer->buf.copyBuffer(uma_direct->buffer, staging->buffer, {copy_read});
+        ggml_vk_ctx_end(warmup_ctx);
+        ggml_vk_submit(warmup_ctx, device->fence);
+        (void)device->device.waitForFences({device->fence}, true, UINT64_MAX);
+        device->device.resetFences({device->fence});
+        memcpy(host_data.data(), staging->ptr, benchmark_sizes.back());
+    }
+
+    // Read threshold: compare CPU direct read (uma_direct -> host) against the
+    // real GPU staging path (GPU copyBuffer uma_direct -> staging + memcpy staging -> host).
+    if (!(uma_direct->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCached)) {
+        // Uncached reads: the direct path only wins below the GPU staging fixed-overhead crossover, reportedly
+        // at or below 4KB on tested hardware. NOTE(review): the default applied here is 16 KiB - confirm intent.
+        // Hardcoded rather than benchmarked since memcpy destination cache warmth makes calibration unreliable.
+        device->uma_read_threshold = GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT;
+    } else { // NOTE(review): ggml_vk_use_uma_direct_read() returns true for host-cached buffers before consulting the threshold - confirm this result is used
+        device->uma_read_threshold = ggml_vk_benchmark_uma_threshold(
+            benchmark_sizes,
+            GGML_VK_UMA_NON_CACHED_DIRECT_READ_THRESHOLD_DEFAULT,
+            [&](size_t size) {
+                // CPU direct path: read straight from the mapped UMA buffer.
+                memcpy(host_data.data(), uma_direct->ptr, size);
+            },
+            [&](size_t size) {
+                // GPU staging path: GPU copies uma_direct -> staging, then memcpy out.
+                ggml_vk_ctx_begin(device, warmup_ctx);
+                vk::BufferCopy copy{ 0, 0, size };
+                warmup_ctx->s->buffer->buf.copyBuffer(uma_direct->buffer, staging->buffer, {copy});
+                ggml_vk_ctx_end(warmup_ctx);
+                ggml_vk_submit(warmup_ctx, device->fence);
+                (void)device->device.waitForFences({device->fence}, true, UINT64_MAX);
+                device->device.resetFences({device->fence});
+                memcpy(host_data.data(), staging->ptr, size);
+            }
+        );
     }
+    // Release the buffers and the command pool that were created only for this benchmark.
+    ggml_vk_destroy_buffer(staging);
+    ggml_vk_destroy_buffer(uma_direct);
+    ggml_vk_command_pool_cleanup(device, benchmark_pool);
+    benchmark_pool.destroy(device->device);
+}
+
+static void ggml_vk_calibrate_uma_thresholds(vk_device& device) {
+    if (device->uma) { // nothing is calibrated for non-UMA devices
+        ggml_vk_run_uma_benchmarks(device);
+        VK_LOG_DEBUG("ggml_vulkan: calibrated UMA read threshold: " << device->uma_read_threshold);
+    }
+}
+
+static bool ggml_vk_use_uma_direct_read(vk_buffer & src, size_t copy_size) {
+    GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+    // Host-cached memory is always read directly; otherwise only copies up to the size threshold are.
+    if (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCached) { return true; }
+    return copy_size <= ggml_vk_uma_non_cached_direct_read_threshold(src->device);
 }
 
+
 static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
     GGML_ASSERT(!ggml_is_contiguous(tensor));
@@ -6838,8 +7104,29 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     }
 }
 
+static bool ggml_vk_should_use_uma_direct_transfer(vk_buffer& buf, size_t size, bool is_write) {
+    const bool host_mapped_uma = buf->device->uma && (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible);
+    return host_mapped_uma && (is_write || ggml_vk_use_uma_direct_read(buf, size)); // writes always go direct; reads are further gated by the read heuristic
+}
+
+static void ggml_vk_deferred_memcpy_2d(void * dst, const void * src, size_t width, size_t height, size_t spitch, size_t dpitch, std::vector<vk_staging_memcpy> * list) {
+    if (width == spitch && width == dpitch) { // rows are tightly packed on both sides: one copy covers everything
+        deferred_memcpy(dst, src, width * height, list);
+        return;
+    }
+    for (size_t row = 0; row < height; ++row) { // otherwise one copy per row, stepping each side by its own pitch
+        deferred_memcpy((uint8_t *)dst + row * dpitch, (const uint8_t *)src + row * spitch, width, list);
+    }
+}
+
 static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
+
+    if (ggml_vk_should_use_uma_direct_transfer(dst, width * height, true)) {
+        ggml_vk_deferred_memcpy_2d((uint8_t *) dst->ptr + offset, src, width, height, spitch, width, &subctx->in_memcpys);
+        return true;
+    }
+
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
     size_t buf_offset = 0;
@@ -6928,6 +7215,8 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
             memset(mset.dst, mset.val, mset.n);
         }
 
+        ggml_vk_record_host_write_barrier(subctx);
+
         ggml_vk_submit(subctx, dst->device->fence);
         VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         dst->device->device.resetFences({ dst->device->fence });
@@ -6946,6 +7235,12 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
     GGML_ASSERT(height > 0);
     GGML_ASSERT(src != nullptr);
 
+    const size_t copy_size = width * height;
+    if (ggml_vk_should_use_uma_direct_transfer(src, copy_size, false)) {
+        ggml_vk_deferred_memcpy_2d(dst, (uint8_t *) src->ptr + offset, width, height, spitch, dpitch, &subctx->out_memcpys);
+        return true;
+    }
+
     // TODO: staging_offset is not used
 
     // Check if dst is pinned memory
@@ -6983,15 +7278,15 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
     }
 
     // Fall back to staging buffer
-    const size_t copy_size = dpitch * height;
-    ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
+    const size_t staging_copy_size = dpitch * height;
+    ggml_vk_ensure_sync_staging_buffer(src->device, staging_copy_size);
 
     vk_buffer& staging_buffer = src->device->sync_staging;
 
     ggml_vk_sync_buffers(nullptr, subctx);
     subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);
 
-    deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
+    deferred_memcpy(dst, staging_buffer->ptr, staging_copy_size, &subctx->out_memcpys);
     return true;
 }
 
@@ -7002,30 +7297,28 @@ static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t
 static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
 
-    // If the device is not an UMA device the memory is host-accessible through rebar. While writing
-    // through PCIe is sufficient fast reading back data from PCIe is slower than going through
-    // the HW device to host copy path.
-    if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
-        GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
-
+    if (src->device->uma && // direct CPU read: only on UMA devices, for host-visible buffers the read heuristic accepts
+        (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) &&
+        ggml_vk_use_uma_direct_read(src, size)) {
         memcpy(dst, (uint8_t *) src->ptr + offset, size);
-    } else {
-        std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
+        return; // done: no command buffer or fence is needed on this path
+    }
 
-        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
-        ggml_vk_ctx_begin(src->device, subctx);
-        bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
-        GGML_ASSERT(ret);
-        ggml_vk_ctx_end(subctx);
+    std::lock_guard<std::recursive_mutex> guard(src->device->mutex); // held while the device's transfer pool and fence are used below
 
-        ggml_vk_submit(subctx, src->device->fence);
-        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
-        src->device->device.resetFences({ src->device->fence });
-        ggml_vk_queue_command_pools_cleanup(src->device);
+    vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
+    ggml_vk_ctx_begin(src->device, subctx);
+    bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
+    GGML_ASSERT(ret);
+    ggml_vk_ctx_end(subctx);
 
-        for (auto& cpy : subctx->out_memcpys) {
-            memcpy(cpy.dst, cpy.src, cpy.n);
-        }
+    ggml_vk_submit(subctx, src->device->fence);
+    VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
+    src->device->device.resetFences({ src->device->fence });
+    ggml_vk_queue_command_pools_cleanup(src->device);
+    // Run the host copies that the read queued in out_memcpys, now that the fence wait has returned.
+    for (auto& cpy : subctx->out_memcpys) {
+        memcpy(cpy.dst, cpy.src, cpy.n);
     }
 }
 
@@ -7067,8 +7360,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
 
-    if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
-        dst->device->uma) {
+    if (ggml_vk_should_use_uma_direct_transfer(dst, size, true)) {
         deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
         return;
     }
@@ -7080,8 +7372,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
 static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
 
-    if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
-        dst->device->uma) {
+    if (ggml_vk_should_use_uma_direct_transfer(dst, size, true)) {
         memset((uint8_t*)dst->ptr + offset, c, size);
         return;
     }
@@ -13425,6 +13716,8 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
             memset(mset.dst, mset.val, mset.n);
         }
 
+        ggml_vk_record_host_write_barrier(subctx);
+
         if (almost_ready && !ctx->almost_ready_fence_pending) {
             ggml_vk_submit(subctx, ctx->almost_ready_fence);
             ctx->almost_ready_fence_pending = true;
@@ -13844,6 +14137,13 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
     bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);
 
     if (!ret) {
+        if (ggml_vk_should_use_uma_direct_transfer(buf, size, true)) {
+            GGML_ASSERT(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
+            ggml_vk_synchronize(ctx);
+            memcpy((uint8_t *) buf->ptr + dst_offset, data, size);
+            return;
+        }
+
         ggml_vk_ensure_sync_staging_buffer(ctx, size);
         ggml_vk_sync_buffers(nullptr, cpy_ctx);
 
@@ -13878,6 +14178,12 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
 
     // If that failed, copy synchronously through a staging buffer
     if (!ret) {
+        if (ggml_vk_should_use_uma_direct_transfer(buf, size, false)) {
+            ggml_vk_synchronize(ctx);
+            memcpy(data, (uint8_t *) buf->ptr + src_offset, size);
+            return;
+        }
+
         ggml_vk_ensure_sync_staging_buffer(ctx, size);
         ggml_vk_sync_buffers(nullptr, compute_ctx);
 
@@ -13971,11 +14277,14 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
             cmd_buf = compute_ctx->s->buffer;
         }
 
-        ggml_vk_ctx_end(compute_ctx);
-
         for (auto& cpy : compute_ctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
+        for (auto& mset : compute_ctx->memsets) {
+            memset(mset.dst, mset.val, mset.n);
+        }
+        ggml_vk_record_host_write_barrier(compute_ctx);
+        ggml_vk_ctx_end(compute_ctx);
 
         ggml_vk_submit(compute_ctx, {});
         ctx->submit_pending = true;