diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0a7931002ab..923601d5c87 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -982,6 +982,21 @@ struct vk_submission { typedef std::vector vk_sequence; +static size_t ggml_vk_device_type_size(ggml_type type) { + switch (type) { + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q6_K: + return ggml_type_size(type) + 2; + default: return ggml_type_size(type); + } +} + +static size_t ggml_vk_device_size(const ggml_tensor * tensor) { + GGML_ASSERT(ggml_nbytes(tensor) % ggml_type_size(tensor->type) == 0); + return ggml_nbytes(tensor) / ggml_type_size(tensor->type) + * ggml_vk_device_type_size(tensor->type); +} + struct vk_mat_mat_push_constants { uint32_t M; uint32_t N; uint32_t K; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; @@ -1958,16 +1973,18 @@ struct ggml_backend_vk_context { static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT -static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { +static uint64_t vk_tensor_view_offset(const ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_offs % ggml_type_size(tensor->type) == 0); + const size_t dev_view_offs = tensor->view_offs / ggml_type_size(tensor->type) * ggml_vk_device_type_size(tensor->type); if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; + return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base + dev_view_offs; } - return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; + return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base + dev_view_offs; } static uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t) { - return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));; + return vk_tensor_view_offset(t) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); } template void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { @@ -1986,8 +2003,8 @@ template void init_pushconst_tensor_offsets(ggml_backend_vk_context } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_vk_device_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.b_offset = b_offset; p.d_offset = d_offset; @@ -1998,8 +2015,8 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_vk_device_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.b_offset = b_offset; p.d_offset = d_offset; @@ -3135,7 +3152,7 @@ static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const static std::vector get_fa_spec_constants(const vk_fa_pipeline_state& state) { const auto fa_block_bytes = [](ggml_type t) -> uint32_t { // decodeBufF32 uses a block of vec4s for a better memory access pattern. - return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t); + return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_vk_device_type_size(t); }; return { /* 0 WorkGroupSize */ state.workgroup_size, @@ -6592,11 +6609,11 @@ static vk_subbuffer ggml_vk_tensor_subbuffer( if (!buffer) { auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; buffer = buf_ctx->dev_buffer; - offset = vk_tensor_offset(tensor) + tensor->view_offs; + offset = vk_tensor_view_offset(tensor); } GGML_ASSERT(buffer != nullptr); - size_t size = ggml_nbytes(tensor); + size_t size = ggml_vk_device_size(tensor); size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); // The shader must support misaligned offsets when indexing into the buffer @@ -6798,103 +6815,6 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si } } -static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { - VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); - GGML_ASSERT(!ggml_is_contiguous(tensor)); - // Buffer is already mapped - if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ABORT("fatal error"); - } - // Check if src is pinned memory - vk_buffer buf = nullptr; - size_t buf_offset = 0; - ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); - - const uint64_t ne0 = tensor->ne[0]; - const uint64_t ne1 = tensor->ne[1]; - const uint64_t ne2 = tensor->ne[2]; - const uint64_t ne3 = tensor->ne[3]; - const uint64_t nb0 = tensor->nb[0]; - const uint64_t nb1 = tensor->nb[1]; - const uint64_t nb2 = tensor->nb[2]; - const uint64_t nb3 = tensor->nb[3]; - const ggml_type type = tensor->type; - const uint64_t ts = ggml_type_size(type); - const uint64_t bs = ggml_blck_size(type); - - const uint64_t dstnb0 = ts; - const uint64_t dstnb1 = dstnb0*(ne0/bs); - const uint64_t dstnb2 = dstnb1*ne1; - const uint64_t dstnb3 = dstnb2*ne2; - - const uint64_t ne = ggml_nelements(tensor); - - if (buf != nullptr) { - // Memory is pinned, use as staging buffer - std::vector slices; - - for (uint64_t i3 = 0; i3 < ne3; i3++) { - for (uint64_t i2 = 0; i2 < ne2; i2++) { - // Find longest contiguous slice - if (ne1*nb1 == dstnb2) { - slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 }); - } else { - for (uint64_t i1 = 0; i1 < ne1; i1++) { - if (ne0*nb0/bs == dstnb1) { - slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 }); - } else { - const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; - const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; - for (uint64_t i0 = 0; i0 < ne0; i0++) { - slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 }); - } - } - } - } - } - } - - ggml_vk_sync_buffers(ctx, subctx); - subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices); - return; - } - - if (!sync_staging) { - GGML_ABORT("Asynchronous write to non-pinned memory not supported"); - } - - // Staging buffer required - vk_buffer& staging = ctx->device->sync_staging; - const uint64_t copy_size = ts*ne/bs; - ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); - VkBufferCopy buf_copy{ 0, offset, copy_size }; - - ggml_vk_sync_buffers(ctx, subctx); - vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); - - for (uint64_t i3 = 0; i3 < ne3; i3++) { - for (uint64_t i2 = 0; i2 < ne2; i2++) { - // Find longest contiguous slice - if (ne1*nb1 == dstnb2) { - deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); - } else { - for (uint64_t i1 = 0; i1 < ne1; i1++) { - if (ne0*nb0/bs == dstnb1) { - deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); - } else { - const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; - const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1; - for (uint64_t i0 = 0; i0 < ne0; i0++) { - deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); - } - } - } - } - } - } -} - static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // Check if src is pinned memory @@ -7006,6 +6926,62 @@ static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1); } +// view_offset is expected to be in device bytes, all other values in host/ggml bytes +static void ggml_vk_buffer_write_2d_padded(vk_buffer& dst, size_t view_offset, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, ggml_type type) { + VK_LOG_DEBUG("ggml_vk_buffer_write_2d_padded(" << width << ", " << height << ")"); + + size_t host_block_size = ggml_type_size(type); + size_t device_block_size = ggml_vk_device_type_size(type); + + if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + for (size_t i = 0; i < height; i++) { + size_t row_pos = 0; + while (row_pos < width) { + size_t host_bytes_write_pos = i * dpitch + offset + row_pos; + size_t block = host_bytes_write_pos / host_block_size; + size_t block_pos = host_bytes_write_pos % host_block_size; + size_t nbytes = std::min(width - row_pos, host_block_size - block_pos); + memcpy((uint8_t *) dst->ptr + view_offset + block * device_block_size + block_pos, + (const uint8_t *) src + i * spitch + row_pos, + nbytes); + row_pos += nbytes; + } + } + } else { + size_t dev_offset = offset / host_block_size * device_block_size + offset % host_block_size; + GGML_ASSERT(height == 1 || dpitch % host_block_size == 0); + size_t dev_stride = dpitch / host_block_size * device_block_size; + + size_t dev_width = width / host_block_size * device_block_size + width % host_block_size; + uint8_t *hostbuf = (uint8_t *)ggml_vk_host_malloc(dst->device, height * dev_width + 2 * device_block_size); + size_t view_start_block_pos = offset % host_block_size; + + for (size_t i = 0; i < height; i++) { + size_t row_pos = 0; + size_t start_block = (i * dpitch + offset) / host_block_size; + while (row_pos < width) { + size_t host_bytes_write_pos = i * dpitch + offset + row_pos; + size_t block = host_bytes_write_pos / host_block_size; + size_t block_pos = host_bytes_write_pos % host_block_size; + size_t nbytes = std::min(width - row_pos, host_block_size - block_pos); + memcpy((uint8_t *) hostbuf + i * dev_width + (block - start_block) * device_block_size + block_pos - view_start_block_pos, + (const uint8_t *) src + i * spitch + row_pos, + nbytes); + row_pos += nbytes; + } + } + ggml_vk_buffer_write_2d(dst, view_offset + dev_offset, hostbuf, dev_width, dev_stride, dev_width, height); + ggml_vk_host_free(dst->device, hostbuf); + } +} + +static void ggml_vk_buffer_write_padded(vk_buffer& dst, size_t view_offset, size_t offset, const void * src, size_t size, ggml_type type) { + VK_LOG_DEBUG("ggml_vk_buffer_write_padded(" << size << ")"); + ggml_vk_buffer_write_2d_padded(dst, view_offset, offset, src, size, size, size, 1, type); +} + static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); @@ -7122,6 +7098,62 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1); } +// view_offset is expected to be in device bytes, all other values in host/ggml bytes +static void ggml_vk_buffer_read_2d_padded(vk_buffer& src, size_t view_offset, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, ggml_type type) { + VK_LOG_DEBUG("ggml_vk_buffer_read_2d_padded(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")"); + + size_t host_block_size = ggml_type_size(type); + size_t device_block_size = ggml_vk_device_type_size(type); + + if (src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); + + for (size_t i = 0; i < height; i++) { + size_t row_pos = 0; + while (row_pos < width) { + size_t host_bytes_read_pos = i * spitch + offset + row_pos; + size_t block = host_bytes_read_pos / host_block_size; + size_t block_pos = host_bytes_read_pos % host_block_size; + size_t nbytes = std::min(width - row_pos, host_block_size - block_pos); + memcpy((uint8_t *) dst + i * dpitch + row_pos, + (const uint8_t *) src->ptr + view_offset + block * device_block_size + block_pos, + nbytes); + row_pos += nbytes; + } + } + } else { + size_t dev_offset = offset / host_block_size * device_block_size + offset % host_block_size; + GGML_ASSERT(height == 1 || spitch % host_block_size == 0); + size_t dev_stride = spitch / host_block_size * device_block_size; + size_t view_start_block_pos = offset % host_block_size; + + size_t dev_width = width / host_block_size * device_block_size + width % host_block_size; + uint8_t *hostbuf = (uint8_t *)ggml_vk_host_malloc(src->device, height * dev_width + 2 * device_block_size); + ggml_vk_buffer_read_2d(src, view_offset + dev_offset, hostbuf, dev_stride, dev_width, dev_width, height); + + for (size_t i = 0; i < height; i++) { + size_t row_pos = 0; + size_t start_block = (i * spitch + offset) / host_block_size; + while (row_pos < width) { + size_t host_bytes_read_pos = i * spitch + offset + row_pos; + size_t block = host_bytes_read_pos / host_block_size; + size_t block_pos = host_bytes_read_pos % host_block_size; + size_t nbytes = std::min(width - row_pos, host_block_size - block_pos); + memcpy((uint8_t *) dst + i * dpitch + row_pos, + (const uint8_t *) hostbuf + i * dev_width + (block - start_block) * device_block_size + block_pos - view_start_block_pos, + nbytes); + row_pos += nbytes; + } + } + ggml_vk_host_free(src->device, hostbuf); + } +} + +static void ggml_vk_buffer_read_padded(vk_buffer& src, size_t view_offset, size_t offset, void * dst, size_t size, ggml_type type) { + VK_LOG_DEBUG("ggml_vk_buffer_read_padded(" << src->buffer << ", " << offset << ", " << size << ")"); + ggml_vk_buffer_read_2d_padded(src, view_offset, offset, dst, size, size, size, 1, type); +} + static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); // Make sure both buffers are on same device @@ -7391,9 +7423,9 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src); if (transpose && src->type == to) { - if (ggml_type_size(to) == 4) { + if (ggml_vk_device_type_size(to) == 4) { return ctx->device->pipeline_cpy_transpose_32; - } else if (ggml_type_size(to) == 2) { + } else if (ggml_vk_device_type_size(to) == 2) { return ctx->device->pipeline_cpy_transpose_16; } } @@ -7482,8 +7514,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const // For quantized types, we scale by block size/type size. But // this path is also used for bf16->bf16 for example, where the // type size must be exactly 2 or 4. - GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); - if ((ggml_type_size(src->type) % 4) == 0) { + GGML_ASSERT(ggml_is_quantized(to) || ggml_vk_device_type_size(src->type) == 2 || ggml_vk_device_type_size(src->type) == 4); + if ((ggml_vk_device_type_size(src->type) % 4) == 0) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_f32; } else { @@ -7657,7 +7689,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? f16_type : src0->type, quantize_y ? GGML_TYPE_Q8_1 : (y_f32_kernel ? GGML_TYPE_F32 : src1->type)); - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } @@ -7670,10 +7702,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, disable_split_k, pipeline); - const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); - const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); + const uint64_t qx_sz = ggml_vk_device_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); + const uint64_t qy_sz = ggml_vk_device_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_vk_device_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); const uint64_t d_sz = sizeof(float) * d_ne; vk_pipeline to_fp16_vk_0 = nullptr; @@ -7734,7 +7766,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + const uint64_t d_buf_offset = vk_tensor_view_offset(dst); GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz); vk_buffer d_X; @@ -7743,12 +7775,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = src0_buf_ctx->dev_buffer; - qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; + qx_buf_offset = vk_tensor_view_offset(src0); GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = src1_buf_ctx->dev_buffer; - qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; + qy_buf_offset = vk_tensor_view_offset(src1); GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -7844,11 +7876,6 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return false; } - // General performance issue with q3_k and q6_k due to 2-byte alignment - if (src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { - return false; - } - // MMVQ is generally good for batches if (n > 1) { return true; @@ -7975,7 +8002,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); } - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); } @@ -7992,9 +8019,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t x_ne = ggml_nelements(src0); const uint64_t y_ne = ggml_nelements(src1); - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : + const uint64_t qx_sz = ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_vk_device_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); { @@ -8046,11 +8073,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_vk_sync_buffers(ctx, subctx); } - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); } if (y_non_contig) { - GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); + GGML_ASSERT(y_sz == ggml_vk_device_type_size(src1->type) * y_ne); if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { @@ -8182,7 +8209,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1]; - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } @@ -8281,7 +8308,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint32_t channel_stride_y = nb12 / sizeof(float); vk_pipeline pipeline = ctx->device->pipeline_mul_mat_vec_nc_f16_f32; - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } @@ -8344,7 +8371,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c // where the M dimension is very large. // Split_k doesn't work with M splitting. // This only supports batchsize == 1. - const size_t nbytes = ggml_nbytes(src0); + const size_t nbytes = ggml_vk_device_size(src0); const bool needs_split = dst->ne[2] == 1 && dst->ne[3] == 1 && nbytes > ctx->device->properties.limits.maxStorageBufferRange; if (needs_split) { // Choose the number of rows that can fit (and divide by two, to allow for any additional offsets) @@ -8492,7 +8519,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? f16_type : src0->type); - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { pipeline = ggml_vk_get_64b_indexing_pipeline(ctx, pipeline); } // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking @@ -8501,10 +8528,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t y_ne = padded_n * ne10 * ne12 * ne13; const uint64_t d_ne = ggml_nelements(dst); - const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); - const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); + const uint64_t qx_sz = ggml_vk_device_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); + const uint64_t qy_sz = ggml_vk_device_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_vk_device_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); const uint64_t ids_sz = nbi2; const uint64_t d_sz = sizeof(float) * d_ne; @@ -8566,7 +8593,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } vk_buffer d_D = dst_buf_ctx->dev_buffer; - const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + const uint64_t d_buf_offset = vk_tensor_view_offset(dst); GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -8574,17 +8601,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = src0_buf_ctx->dev_buffer; - qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs; + qx_buf_offset = vk_tensor_view_offset(src0); GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = src1_buf_ctx->dev_buffer; - qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs; + qy_buf_offset = vk_tensor_view_offset(src1); GGML_ASSERT(d_Qy != nullptr); } if (!ids_uma) { d_ids = ids_buf_ctx->dev_buffer; - ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; + ids_buf_offset = vk_tensor_view_offset(ids); GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -8622,7 +8649,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& (uint32_t)nei1, (uint32_t)(nbi0 / ggml_type_size(ids->type)), (uint32_t)(nbi1 / ggml_type_size(ids->type)), - (uint32_t)(get_misalign_bytes(ctx, ids) / ggml_type_size(ids->type)) }; + (uint32_t)(get_misalign_bytes(ctx, ids) / ggml_vk_device_type_size(ids->type)) }; ggml_vk_dispatch_pipeline(ctx, subctx, count_experts, { vk_subbuffer{ d_ids, ids_buf_offset, ids_sz }, expert_count_buf }, pc, { (uint32_t)n_as, 1, 1}); } @@ -8755,7 +8782,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const bool qx_needs_dequant = x_non_contig; const bool qy_needs_dequant = !quantize_y && ((src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig); - if (ggml_nbytes(src0) > ctx->device->properties.limits.maxStorageBufferRange) { + if (ggml_vk_device_size(src0) > ctx->device->properties.limits.maxStorageBufferRange) { dmmv = ggml_vk_get_64b_indexing_pipeline(ctx, dmmv); } @@ -8768,9 +8795,9 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t x_ne = ggml_nelements(src0); const uint64_t y_ne = ggml_nelements(src1); - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; - const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : + const uint64_t qx_sz = ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t y_sz = quantize_y ? (ggml_vk_align_size(y_ne, 128) * ggml_vk_device_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); { @@ -8826,11 +8853,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte } if (x_non_contig) { - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_vk_device_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, d_Qx, d_X); } if (y_non_contig) { - GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); + GGML_ASSERT(y_sz == ggml_vk_device_type_size(src1->type) * y_ne); if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || ctx->prealloc_y_last_tensor_used != src1) { if (ctx->prealloc_y_need_sync) { @@ -9953,8 +9980,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_vk_device_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; @@ -9964,8 +9991,8 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_vk_device_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; @@ -9975,8 +10002,8 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_vk_device_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; @@ -9986,8 +10013,8 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_vk_device_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.misalign_offsets = (a_offset << 16) | d_offset; @@ -9997,9 +10024,9 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_vk_device_type_size(src0->type); + const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_vk_device_type_size(src1->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0)); @@ -10010,8 +10037,8 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { - const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); - const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_vk_device_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_vk_device_type_size(dst->type); p.a_offset = a_offset; p.d_offset = d_offset; @@ -10259,10 +10286,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. ne /= ggml_blck_size(src0->type); - if ((ggml_type_size(src0->type) % 4) == 0) { - ne *= ggml_type_size(src0->type) / 4; + if ((ggml_vk_device_type_size(src0->type) % 4) == 0) { + ne *= ggml_vk_device_type_size(src0->type) / 4; } else { - ne *= ggml_type_size(src0->type) / 2; + ne *= ggml_vk_device_type_size(src0->type) / 2; } } // copy_to_quant has block size of 32, and each thread does QUANT_K elements. @@ -10472,7 +10499,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, } if (!uma[i]) { buf[i] = buf_ctx[i]->dev_buffer; - offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + offset[i] = vk_tensor_view_offset(tensors[i]); } GGML_ASSERT(buf[i] != nullptr); } @@ -11035,10 +11062,10 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. ne /= ggml_blck_size(src0->type); - if ((ggml_type_size(src0->type) % 4) == 0) { - ne *= ggml_type_size(src0->type) / 4; + if ((ggml_vk_device_type_size(src0->type) % 4) == 0) { + ne *= ggml_vk_device_type_size(src0->type) / 4; } else { - ne *= ggml_type_size(src0->type) / 2; + ne *= ggml_vk_device_type_size(src0->type) / 2; } } @@ -11233,7 +11260,7 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, } if (!uma[i]) { buf[i] = buf_ctx[i]->dev_buffer; - offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + offset[i] = vk_tensor_view_offset(tensors[i]); } GGML_ASSERT(buf[i] != nullptr); } @@ -11838,7 +11865,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; const vk_buffer d_buf = d_buf_ctx->dev_buffer; - const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; + const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_view_offset(dst); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, { dst_addr, @@ -11880,7 +11907,7 @@ static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; const vk_buffer d_buf = d_buf_ctx->dev_buffer; - const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs; + const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_view_offset(dst); vk_op_im2col_3d_push_constants pc {}; @@ -13049,16 +13076,16 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr if (unsynced_nodes.size() == 0) { return false; } - auto n_base = vk_tensor_offset(node) + node->view_offs; - auto n_size = ggml_nbytes(node); + auto n_base = vk_tensor_view_offset(node); + auto n_size = ggml_vk_device_size(node); ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context; vk_buffer a_buf = a_buf_ctx->dev_buffer; for (auto &other : unsynced_nodes) { ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context; vk_buffer o_buf = o_buf_ctx->dev_buffer; if (a_buf == o_buf) { - auto o_base = vk_tensor_offset(other) + other->view_offs; - auto o_size = ggml_nbytes(other); + auto o_base = vk_tensor_view_offset(other); + auto o_size = ggml_vk_device_size(other); if ((o_base <= n_base && n_base < o_base + o_size) || (n_base <= o_base && o_base < n_base + n_size)) { @@ -13688,7 +13715,11 @@ static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, g } uint32_t val32 = (uint32_t)value * 0x01010101; - ggml_vk_buffer_memset(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, val32, size); + size_t device_offset = offset / ggml_type_size(tensor->type) * ggml_vk_device_type_size(tensor->type) + + offset % ggml_type_size(tensor->type); + size_t device_size = size / ggml_type_size(tensor->type) * ggml_vk_device_type_size(tensor->type) + + size % ggml_type_size(tensor->type); + ggml_vk_buffer_memset(buf, vk_tensor_view_offset(tensor) + device_offset, val32, device_size); } static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -13700,7 +13731,11 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml return; } - ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_buffer_write_padded(buf, vk_tensor_view_offset(tensor), offset, data, size, tensor->type); + } else { + ggml_vk_buffer_write(buf, vk_tensor_view_offset(tensor) + offset, data, size); + } } static void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, @@ -13714,7 +13749,11 @@ static void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, g return; } - ggml_vk_buffer_write_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_data, stride_tensor, size, n_copies); + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_buffer_write_2d_padded(buf, vk_tensor_view_offset(tensor), offset, data, stride_data, stride_tensor, size, n_copies, tensor->type); + } else { + ggml_vk_buffer_write_2d(buf, vk_tensor_view_offset(tensor) + offset, data, stride_data, stride_tensor, size, n_copies); + } } static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -13727,7 +13766,11 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_buffer_read_padded(buf, vk_tensor_view_offset(tensor), offset, data, size, tensor->type); + } else { + ggml_vk_buffer_read(buf, vk_tensor_view_offset(tensor) + offset, data, size); + } } static void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, @@ -13742,7 +13785,11 @@ static void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, c vk_buffer buf = buf_ctx->dev_buffer; - ggml_vk_buffer_read_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_tensor, stride_data, size, n_copies); + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_buffer_read_2d_padded(buf, vk_tensor_view_offset(tensor), offset, data, stride_tensor, stride_data, size, n_copies, tensor->type); + } else { + ggml_vk_buffer_read_2d(buf, vk_tensor_view_offset(tensor) + offset, data, stride_tensor, stride_data, size, n_copies); + } } static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { @@ -13757,7 +13804,7 @@ static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, cons vk_buffer src_buf = src_buf_ctx->dev_buffer; vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy(dst_buf, vk_tensor_view_offset(dst), src_buf, vk_tensor_view_offset(src), ggml_vk_device_size(src)); return true; } @@ -13820,7 +13867,7 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_ } static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - return ggml_nbytes(tensor); + return ggml_vk_device_size(tensor); UNUSED(buft); } @@ -13946,6 +13993,13 @@ static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_ten return; } + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_synchronize(ctx); + ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + ggml_backend_vk_buffer_set_tensor_2d(buffer, tensor, data, offset, size, n_copies, stride_tensor, stride_data); + return; + } + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context cpy_ctx; @@ -13964,7 +14018,7 @@ static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_ten vk_buffer buf = buf_ctx->dev_buffer; - auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; + auto dst_offset = vk_tensor_view_offset(tensor) + offset; bool ret = ggml_vk_buffer_write_2d_async(cpy_ctx, buf, dst_offset, data, stride_data, stride_tensor, size, n_copies); @@ -14015,13 +14069,20 @@ static void ggml_backend_vk_get_tensor_2d_async(ggml_backend_t backend, const gg return; } + if (ggml_nbytes(tensor) != ggml_vk_device_size(tensor)) { + ggml_vk_synchronize(ctx); + ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + ggml_backend_vk_buffer_get_tensor_2d(buffer, tensor, data, offset, size, n_copies, stride_tensor, stride_data); + return; + } + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); vk_buffer buf = buf_ctx->dev_buffer; - auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; + auto src_offset = vk_tensor_view_offset(tensor) + offset; bool ret = ggml_vk_buffer_read_2d_async(compute_ctx, buf, src_offset, data, stride_tensor, stride_data, size, n_copies); if (!ret) { @@ -14062,7 +14123,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ } static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")"); + VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_vk_device_size(src) << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; // Skip zero-size tensors @@ -14087,13 +14148,17 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, - src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs, - ggml_nbytes(src)); + ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_view_offset(dst), + src_buf_ctx->dev_buffer, vk_tensor_view_offset(src), + ggml_vk_device_size(src)); return true; } if (ggml_backend_buffer_is_host(src->buffer)) { + if (ggml_vk_device_size(src) != ggml_nbytes(src)) { + return false; + } + vk_buffer pinned_buf = nullptr; size_t pinned_offset = 0; ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset); @@ -14115,7 +14180,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_ba } return ggml_vk_buffer_write_async(cpy_ctx, dst_buf, - vk_tensor_offset(dst) + dst->view_offs, + vk_tensor_view_offset(dst), src->data, ggml_nbytes(src)); } @@ -14492,10 +14557,10 @@ static bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context; vk_buffer b_buf = b_buf_ctx->dev_buffer; if (a_buf == b_buf) { - auto a_base = vk_tensor_offset(a) + a->view_offs; - auto a_size = ggml_nbytes(a); - auto b_base = vk_tensor_offset(b) + b->view_offs; - auto b_size = ggml_nbytes(b); + auto a_base = vk_tensor_view_offset(a); + auto a_size = ggml_vk_device_size(a); + auto b_base = vk_tensor_view_offset(b); + auto b_size = ggml_vk_device_size(b); if (elementwise && a_base == b_base && a_size == b_size) { return false; @@ -15505,11 +15570,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm }; // reject any tensors larger than the max buffer size for (int i = 0; i < GGML_MAX_SRC; i++) { - if (op->src[i] && !tensor_size_supported(ggml_nbytes(op->src[i]))) { + if (op->src[i] && !tensor_size_supported(ggml_vk_device_size(op->src[i]))) { return false; } } - if (!tensor_size_supported(ggml_nbytes(op))) { + if (!tensor_size_supported(ggml_vk_device_size(op))) { return false; } @@ -15780,7 +15845,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm // so the type/block size must be a multiple of 4. if (src0_type == src1_type && (!ggml_is_quantized(src0_type) || (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op))) && - (ggml_type_size(src0_type) % 2) == 0) { + (ggml_vk_device_type_size(src0_type) % 2) == 0) { return true; } return false; @@ -16447,11 +16512,7 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) if (is_gpu) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_buffer buffer_gpu = buf_ctx->dev_buffer; - ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size); + ggml_backend_tensor_get(tensor, tensor_data, 0, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -16531,14 +16592,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * memcpy(srci_clone->data, srci->data, srci_size); memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); } else if (ggml_backend_buffer_is_vk(srci->buffer)) { - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { for (int i3 = 0; i3 < srci->ne[3]; i3++) { for (int i2 = 0; i2 < srci->ne[2]; i2++) { const int idx = i3*srci->ne[2] + i2; - ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); + ggml_backend_tensor_get(srci, (char *)srci_clone->data + idx * srci_clone->nb[2], idx * srci->nb[2], srci->ne[1] * srci->nb[1]); } } @@ -16548,10 +16606,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; } } else { - if (offset + srci_size >= buffer_gpu->size) { - srci_size = buffer_gpu->size - offset; - } - ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); + ggml_backend_tensor_get(srci, srci_clone->data, 0, srci_size); memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -16944,16 +16999,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * if (ggml_backend_buffer_is_vk(tensor->buffer)) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs; - if (offset + tensor_size >= buffer_gpu->size) { - tensor_size = buffer_gpu->size - offset; - } - - ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size); + ggml_backend_tensor_get(tensor, tensor_data, 0, tensor_size); } float first_error_result = -1.0f; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl index c582aba87dc..e86838f41e5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl @@ -147,7 +147,7 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2 return ret; } -layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K { +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ3_K { block_q3_K block; }; @@ -399,7 +399,7 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2 return float16_t(ret); } -layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K { +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ6_K { block_q6_K block; }; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl index bc580aeeb83..5f5e205fe77 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl @@ -203,7 +203,7 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) { #endif #if defined(DATA_A_Q3_K) -// 2-byte loads for Q3_K blocks (110 bytes) +// 4-byte loads for Q3_K blocks (110 bytes + 2 bytes padding) i32vec4 repack4(uint ib, uint iqs) { const uint ib_k = ib / 8; const uint iqs_k = (ib % 8) * 8 + iqs; @@ -212,28 +212,33 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qs_shift = ((iqs_k % 32) / 8) * 2; const uint hm_shift = iqs_k / 8; + const uvec4 qs = uvec4(data_a_packed32[ib_k].qs[qs_idx ], + data_a_packed32[ib_k].qs[qs_idx + 1], + data_a_packed32[ib_k].qs[qs_idx + 2], + data_a_packed32[ib_k].qs[qs_idx + 3]); + + const uvec4 hmask = uvec4(data_a_packed32[ib_k].hmask[iqs ], + data_a_packed32[ib_k].hmask[iqs + 1], + data_a_packed32[ib_k].hmask[iqs + 2], + data_a_packed32[ib_k].hmask[iqs + 3]); + // bitwise OR to add 4 if hmask is set, subtract later - const i8vec2 vals00 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 ] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 ] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals01 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 1] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 1] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals10 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 2] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 2] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals11 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 3] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 3] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals20 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 4] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 4] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals21 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 5] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 5] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals30 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 6] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 6] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals31 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 7] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 7] >> hm_shift) & uint16_t(0x0101)) << 2)); - - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y) - int8_t(4)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y) - int8_t(4)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y) - int8_t(4)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y) - int8_t(4))); + const uint vals0 = (( qs.x >> qs_shift) & 0x03030303) | + (((hmask.x >> hm_shift) & 0x01010101) << 2); + const uint vals1 = (( qs.y >> qs_shift) & 0x03030303) | + (((hmask.y >> hm_shift) & 0x01010101) << 2); + const uint vals2 = (( qs.z >> qs_shift) & 0x03030303) | + (((hmask.z >> hm_shift) & 0x01010101) << 2); + const uint vals3 = (( qs.w >> qs_shift) & 0x03030303) | + (((hmask.w >> hm_shift) & 0x01010101) << 2); + + // Subtract 4 by twiddling bits rather than using re-packing as we have + // the high bit to ourselves and can avoid the hardware having to deal with + // the the ints separately. + return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x04040404) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) { @@ -332,7 +337,7 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) { #endif #if defined(DATA_A_Q6_K) -// 2-byte loads for Q6_K blocks (210 bytes) +// 4-byte loads for Q6_K blocks (210 bytes + 2 bytes of padding) i32vec4 repack4(uint ib, uint iqs) { const uint ib_k = ib / 8; const uint iqs_k = (ib % 8) * 8 + iqs; @@ -343,27 +348,32 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qh_idx = (iqs_k / 32) * 8 + iqs; const uint qh_shift = ((iqs_k % 32) / 8) * 2; - const i8vec2 vals00 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 ] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 ] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals01 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 1] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 1] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals10 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 2] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 2] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals11 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 3] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 3] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals20 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 4] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 4] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals21 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 5] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 5] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals30 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 6] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 6] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals31 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 7] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 7] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y))); + const uvec4 ql = uvec4(data_a_packed32[ib_k].ql[ql_idx ], + data_a_packed32[ib_k].ql[ql_idx + 1], + data_a_packed32[ib_k].ql[ql_idx + 2], + data_a_packed32[ib_k].ql[ql_idx + 3]); + + const uvec4 qh = uvec4(data_a_packed32[ib_k].qh[qh_idx ], + data_a_packed32[ib_k].qh[qh_idx + 1], + data_a_packed32[ib_k].qh[qh_idx + 2], + data_a_packed32[ib_k].qh[qh_idx + 3]); + + const uint vals0 = (( ql.x >> ql_shift) & 0x0F0F0F0F) | + (((qh.x >> qh_shift) & 0x03030303) << 4); + const uint vals1 = (( ql.y >> ql_shift) & 0x0F0F0F0F) | + (((qh.y >> qh_shift) & 0x03030303) << 4); + const uint vals2 = (( ql.z >> ql_shift) & 0x0F0F0F0F) | + (((qh.z >> qh_shift) & 0x03030303) << 4); + const uint vals3 = (( ql.w >> ql_shift) & 0x0F0F0F0F) | + (((qh.w >> qh_shift) & 0x03030303) << 4); + + // Subtract 32 by twiddling bits rather than using re-packing as we have + // the high bits to ourselves and can avoid the hardware having to deal + // with the the ints separately. + return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x20202020) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl index 4bcd97756fd..2e940fff410 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl @@ -280,6 +280,7 @@ struct block_q3_K uint8_t qs[QUANT_K_Q3_K/4]; uint8_t scales[12]; float16_t d; + uint16_t _pad; }; struct block_q3_K_packed16 @@ -288,6 +289,16 @@ struct block_q3_K_packed16 uint16_t qs[QUANT_K_Q3_K/4/2]; uint16_t scales[12/2]; float16_t d; + uint16_t _pad; +}; + +struct block_q3_K_packed32 +{ + uint32_t hmask[QUANT_K_Q3_K/8/4]; + uint32_t qs[QUANT_K_Q3_K/4/4]; + uint32_t scales[12/4]; + float16_t d; + uint16_t _pad; }; #if defined(DATA_A_Q3_K) @@ -295,6 +306,7 @@ struct block_q3_K_packed16 #define QUANT_R 1 #define A_TYPE block_q3_K #define A_TYPE_PACKED16 block_q3_K_packed16 +#define A_TYPE_PACKED32 block_q3_K_packed32 #define DATA_A_QUANT_K #endif @@ -383,6 +395,7 @@ struct block_q6_K uint8_t qh[QUANT_K_Q6_K/4]; int8_t scales[QUANT_K_Q6_K/16]; float16_t d; + uint16_t _pad; }; struct block_q6_K_packed16 @@ -391,6 +404,16 @@ struct block_q6_K_packed16 uint16_t qh[QUANT_K_Q6_K/4/2]; int16_t scales[QUANT_K_Q6_K/16/2]; float16_t d; + uint16_t _pad; +}; + +struct block_q6_K_packed32 +{ + uint32_t ql[QUANT_K_Q6_K/2/4]; + uint32_t qh[QUANT_K_Q6_K/4/4]; + int32_t scales[QUANT_K_Q6_K/16/4]; + float16_t d; + uint16_t _pad; }; #if defined(DATA_A_Q6_K) @@ -398,6 +421,7 @@ struct block_q6_K_packed16 #define QUANT_R 1 #define A_TYPE block_q6_K #define A_TYPE_PACKED16 block_q6_K_packed16 +#define A_TYPE_PACKED32 block_q6_K_packed32 #define DATA_A_QUANT_K #endif