Skip to content
Merged
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 113 additions & 35 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include <webgpu/webgpu_cpp.h>

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <cstring>
#ifdef GGML_WEBGPU_GPU_PROFILE
Expand All @@ -25,7 +24,6 @@
#if defined(GGML_WEBGPU_DEBUG) || defined(GGML_WEBGPU_CPU_PROFILE) || defined(GGML_WEBGPU_GPU_PROFILE)
# include <iostream>
#endif
#include <map>
#include <memory>
#include <mutex>
#include <optional>
Expand Down Expand Up @@ -81,13 +79,13 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim

/* Constants */

#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 32u
#define WEBGPU_NUM_PARAM_SLOTS \
(WEBGPU_COMMAND_SUBMIT_BATCH_SIZE + 10) // a few extra for safety, since some operations may need multiple slots
#define WEBGPU_WAIT_ANY_TIMEOUT_MS 100
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
#define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4

// For operations which process a row in parallel, this seems like a reasonable
// default
Expand Down Expand Up @@ -252,6 +250,8 @@ struct webgpu_global_context_struct {
wgpu::Adapter adapter;
wgpu::Device device;
wgpu::Queue queue;
uint32_t command_submit_batch_size = WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
uint32_t max_inflight_batches = UINT32_MAX;

webgpu_capabilities capabilities;
// Shared buffer to move data from device to host
Expand Down Expand Up @@ -417,31 +417,104 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
}
#endif

template <typename T>
static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
T callback_status,
T success_status,
const char * wait_name,
const char * failure_name,
const char * callback_message) {
if (wait_status == wgpu::WaitStatus::TimedOut) {
GGML_ABORT("ggml_webgpu: %s timed out after %u ms\n", wait_name, WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
}
if (wait_status == wgpu::WaitStatus::Error) {
GGML_ABORT("ggml_webgpu: %s failed\n", wait_name);
}
if (callback_status != success_status) {
GGML_ABORT("ggml_webgpu: %s failed with status %d: %s\n", failure_name, static_cast<int>(callback_status),
callback_message);
}
}

#ifdef __EMSCRIPTEN__
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
const ua = navigator.userAgent;
return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
});
#endif

static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
#ifdef __EMSCRIPTEN__
if (ggml_webgpu_is_ios_browser()) {
return 1;
}
#else
GGML_UNUSED(info);
#endif

return UINT32_MAX;
}

static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
#ifdef __EMSCRIPTEN__
if (ggml_webgpu_is_ios_browser()) {
return 16;
}
#else
GGML_UNUSED(info);
#endif

return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
}

static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {
ctx->instance.WaitAny(
ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
std::string(message).c_str());
}
}),
UINT64_MAX);
wgpu::QueueWorkDoneStatus callback_status = wgpu::QueueWorkDoneStatus::Error;
std::string callback_message;

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
ctx->queue.OnSubmittedWorkDone(
wgpu::CallbackMode::AllowSpontaneous,
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
}),
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);

ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::QueueWorkDoneStatus::Success,
"Queue wait", "Queue work", callback_message.c_str());
}

static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
wgpu::Buffer & buffer,
wgpu::MapMode mode,
size_t offset,
size_t size) {
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
if (status != wgpu::MapAsyncStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
message.data);
}
}),
UINT64_MAX);
wgpu::MapAsyncStatus callback_status = wgpu::MapAsyncStatus::Error;
std::string callback_message;

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
}),
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);

ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::MapAsyncStatus::Success,
"Buffer map wait", "Buffer map", callback_message.c_str());
}

static void ggml_backend_webgpu_submit_commands(webgpu_context & ctx,
const wgpu::CommandBuffer commands,
uint32_t & num_inflight_batches) {
if (num_inflight_batches >= ctx->global_ctx->max_inflight_batches) {
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
num_inflight_batches = 0;
}

ctx->global_ctx->queue.Submit(1, &commands);
num_inflight_batches++;
}

#ifdef GGML_WEBGPU_DEBUG
Expand Down Expand Up @@ -2871,9 +2944,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
#ifdef GGML_WEBGPU_GPU_PROFILE
std::vector<wgpu::FutureWaitInfo> profile_futures;
#endif
uint32_t num_batched_kernels = 0;
bool contains_set_rows = false;
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
uint32_t num_batched_kernels = 0;
uint32_t num_inflight_batches = 0;
bool contains_set_rows = false;
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();

for (int i = 0; i < cgraph->n_nodes; i++) {
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
Expand All @@ -2884,10 +2958,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
num_batched_kernels += cmd.value().num_kernels;
}

if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
num_batched_kernels = 0;
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
ctx->global_ctx->queue.Submit(1, &batch_commands);
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
#endif
Expand All @@ -2898,7 +2972,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
}
if (!commands.empty()) {
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
ctx->global_ctx->queue.Submit(1, &batch_commands);
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
#ifdef GGML_WEBGPU_GPU_PROFILE
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
#endif
Expand All @@ -2912,7 +2986,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
ctx->set_rows_host_error_buf.GetSize());
wgpu::CommandBuffer set_rows_commands = encoder.Finish();
ctx->global_ctx->queue.Submit(1, &set_rows_commands);
ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches);
}

ggml_backend_webgpu_wait_queue(ctx->global_ctx);
Expand Down Expand Up @@ -3363,6 +3437,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
}
#endif
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info);
wgpu::SupportedFeatures features;
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
// we require f16 support
Expand Down Expand Up @@ -3483,8 +3559,10 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx->device);
webgpu_ctx->param_arena.init(webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, WEBGPU_NUM_PARAM_SLOTS,
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
webgpu_ctx->param_arena.init(
webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES,
webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->set_rows_dev_error_buf,
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "set_rows_dev_error_buf");
Expand Down
Loading