Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion cmake/external/onnxruntime_external_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,29 @@ if (onnxruntime_USE_WEBGPU)
# # if we need to apply patches in the future, we can uncomment the following line.
#
# The dawn.patch contains the following changes:
# - https://dawn-review.googlesource.com/c/dawn/+/225514
#
# - (public) CMake fix to support Emscripten v4.0.3+
# This change allows Dawn to find the file "gen_struct_info.py" in the correct location.
# https://dawn-review.googlesource.com/c/dawn/+/225514
#
# - (public) Fix emwgpu C++ implementation for buffer destroy
# In the native implementation, wgpuBufferRelease triggers the buffer destroy (when the refcount drops to 0),
# but in the emwgpu implementation the buffer destroy never happens. This change fixes the bug.
# https://dawn-review.googlesource.com/c/dawn/+/226315
#
# - (private) Allow "external" buffer in emwgpu C++ implementation
# This change makes WGPUBufferImpl destroy the buffer when the refcount drops to 0, but only for
# non-external buffers.
# "external buffer" means the GPUBuffer instance created in JavaScript and imported to C++ by `importJsBuffer`.
#
# - (private) Remove hard-coded CMAKE_OSX_DEPLOYMENT_TARGET in Dawn's CMake files
# https://github.com/microsoft/onnxruntime/pull/23729
#
# - (private) Fix external ref count for "external" device in emwgpu C++ implementation
# This change fixes the incorrect external ref count for class WGPUDeviceImpl when used with "external" device.
# "external device" means the GPUDevice instance created in JavaScript and imported to C++ by `importJsDevice`.
#
#
PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
EXCLUDE_FROM_ALL
)
Expand Down
37 changes: 28 additions & 9 deletions cmake/onnxruntime_webassembly.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,14 @@ else()
target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard)
endif()

set(onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre.js")

set(EXPORTED_FUNCTIONS "_malloc,_free")
if (onnxruntime_USE_JSEP)
set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName")
else()
set(EXPORTED_FUNCTIONS "_malloc,_free")
string(APPEND EXPORTED_FUNCTIONS ",_JsepOutput,_JsepGetNodeName")
endif()
if (onnxruntime_USE_WEBGPU)
string(APPEND EXPORTED_FUNCTIONS ",_wgpuBufferRelease,_wgpuCreateInstance")
endif()

if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
Expand Down Expand Up @@ -312,13 +316,15 @@ else()
target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
endif()
target_link_options(onnxruntime_webassembly PRIVATE
--post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js"
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js\""
)
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js")
else ()
set(MAXIMUM_MEMORY "4294967296")
target_link_options(onnxruntime_webassembly PRIVATE
--post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js"
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js.js\""
)
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js")
endif ()

target_link_options(onnxruntime_webassembly PRIVATE
Expand Down Expand Up @@ -372,7 +378,6 @@ jsepDownload:_pp_")
"SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'"
)
endif ()
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js)

if (onnxruntime_USE_JSEP)
# NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU
Expand All @@ -382,10 +387,8 @@ jsepDownload:_pp_")
target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1)
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\""
"SHELL:-s ASYNCIFY=1"
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
)
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js")

if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
target_link_options(onnxruntime_webassembly PRIVATE
Expand All @@ -397,6 +400,20 @@ jsepDownload:_pp_")

# WebGPU EP build: define USE_WEBGPU=1 for the onnxruntime_webassembly target and pass
# wasm/post-webgpu.js to the linker as a --post-js script.
if (onnxruntime_USE_WEBGPU)
target_compile_definitions(onnxruntime_webassembly PRIVATE USE_WEBGPU=1)
# The "SHELL:" prefix keeps "--post-js <file>" together as a single option group.
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js\""
)
# Record the script in the dependency list that is later assigned to the target's LINK_DEPENDS.
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js")
endif()

# Shared async setup for JSEP, WebGPU and WebNN builds: link wasm/pre-async.js as a --pre-js script
# and turn on Emscripten ASYNCIFY with a 65536 ASYNCIFY_STACK_SIZE.
if (onnxruntime_USE_JSEP OR onnxruntime_USE_WEBGPU OR onnxruntime_USE_WEBNN)
# if any of the above is enabled, we need to use the asyncify library
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-async.js\""
"SHELL:-s ASYNCIFY=1"
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
)
# Record the script in the dependency list that is later assigned to the target's LINK_DEPENDS.
list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-async.js")
endif()

if (onnxruntime_EMSCRIPTEN_SETTINGS)
Expand Down Expand Up @@ -458,6 +475,8 @@ jsepDownload:_pp_")
)
endif()

# Register every collected --pre-js/--post-js script as a link dependency of the target, so that
# editing any of them causes onnxruntime_webassembly to be relinked. The list is quoted so the whole
# ;-separated value is passed as the single LINK_DEPENDS property value.
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS "${onnxruntime_webassembly_script_deps}")

set(target_name_list ort)

if (onnxruntime_ENABLE_TRAINING_APIS)
Expand Down
113 changes: 112 additions & 1 deletion cmake/patches/dawn/dawn.patch
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ index 6e8ae37593..633af91eef 100644
@@ -77,9 +77,17 @@ if (${DAWN_ENABLE_EMSCRIPTEN})
"${arg_UNPARSED_ARGUMENTS}")
endif()

+ # since Emscripten 4.0.3, file gen_struct_info.py is moved to outside of directory maint.
+ if (EXISTS "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
+ set(EM_GEN_STRUCT_INFO_SCRIPT "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
Expand All @@ -34,3 +34,114 @@ index 6e8ae37593..633af91eef 100644
-q
"${EM_BUILD_GEN_DIR}/struct_info_webgpu.json"
"-I=${EM_BUILD_GEN_DIR}/include"
diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md
index efd6491cd6..8ebc5d28b6 100644
--- a/src/emdawnwebgpu/README.md
+++ b/src/emdawnwebgpu/README.md
@@ -56,7 +56,7 @@ Set up the build directory using emcmake
mkdir out/cmake-wasm
cd out/cmake-wasm

-# Make sure the path is to the source checkout of Emscripten, not emsdk's release.
+# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release.
emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../..

ninja
diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp
index f1c5a7d50e..16f2495712 100644
--- a/third_party/emdawnwebgpu/webgpu.cpp
+++ b/third_party/emdawnwebgpu/webgpu.cpp
@@ -131,7 +131,6 @@ class RefCounted : NonMovable {
bool Release() {
if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) {
std::atomic_thread_fence(std::memory_order_acquire);
- emwgpuDelete(this);
return true;
}
return false;
@@ -234,6 +233,7 @@ class Ref {
static void Release(T value) {
if (value != nullptr && value->RefCounted::Release()) {
delete value;
+ emwgpuDelete(value);
}
}

@@ -641,7 +641,8 @@ struct WGPUAdapterImpl final : public EventSource, public RefCounted {
struct WGPUBufferImpl final : public EventSource,
public RefCountedWithExternalCount {
public:
- WGPUBufferImpl(const EventSource* source, bool mappedAtCreation);
+ WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal);
+ ~WGPUBufferImpl();

void Destroy();
const void* GetConstMappedRange(size_t offset, size_t size);
@@ -671,6 +672,7 @@ struct WGPUBufferImpl final : public EventSource,
};
MapRequest mPendingMapRequest;
WGPUBufferMapState mMapState;
+ bool mIsExternal;
};

struct WGPUQueueImpl final : public EventSource, public RefCounted {
@@ -1164,11 +1166,15 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) {

WGPUBuffer emwgpuCreateBuffer(const EventSource* source,
bool mappedAtCreation = false) {
- return new WGPUBufferImpl(source, mappedAtCreation);
+ return new WGPUBufferImpl(source, mappedAtCreation, true);
}

WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) {
- return new WGPUDeviceImpl(source, queue);
+ // This function is only called from JS via `importJsDevice()`, which
+ // needs to increment the external ref count to fix the behavior.
+ WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue);
+ device->AddExternalRef();
+ return device;
}

WGPUQueue emwgpuCreateQueue(const EventSource* source) {
@@ -1275,15 +1281,22 @@ WGPUAdapterImpl::WGPUAdapterImpl(const EventSource* source)
// WGPUBuffer implementations.
// ----------------------------------------------------------------------------

-WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation)
+WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal)
: EventSource(source),
mMapState(mappedAtCreation ? WGPUBufferMapState_Mapped
- : WGPUBufferMapState_Unmapped) {
+ : WGPUBufferMapState_Unmapped),
+ mIsExternal(isExternal) {
if (mappedAtCreation) {
mPendingMapRequest = {kNullFutureId, WGPUMapMode_Write};
}
}

+WGPUBufferImpl::~WGPUBufferImpl() {
+ if (!mIsExternal) {
+ Destroy();
+ }
+}
+
void WGPUBufferImpl::Destroy() {
emwgpuBufferDestroy(this);
AbortPendingMap("Buffer was destroyed before mapping was resolved.");
@@ -1504,6 +1517,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo(
void wgpu##Name##Release(WGPU##Name o) { \
if (o->Release()) { \
delete o; \
+ emwgpuDelete(o); \
} \
}
WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE)
@@ -1638,7 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) {

WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device,
const WGPUBufferDescriptor* descriptor) {
- WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation);
+ WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation, false);
emwgpuDeviceCreateBuffer(device, descriptor, buffer);
return buffer;
}
79 changes: 79 additions & 0 deletions js/build_webgpu.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
@echo off

rem build_webgpu.bat --- build onnxruntime-web with WebGPU EP
rem
rem Usage:
rem   build_webgpu.bat config [clean]
rem
rem Options:
rem   config      Build configuration, "d" or "r"
rem   clean       Perform a clean build, "clean" or empty
rem
rem Every path derived from the script location is quoted, so the script also
rem works when the repository is checked out under a path containing spaces.

setlocal enabledelayedexpansion

rem Repository root is the parent of the directory that contains this script.
set "ROOT=%~dp0..\"
set "BUILD_DIR=%ROOT%build_webgpu"

:arg1
if ["%~1"]==["d"] (
    set "CONFIG=Debug"
    set "CONFIG_EXTRA_FLAG="
    @rem --enable_wasm_profiling --wasm_run_tests_in_browser
    @rem --cmake_extra_defines onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL=1
    @rem --enable_wasm_debug_info
    goto :arg2
)
if ["%~1"]==["r"] (
    set "CONFIG=Release"
    set "CONFIG_EXTRA_FLAG="
    @rem --enable_wasm_api_exception_catching --disable_rtti
    goto :arg2
)
echo Invalid configuration "%~1", must be "d"(Debug) or "r"(Release)
exit /b 1

:arg2
rem "clean" wipes the build directory, re-syncs submodules and reinstalls npm packages.
if ["%~2"]==["clean"] (
    goto :clean
)
rem Without js\web\dist the npm setup has not been run yet, so run it first.
if not exist "%ROOT%js\web\dist" (
    goto :npm_ci
)

goto :build_wasm

:clean
if exist "%BUILD_DIR%" (
    rd /s /q "%BUILD_DIR%"
)

pushd "%ROOT%"
git submodule sync --recursive
git submodule update --init --recursive
popd

rem Falls through from :clean on purpose: a clean build also reinstalls npm packages.
:npm_ci
pushd "%ROOT%js"
call npm ci
popd
pushd "%ROOT%js\common"
call npm ci
popd
pushd "%ROOT%js\web"
call npm ci
call npm run pull:wasm
popd

:build_wasm

rem Put the tools shipped with Git for Windows on PATH for the build.
set "PATH=C:\Program Files\Git\usr\bin;%PATH%"

rem The caret continues the command on the next line. Keep the space before it and
rem the indentation after it, otherwise the two neighbouring flags are joined.
call "%ROOT%build.bat" --config %CONFIG% %CONFIG_EXTRA_FLAG% --skip_submodule_sync --build_wasm --target onnxruntime_webassembly --skip_tests ^
    --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --use_webgpu --build_dir "%BUILD_DIR%"

rem Stop here and propagate the exit code if the build failed.
if not "%ERRORLEVEL%" == "0" (
    exit /b %ERRORLEVEL%
)

rem Publish the freshly built artifacts into the web package's dist folder.
copy /Y "%BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.wasm" "%ROOT%js\web\dist\"
copy /Y "%BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.mjs" "%ROOT%js\web\dist\"
7 changes: 7 additions & 0 deletions js/web/lib/build-def.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ interface BuildDefinitions {
*/
readonly ENABLE_BUNDLE_WASM_JS: boolean;

/**
* defines whether to use WebGPU EP instead of JSEP for WebGPU backend.
*
* This flag requires the corresponding WebAssembly artifact to be built with `--use_webgpu` flag.
*/
readonly USE_WEBGPU_EP: boolean;

// #endregion

// #region Build definitions for ESM
Expand Down
Loading
Loading