diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index e9148dd7399..6140e26c767 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -97,35 +97,34 @@ jobs: vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp - # TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend. - #ggml-ci-nvidia-webgpu: - # runs-on: [self-hosted, Linux, NVIDIA] + ggml-ci-nvidia-webgpu: + runs-on: [self-hosted, Linux, NVIDIA] - # steps: - # - name: Clone - # id: checkout - # uses: actions/checkout@v6 + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 - # - name: Dawn Dependency - # id: dawn-depends - # run: | - # DAWN_VERSION="v20260317.182325" - # DAWN_OWNER="google" - # DAWN_REPO="dawn" - # DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release" - # echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - # curl -L -o artifact.tar.gz \ - # "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - # mkdir dawn - # tar -xvf artifact.tar.gz -C dawn --strip-components=1 + - name: Dawn Dependency + id: dawn-depends + run: | + DAWN_VERSION="v20260317.182325" + DAWN_OWNER="google" + DAWN_REPO="dawn" + DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release" + echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + curl -L -o artifact.tar.gz \ + "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + mkdir dawn + tar -xvf artifact.tar.gz -C dawn --strip-components=1 - # - name: Test - # id: ggml-ci - # run: | - # GG_BUILD_WEBGPU=1 \ - # GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ - # GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \ - # bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + - name: Test + id: ggml-ci + run: | + GG_BUILD_WEBGPU=1 \ + GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ + GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \ + bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp # TODO: provision AMX-compatible machine #ggml-ci-cpu-amx: diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 33311948669..c60f6d2c2b0 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1128,7 +1128,11 @@ struct test_case { } virtual double max_nmse_err(ggml_backend_t backend) { - GGML_UNUSED(backend); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend)); + // See https://github.com/ggml-org/llama.cpp/pull/22976 for explanation. + if (contains_f16 && strcmp(ggml_backend_reg_name(reg), "WebGPU") == 0) { + return std::max(max_nmse_err(), 1e-6); + } return max_nmse_err(); } @@ -1205,6 +1209,18 @@ struct test_case { std::vector sentinels; std::string current_op_name; + bool contains_f16 = false; + + // Used by the WebGPU backend to relax error thresholds on ops on f16 tensors + void check_for_f16_tensor(ggml_context * ctx) { + contains_f16 = false; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F16) { + contains_f16 = true; + break; + } + } + } void add_sentinel(ggml_context * ctx) { if (mode == MODE_PERF || mode == MODE_GRAD || mode == MODE_SUPPORT) { @@ -1298,6 +1314,7 @@ struct test_case { ggml_tensor * out = build_graph(ctx); current_op_name = op_desc(out); + check_for_f16_tensor(ctx); if (!matches_filter(out, op_names_filter)) { //printf(" %s: skipping\n", op_desc(out).c_str()); @@ -1973,9 +1990,19 @@ struct test_unary : public test_case { } void initialize_tensors(ggml_context * ctx) override { + float min = -150.f; + float max = 150.f; + + // Keep FP16 exp/expm1 inputs in-range so all backends stay finite instead of + // disagreeing on whether overflow saturates to max-F16 or produces +inf. + if (type == GGML_TYPE_F16 && (op == GGML_UNARY_OP_EXP || op == GGML_UNARY_OP_EXPM1)) { + min = -10.f; + max = 10.f; + } + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { // test extended range of values to check for NaNs in GELU - init_tensor_uniform(t, -150.f, 150.f); + init_tensor_uniform(t, min, max); } }