16 changes: 3 additions & 13 deletions onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -12,21 +12,12 @@ Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const
   shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
   shader.AddOutput("output", ShaderUsage::UseUniform);
   shader.AddOutput("scales", ShaderUsage::UseUniform);
-  shader.AdditionalImplementation() << R"ADDNL_FN(
-    fn readInput(offset: u32) -> input_a_value_t
-    {
-      if (offset > uniforms.input_size) {
-        return input_a_value_t(0);
-      }
-      return input_a[offset];
-    }
-  )ADDNL_FN";
   shader.MainFunctionBody() << R"MAIN_FN(
     var local_a : array<vec4<input_a_element_t>, 32>;
     var max_value:vec4<input_a_element_t> = vec4<input_a_element_t>(0);
     for (var idx:u32=0;idx<32;idx+=1)
     {
-      local_a[idx] = readInput(workgroup_idx*32 + idx);
+      local_a[idx] = input_a[workgroup_idx*32 + idx];
       max_value = max(max_value, abs(local_a[idx]));
     }
     var scale = max(max_value.x, max_value.y);
@@ -279,8 +270,7 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
   Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
   quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)}})
       .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
-                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}})
-      .AddUniformVariable({static_cast<uint32_t>(M * K / kVec4Components)});
+                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}});
   ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));

   constexpr uint32_t kTileSize = 64;
@@ -317,7 +307,7 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context,
   bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) &&
                   context.AdapterInfo().backendType != wgpu::BackendType::Metal;
   return (accuracy_level == 4 && block_size % 32 == 0 &&
-          batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 &&
+          batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 &&
           !has_zero_points && use_dp4a);
 }

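Taken together, the two hunks above read as a bounds-check removal justified by a tighter shape gate: each quantize workgroup loads 32 vec4 values of input_a, i.e. 128 scalar elements along K, so once K % 128 == 0 is required the vec4 count M * K / 4 is a multiple of 32 for any M, every input_a[workgroup_idx*32 + idx] access stays in bounds, and readInput plus the input_size uniform become unnecessary. A minimal C++ sketch of that arithmetic (not part of the PR), with a hypothetical helper name and assuming one workgroup per 32 vec4 loads:

```cpp
// Hedged sketch (not part of the PR): bounds reasoning behind dropping the
// readInput() guard and the input_size uniform. Assumes the quantize pass
// covers all M * K / 4 vec4 elements of input_a with one workgroup per 32
// vec4 loads, as the workgroup_idx*32 + idx indexing suggests.
#include <cassert>
#include <cstdint>

// Hypothetical helper, not an ONNX Runtime API.
bool QuantizeReadsStayInBounds(uint32_t M, uint32_t K) {
  const uint32_t num_vec4 = M * K / 4;  // vec4 elements in input_a
  // Each workgroup reads indices workgroup_idx*32 + 0..31, so all reads are
  // in bounds exactly when num_vec4 is a multiple of 32.
  return num_vec4 % 32 == 0;
}

int main() {
  // Under the old gate (K % 64 == 0), K = 64 with M = 1 gives 16 vec4s, so the
  // last loads of the single workgroup needed the readInput guard.
  assert(!QuantizeReadsStayInBounds(1, 64));
  // Under the new gate (K % 128 == 0), K / 4 is already a multiple of 32, so
  // M * K / 4 is too for any M and the guard can go.
  assert(QuantizeReadsStayInBounds(1, 128));
  assert(QuantizeReadsStayInBounds(100, 1024));
  return 0;
}
```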
onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
@@ -16,7 +16,6 @@ class DP4AMatMulQuantizeProgram final : public Program<DP4AMatMulQuantizeProgram
  public:
   DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {}
   Status GenerateShaderCode(ShaderHelper& sh) const override;
-  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"input_size", ProgramUniformVariableDataType::Uint32});
 };

 class DP4AMatMulNBitsProgram final : public Program<DP4AMatMulNBitsProgram> {
2 changes: 2 additions & 0 deletions onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -389,6 +389,7 @@ TEST(MatMulNBits, Float32_Accuracy4) {
   TestMatMulNBitsTyped<float, 100, 288, 16, 16, 4>();
   TestMatMulNBitsTyped<float, 100, 288, 1024, 16, 4>();
   TestMatMulNBitsTyped<float, 100, 288, 1024, 128, 4>();
+  TestMatMulNBitsTyped<float, 100, 288, 192, 64, 4>();
   TestMatMulNBitsTyped<float, 100, 288, 93, 32, 4>();
   TestMatMulNBitsTyped<float, 100, 288, 93, 128, 4>();
   TestMatMulNBitsTyped<float, 100, 288, 1234, 16, 4>();
@@ -458,6 +459,7 @@ TEST(MatMulNBits, Float16_Accuracy4) {
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 16, 16, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 1024, 16, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 1024, 128, 4>();
+  TestMatMulNBitsTyped<MLFloat16, 100, 288, 192, 64, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 93, 32, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 93, 128, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 1234, 16, 4>();
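The added test cases use M=100, N=288, K=192, block_size=64 (assuming the apparent <AType, M, N, K, block_size, accuracy_level> parameter order), i.e. a K that is a multiple of 64 but not of 128, so under the tightened gate they presumably exercise the non-DP4A fallback path on WebGPU while still checking accuracy-level-4 results. A rough sketch of how the shape clauses of the gate classify these shapes, mirroring only those clauses rather than the real function:

```cpp
// Hedged sketch (not the real CanApplyDP4AMatrixMatMulNBits): only the
// shape-dependent clauses of the gate, to show how the new test shapes fall.
#include <cstdint>
#include <iostream>

bool ShapeEligibleForDP4A(uint32_t K, uint32_t N, uint32_t block_size) {
  return block_size % 32 == 0 && K % 128 == 0 && N % 16 == 0;
}

int main() {
  // New test case: K=192, N=288, block_size=64 -> not eligible (K % 128 != 0),
  // so it should run on the generic MatMulNBits path.
  std::cout << ShapeEligibleForDP4A(192, 288, 64) << "\n";    // prints 0
  // Existing case: K=1024, N=288, block_size=128 -> still eligible.
  std::cout << ShapeEligibleForDP4A(1024, 288, 128) << "\n";  // prints 1
  return 0;
}
```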