@@ -453,7 +453,7 @@ struct ggml_backend_opencl_context {
453453    cl_kernel kernel_mul_mat_f16_f32_tiled;
454454    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
455455    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
456-     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4;
456+     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans ;
457457    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
458458    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
459459    cl_kernel kernel_convert_block_q4_0_noshuffle;
@@ -780,6 +780,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
780780        CL_CHECK ((backend_ctx->kernel_restore_block_q4_0   = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_q4_0" 
781781        CL_CHECK ((backend_ctx->kernel_convert_block_mxfp4  = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_mxfp4" 
782782        CL_CHECK ((backend_ctx->kernel_convert_block_mxfp4_trans  = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_mxfp4_trans" 
783+         CL_CHECK ((backend_ctx->kernel_restore_block_mxfp4_trans  = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_mxfp4_trans" 
783784        CL_CHECK ((backend_ctx->kernel_restore_block_mxfp4  = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_mxfp4" 
784785        CL_CHECK ((backend_ctx->kernel_convert_block_q8_0   = clCreateKernel (backend_ctx->program_cvt , " kernel_convert_block_q8_0" 
785786        CL_CHECK ((backend_ctx->kernel_restore_block_q8_0   = clCreateKernel (backend_ctx->program_cvt , " kernel_restore_block_q8_0" 
@@ -3338,6 +3339,11 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
33383339            tensor->ne [2 ] == 1  && tensor->ne [3 ] == 1 ;
33393340}
33403341
3342+ inline  bool  use_adreno_moe_kernels (const  ggml_backend_opencl_context *backend_ctx, const  ggml_tensor *tensor) {
3343+     int  ne01 = tensor->ne [1 ];
3344+     return  ((strstr (tensor->name , " ffn" NULL ) || (strstr (tensor->name , " as" NULL )) && (ne01 % 64  == 0 );
3345+ }
3346+ 
33413347static  void  ggml_backend_opencl_buffer_set_tensor (ggml_backend_buffer_t  buffer, ggml_tensor * tensor, const  void  * data, size_t  offset, size_t  size) {
33423348    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init (buffer->buft ->device );
33433349
@@ -3641,13 +3647,12 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
36413647        CL_CHECK (err);
36423648
36433649#ifdef  GGML_OPENCL_USE_ADRENO_KERNELS
3644-         if  (strstr (tensor->name , " ffn" NULL ) {
3650+         if  (use_adreno_moe_kernels (backend_ctx, tensor)) {
3651+             cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans ;
3652+ 
36453653            int  ne00 = tensor->ne [0 ];
36463654            int  ne01 = tensor->ne [1 ];
36473655            int  ne02 = tensor->ne [2 ];
3648- 
3649-             cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans ;
3650- 
36513656            CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &data_device));
36523657            CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &extra->q ));
36533658            CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra->e ));
@@ -3815,6 +3820,33 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
38153820            ggml_nbytes (tensor), NULL , &err);
38163821        CL_CHECK (err);
38173822
3823+ #ifdef  GGML_OPENCL_USE_ADRENO_KERNELS
3824+         if  (use_adreno_moe_kernels (backend_ctx, tensor)) {
3825+             cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans ;
3826+ 
3827+             int  ne00 = tensor->ne [0 ];
3828+             int  ne01 = tensor->ne [1 ];
3829+             int  ne02 = tensor->ne [2 ];
3830+             CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra->q ));
3831+             CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &extra->e ));
3832+             CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &data_device));
3833+             CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_int), &ne00));
3834+             CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_int), &ne01));
3835+ 
3836+             size_t  global_work_size[3 ] = {static_cast <size_t >(((ne01 + 63 ) / 64 ) * 64 ), static_cast <size_t >(ne00 / 32 ), static_cast <size_t >(ne02)};
3837+             size_t  local_work_size[3 ] = {64 , 2 , 1 };
3838+ 
3839+             cl_event evt;
3840+             CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL ,
3841+                 global_work_size, local_work_size, 0 , NULL , &evt));
3842+             CL_CHECK (clWaitForEvents (1 , &evt));
3843+             CL_CHECK (clEnqueueReadBuffer (
3844+                 queue, data_device, CL_TRUE, offset,
3845+                 size, data, 0 , NULL , NULL ));
3846+             CL_CHECK (clReleaseMemObject (data_device));
3847+             return ;
3848+         }
3849+ #endif 
38183850        cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4 ;
38193851        CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra->q ));
38203852        CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_mem), &extra->e ));
@@ -7766,6 +7798,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
77667798
77677799                cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
77687800
7801+                 int  tile_size = 320 ;
77697802                if  (ne12 == 1 ) { //  for gemv
77707803                    kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32 ;
77717804
@@ -7785,7 +7818,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
77857818                    kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32 ;
77867819
77877820                    //  preprocess router table
7788-                     int  tile_size = 320 ;
77897821                    int  num_tiles_per_expert = (ne01 + tile_size - 1 ) / tile_size;
77907822                    void  * host_src2_reorder = malloc (ne20 * ne21 * 4  * num_tiles_per_expert * sizeof (short ));
77917823                    void  * host_src2 = malloc (ne21 * nb21);
@@ -7842,7 +7874,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
78427874                if  (ne12 == 1 ) {
78437875                    CL_CHECK (clSetKernelArg (kernel, arg_idx++, sizeof (int ),       &ne11));
78447876                } else  {
7845-                     CL_CHECK (clSetKernelArg (kernel, arg_idx++, sizeof (int ),       &ne02 ));
7877+                     CL_CHECK (clSetKernelArg (kernel, arg_idx++, sizeof (int ),       &tile_size ));
78467878                }
78477879
78487880                //  launch kernel
0 commit comments