@@ -356,33 +356,6 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
356356    }
357357}
358358
359- template <typename  T>
360- static  void  gated_op_fused_geglu (const  T * x, const  T * g, T * dst, const  uint64_t  k, const  uint64_t  n, const  uint64_t  o0, const  uint64_t  o1, const  sycl::nd_item<1 > &item_ct1) {
361-     SYCL_GLOBAL_ID_LOOP (k, item_ct1) {
362-         const  int64_t  j0 = (i / n) * o0 + (i % n);
363-         const  int64_t  j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
364-         dst[i] = op_gelu (x[j0]) * g[j1];
365-     }
366- }
367- 
368- template <typename  T>
369- static  void  gated_op_fused_reglu (const  T * x, const  T * g, T * dst, const  uint64_t  k, const  uint64_t  n, const  uint64_t  o0, const  uint64_t  o1, const  sycl::nd_item<1 > &item_ct1) {
370-     SYCL_GLOBAL_ID_LOOP (k, item_ct1) {
371-         const  int64_t  j0 = (i / n) * o0 + (i % n);
372-         const  int64_t  j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
373-         dst[i] = op_relu (x[j0]) * g[j1];
374-     }
375- }
376- 
377- template <typename  T>
378- static  void  gated_op_fused_swiglu (const  T * x, const  T * g, T * dst, const  uint64_t  k, const  uint64_t  n, const  uint64_t  o0, const  uint64_t  o1, const  sycl::nd_item<1 > &item_ct1) {
379-     SYCL_GLOBAL_ID_LOOP (k, item_ct1)  {
380-         const  int64_t  j0 = (i / n) * o0 + (i % n);
381-         const  int64_t  j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
382-         dst[i] = op_silu (x[j0]) * g[j1];
383-     }
384- }
385- 
386359namespace  ggml_sycl_detail  {
387360static  void  acc_f32_sycl (const  float  *x, const  float  *y, float  *dst,
388361                         const  int  n_elements, const  int  ne10, const  int  ne11,
@@ -457,85 +430,6 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx,
457430    }
458431}
459432
460- template <typename  KernelInvoker, typename ... Args>
461- static  inline  void  dispatch_ggml_sycl_op_fused_glu (ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
462- #if  defined (GGML_SYCL_F16)
463-     GGML_ASSERT (dst->src [0 ]->type  == GGML_TYPE_F32 || dst->src [0 ]->type  == GGML_TYPE_F16);
464-     GGML_ASSERT (dst->type  == GGML_TYPE_F32 || dst->type  == GGML_TYPE_F16);
465- #else 
466-     GGML_ASSERT (dst->src [0 ]->type  == GGML_TYPE_F32);
467-     GGML_ASSERT (dst->type  == GGML_TYPE_F32);
468- #endif 
469-     GGML_ASSERT (dst->src [0 ]->type  == dst->type );
470-     dpct::queue_ptr main_stream = ctx.stream ();
471-     SYCL_CHECK (ggml_sycl_set_device (ctx.device ));
472-     const  ggml_tensor * src0 = dst->src [0 ];
473-     const  ggml_tensor * src1 = dst->src [1 ];
474-     const  int64_t  nc = src1 ? src0->ne [0 ] : src0->ne [0 ] / 2 ;;
475-     GGML_ASSERT (dst->ne [0 ] == nc);
476-     GGML_ASSERT (ggml_is_contiguous_1 (dst->src [0 ]));
477-     GGML_ASSERT (ggml_is_contiguous (dst));
478-     const  int32_t  swapped = ((const  int32_t  *) dst->op_params )[1 ];
479-     void  * src0_d = src0->data ;
480-     void  * src1_d = src1 ? src1->data  : src0->data ;
481-     const  int64_t  src0_o = src0->nb [1 ];
482-     const  int64_t  src1_o = src1 ? src1->nb [1 ] : src0->nb [1 ];
483-     void  * dst_d = dst->data ;
484-     if  (src1) {
485-         GGML_ASSERT (ggml_is_contiguous_1 (src1));
486-         GGML_ASSERT (src1->nb [0 ] == ggml_element_size (src1));
487-         GGML_ASSERT (src1->ne [0 ] == nc);
488-         GGML_ASSERT (src0->type  == src1->type );
489-     }
490-     switch  (dst->type ) {
491- #if  defined (GGML_SYCL_F16)
492-         case  GGML_TYPE_F16:
493-             {
494-                 sycl::half * src0_p = (sycl::half *) src0_d;
495-                 sycl::half * src1_p = (sycl::half *) src1_d;
496- 
497-                     if  (!src1) {
498-                         src0_p += swapped ? nc : 0 ;
499-                         src1_p += swapped ? 0  : nc;
500-                     }
501-                 kernel_invoker (src0_p,
502-                                src1_p,
503-                                (sycl::half *) dst_d,
504-                                ggml_nelements (dst),
505-                                nc,
506-                                src0_o / sizeof (sycl::half),
507-                                src1_o / sizeof (sycl::half),
508-                                main_stream,
509-                                std::forward<Args>(args)...);
510-                 break ;
511-             }
512- #endif 
513-         case  GGML_TYPE_F32:
514-             {
515-                 float  * src0_p = (float  *) src0_d;
516-                 float  * src1_p = (float  *) src1_d;
517- 
518-                     if  (!src1) {
519-                         src0_p += swapped ? nc : 0 ;
520-                         src1_p += swapped ? 0  : nc;
521-                     }
522- 
523-                 kernel_invoker (src0_p,
524-                                src1_p,
525-                                (float  *) dst_d,
526-                                ggml_nelements (dst),
527-                                nc,
528-                                src0_o / sizeof (float ),
529-                                src1_o / sizeof (float ),
530-                                main_stream,
531-                                std::forward<Args>(args)...);
532-                 break ;
533-             }
534-         default :
 535-             GGML_ABORT (" GGML tensor type not supported!\n ");
536-     }
537- }
538- 
539433template <typename  KernelInvoker, typename ... Args>
540434static  inline  void  dispatch_ggml_sycl_op_upscale (ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
541435#if  defined (GGML_SYCL_F16)
@@ -945,40 +839,6 @@ static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor
945839    ggml_sycl_detail::acc_f32_sycl (src0_dd, src1_dd, dst_dd, (int )ggml_nelements (dst), (int )dst->src [1 ]->ne [0 ], (int )dst->src [1 ]->ne [1 ], (int )dst->src [1 ]->ne [2 ], nb1, nb2, offset, main_stream);
946840}
947841
948- static  inline  void  ggml_sycl_op_geglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
949-     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu (ctx, dst,
950-         [](const  auto * x_ptr, const  auto * g_ptr, auto * dst_ptr, uint64_t  k, uint64_t  n, uint64_t  o0, uint64_t  o1, queue_ptr main_stream) {
951-             const  uint32_t  num_blocks = ceil_div (k, SYCL_GELU_BLOCK_SIZE);
952-             sycl_parallel_for (main_stream,
953-                     sycl::nd_range<1 >((num_blocks * sycl::range<1 >(SYCL_GELU_BLOCK_SIZE)), sycl::range<1 >(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1 > item_ct1) {
954-                 gated_op_fused_geglu (x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
955-             });
956-         });
957- }
958- 
959- static  inline  void  ggml_sycl_op_reglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
960-     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu (ctx, dst,
961-         [](const  auto * x_ptr, const  auto * g_ptr, auto * dst_ptr, uint64_t  k, uint64_t  n, uint64_t  o0, uint64_t  o1, queue_ptr main_stream) {
962-             const  uint32_t  num_blocks = ceil_div ((uint32_t )k, SYCL_RELU_BLOCK_SIZE); //  Using RELU block size for reglu
963-             sycl_parallel_for (main_stream,
964-                     sycl::nd_range<1 >((num_blocks * sycl::range<1 >(SYCL_RELU_BLOCK_SIZE)), sycl::range<1 >(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1 > item_ct1) {
965-                 gated_op_fused_reglu (x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
966-             });
967-         });
968- }
969- 
970- static  inline  void  ggml_sycl_op_swiglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
971-     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu (ctx, dst,
972-         [](const  auto * x_ptr, const  auto * g_ptr, auto * dst_ptr, uint64_t  k, uint64_t  n, uint64_t  o0, uint64_t  o1, queue_ptr main_stream) {
973-             const  uint32_t  num_blocks = ceil_div ((uint32_t )k, SYCL_SILU_BLOCK_SIZE); //  Using SILU block size for swiglu
974-             sycl_parallel_for (main_stream,
975-                     sycl::nd_range<1 >((num_blocks * sycl::range<1 >(SYCL_SILU_BLOCK_SIZE)), sycl::range<1 >(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1 > item_ct1) {
976-                 gated_op_fused_swiglu (x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
977-             });
978-         });
979- }
980- 
981- 
982842void  ggml_sycl_sqrt (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
983843    scope_op_debug_print scope_dbg_print (__func__, dst, /* num_src=*/ 1 );
984844    ggml_sycl_op_sqrt (ctx, dst);
@@ -1104,17 +964,3 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1104964    ggml_sycl_op_elu (ctx, dst);
1105965}
1106966
1107- void  ggml_sycl_geglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1108-     scope_op_debug_print scope_dbg_print (__func__, dst, /* num_src=*/ 1 );
1109-     ggml_sycl_op_geglu (ctx, dst);
1110- }
1111- 
1112- void  ggml_sycl_reglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1113-     scope_op_debug_print scope_dbg_print (__func__, dst, /* num_src=*/ 1 );
1114-     ggml_sycl_op_reglu (ctx, dst);
1115- }
1116- 
1117- void  ggml_sycl_swiglu (ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1118-     scope_op_debug_print scope_dbg_print (__func__, dst, /* num_src=*/ 1 );
1119-     ggml_sycl_op_swiglu (ctx, dst);
1120- }
0 commit comments