@@ -95,9 +95,51 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
 }
 
 template <float (*op)(float), typename T>
-static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+static __global__ void unary_op_kernel_noncont(
+    const void * x, void * dst,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+    const int64_t nb0_x, const int64_t nb1_x, const int64_t nb2_x, const int64_t nb3_x,
+    const int64_t nb0_d, const int64_t nb1_d, const int64_t nb2_d, const int64_t nb3_d,
+    const int64_t k) {
+
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t i3 = i / (ne2 * ne1 * ne0);
+    const int64_t i2 = (i / (ne1 * ne0)) % ne2;
+    const int64_t i1 = (i / ne0) % ne1;
+    const int64_t i0 = i % ne0;
+
+    const int64_t offset_x = i0*nb0_x + i1*nb1_x + i2*nb2_x + i3*nb3_x;
+    const int64_t offset_d = i0*nb0_d + i1*nb1_d + i2*nb2_d + i3*nb3_d;
+
+    const T * px = (const T *)((const char *)x + offset_x);
+    T * pd = (T *)((char *)dst + offset_d);
+
+    *pd = (T)op((float)*px);
+}
+
+template <float (*op)(float), typename T>
+static void unary_cuda(const T * x, T * dst, const int k,
+                      const ggml_tensor * src, const ggml_tensor * dst_tensor,
+                      cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+
+    if (ggml_is_contiguous(src) && ggml_is_contiguous(dst_tensor)) {
+        unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    } else {
+        unary_op_kernel_noncont<op, T><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(
+            (const void *)x, (void *)dst,
+            src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[0], src->nb[1], src->nb[2], src->nb[3],
+            dst_tensor->nb[0], dst_tensor->nb[1],
+            dst_tensor->nb[2], dst_tensor->nb[3],
+            k
+        );
+    }
 }
 
 template <float (*op)(float)>
@@ -107,16 +149,16 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
 
     if (src0->type == GGML_TYPE_F16) {
-        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0),
+                      src0, dst, stream);
     } else {
-        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0),
+                      src0, dst, stream);
     }
 }
 
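A minimal CPU-side sketch (not part of the patch) of the addressing scheme used by unary_op_kernel_noncont: each linear index i is unraveled into coordinates (i0, i1, i2, i3) with the innermost dimension varying fastest, and each coordinate is scaled by a per-dimension byte stride, which is what lets the kernel read permuted (e.g. transposed) views. The shapes and strides below are hypothetical values chosen for illustration, following ggml's convention that ne[] holds the logical shape and nb[] holds byte strides.

#include <cstdint>
#include <cstdio>

static float neg_op(float v) { return -v; }

int main() {
    // Logical 3x2 view of a 2x3 row-major f32 buffer, i.e. a transposed (non-contiguous) src.
    const int64_t ne[4]   = {3, 2, 1, 1};    // logical shape, ne[0] fastest
    const int64_t nb_x[4] = {8, 4, 24, 24};  // permuted src strides in bytes (hypothetical)
    const int64_t nb_d[4] = {4, 12, 24, 24}; // contiguous f32 dst strides in bytes

    float x[6]   = {1, 2, 3, 4, 5, 6};
    float dst[6] = {0};

    const int64_t k = ne[0]*ne[1]*ne[2]*ne[3];
    for (int64_t i = 0; i < k; ++i) {
        // same unraveling as the kernel
        const int64_t i3 = i / (ne[2]*ne[1]*ne[0]);
        const int64_t i2 = (i / (ne[1]*ne[0])) % ne[2];
        const int64_t i1 = (i / ne[0]) % ne[1];
        const int64_t i0 = i % ne[0];

        // byte-offset arithmetic on char pointers, as in the kernel
        const char * px = (const char *)x   + i0*nb_x[0] + i1*nb_x[1] + i2*nb_x[2] + i3*nb_x[3];
        char       * pd = (char       *)dst + i0*nb_d[0] + i1*nb_d[1] + i2*nb_d[2] + i3*nb_d[3];

        *(float *)pd = neg_op(*(const float *)px);
    }

    for (int64_t i = 0; i < k; ++i) {
        printf("%g ", dst[i]); // prints: -1 -3 -5 -2 -4 -6
    }
    printf("\n");
    return 0;
}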
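For completeness, a hypothetical helper illustrating the property that ggml_is_contiguous checks and on which unary_cuda's dispatch relies: a tensor is contiguous when each byte stride equals the element size times the product of all faster-varying dimensions. This is an illustrative reimplementation for the non-quantized case, not the library code.

#include <cstdint>
#include <cstdio>

// Returns true when nb[] describes a dense row-major layout for shape ne[]
// with the given element size (illustrative; ignores quantized block types).
static bool is_contiguous_4d(const int64_t ne[4], const int64_t nb[4], int64_t type_size) {
    int64_t expected = type_size;
    for (int d = 0; d < 4; ++d) {
        if (nb[d] != expected) {
            return false;
        }
        expected *= ne[d];
    }
    return true;
}

int main() {
    const int64_t ne[4]      = {3, 2, 1, 1};
    const int64_t nb_cont[4] = {4, 12, 24, 24}; // dense f32 layout -> contiguous
    const int64_t nb_perm[4] = {8, 4, 24, 24};  // transposed view  -> not contiguous
    printf("%d %d\n", is_contiguous_4d(ne, nb_cont, 4), is_contiguous_4d(ne, nb_perm, 4)); // 1 0
    return 0;
}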