@@ -95,9 +95,51 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
 }
 
 template <float (*op)(float), typename T>
-static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+static __global__ void unary_op_kernel_noncont(
+    const void * x, void * dst,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+    const int64_t nb0_x, const int64_t nb1_x, const int64_t nb2_x, const int64_t nb3_x,
+    const int64_t nb0_d, const int64_t nb1_d, const int64_t nb2_d, const int64_t nb3_d,
+    const int64_t k) {
+
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t i3 = i / (ne2 * ne1 * ne0);
+    const int64_t i2 = (i / (ne1 * ne0)) % ne2;
+    const int64_t i1 = (i / ne0) % ne1;
+    const int64_t i0 = i % ne0;
+
+    const int64_t offset_x = i0*nb0_x + i1*nb1_x + i2*nb2_x + i3*nb3_x;
+    const int64_t offset_d = i0*nb0_d + i1*nb1_d + i2*nb2_d + i3*nb3_d;
+
+    const T * px = (const T *)((const char *)x + offset_x);
+    T * pd = (T *)((char *)dst + offset_d);
+
+    *pd = (T)op((float)*px);
+}
+
+template <float (*op)(float), typename T>
+static void unary_cuda(const T * x, T * dst, const int k,
+                      const ggml_tensor * src, const ggml_tensor * dst_tensor,
+                      cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+
+    if (ggml_is_contiguous(src) && ggml_is_contiguous(dst_tensor)) {
+        unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    } else {
+        unary_op_kernel_noncont<op, T><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(
+            (const void *)x, (void *)dst,
+            src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[0], src->nb[1], src->nb[2], src->nb[3],
+            dst_tensor->nb[0], dst_tensor->nb[1],
+            dst_tensor->nb[2], dst_tensor->nb[3],
+            k
+        );
+    }
 }
 
 template <float (*op)(float)>
@@ -107,16 +149,16 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
 
     if (src0->type == GGML_TYPE_F16) {
-        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0),
+                      src0, dst, stream);
     } else {
-        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0),
+                      src0, dst, stream);
     }
 }
 
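A minimal CPU-side sketch (not part of the patch) of the addressing scheme used by unary_op_kernel_noncont: each linear index i is unraveled into coordinates (i0, i1, i2, i3) with the innermost dimension varying fastest, and each coordinate is scaled by a per-dimension byte stride, which is what lets the kernel read permuted (e.g. transposed) views. The shapes and strides below are hypothetical values chosen for illustration, following ggml's convention that ne[] holds the logical shape and nb[] holds byte strides.

#include <cstdint>
#include <cstdio>

static float neg_op(float v) { return -v; }

int main() {
    // Logical 3x2 view of a 2x3 row-major f32 buffer, i.e. a transposed (non-contiguous) src.
    const int64_t ne[4]   = {3, 2, 1, 1};    // logical shape, ne[0] fastest
    const int64_t nb_x[4] = {8, 4, 24, 24};  // permuted src strides in bytes (hypothetical)
    const int64_t nb_d[4] = {4, 12, 24, 24}; // contiguous f32 dst strides in bytes

    float x[6]   = {1, 2, 3, 4, 5, 6};
    float dst[6] = {0};

    const int64_t k = ne[0]*ne[1]*ne[2]*ne[3];
    for (int64_t i = 0; i < k; ++i) {
        // same unraveling as the kernel
        const int64_t i3 = i / (ne[2]*ne[1]*ne[0]);
        const int64_t i2 = (i / (ne[1]*ne[0])) % ne[2];
        const int64_t i1 = (i / ne[0]) % ne[1];
        const int64_t i0 = i % ne[0];

        // byte-offset arithmetic on char pointers, as in the kernel
        const char * px = (const char *)x   + i0*nb_x[0] + i1*nb_x[1] + i2*nb_x[2] + i3*nb_x[3];
        char       * pd = (char       *)dst + i0*nb_d[0] + i1*nb_d[1] + i2*nb_d[2] + i3*nb_d[3];

        *(float *)pd = neg_op(*(const float *)px);
    }

    for (int64_t i = 0; i < k; ++i) {
        printf("%g ", dst[i]); // prints: -1 -3 -5 -2 -4 -6
    }
    printf("\n");
    return 0;
}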
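For completeness, a hypothetical helper illustrating the property that ggml_is_contiguous checks and on which unary_cuda's dispatch relies: a tensor is contiguous when each byte stride equals the element size times the product of all faster-varying dimensions. This is an illustrative reimplementation for the non-quantized case, not the library code.

#include <cstdint>
#include <cstdio>

// Returns true when nb[] describes a dense row-major layout for shape ne[]
// with the given element size (illustrative; ignores quantized block types).
static bool is_contiguous_4d(const int64_t ne[4], const int64_t nb[4], int64_t type_size) {
    int64_t expected = type_size;
    for (int d = 0; d < 4; ++d) {
        if (nb[d] != expected) {
            return false;
        }
        expected *= ne[d];
    }
    return true;
}

int main() {
    const int64_t ne[4]      = {3, 2, 1, 1};
    const int64_t nb_cont[4] = {4, 12, 24, 24}; // dense f32 layout -> contiguous
    const int64_t nb_perm[4] = {8, 4, 24, 24};  // transposed view  -> not contiguous
    printf("%d %d\n", is_contiguous_4d(ne, nb_cont, 4), is_contiguous_4d(ne, nb_perm, 4)); // 1 0
    return 0;
}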