diff --git a/include/ggml.h b/include/ggml.h index 11d3e8a816..75710dbe39 100644 --- a/include/ggml.h +++ b/include/ggml.h @@ -577,6 +577,17 @@ extern "C" { GGML_OP_GLU, + // Supertonic-specific fused ops (QVAC overlay). These collapse + // multi-op sub-graphs that the Supertonic ggml port emits per + // ConvNeXt block / attention block, reducing per-step Metal + // command-buffer encode overhead. See + // tts-cpp/cmake/vcpkg-overlay-ports/ggml/ggml-supertonic-ops.patch. + GGML_OP_SUPERTONIC_DEPTHWISE_1D, + GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL, + GGML_OP_SUPERTONIC_PW2_RESIDUAL, + GGML_OP_SUPERTONIC_BIAS_GELU, + GGML_OP_SUPERTONIC_EDGE_PAD_1D, + GGML_OP_COUNT, }; @@ -2252,6 +2263,138 @@ extern "C" { int p0, int p1); + // Supertonic fused depthwise 1D convolution with edge-clamp (replicate) + // padding and bias add. Per-channel filter of width K applied to the + // time dim of a (ne0 = L, ne1 = C, ne2 = 1, ne3 = 1). + // y[t, c] = bias[c] + // + sum_{k=0..K-1} a[clamp(t + (k - K/2)*dilation, 0, L-1), c] + // * w[k, c] + // + // a: [L, C] f32 contiguous + // w: [K, C] f32 contiguous (or [K, 1, C, 1] from a depthwise conv weight) + // bias: [C] f32 contiguous (may be NULL — pass GGML_NULL_TENSOR to skip) + // dilation: positive integer + // + // Output ne=[L, C] matching a's shape. Currently supports K in {3, 5}. + GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation); + + // [C, T]-layout variant: a is [C, L, 1, 1] with C inner-most. Same kernel, + // strides flipped via a layout flag in op_params. Full B2 path. + GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation); + + // [C, T]-layout + causal-left padding variant. Used by the vocoder + // ConvNeXt chain. K may be 3, 5, or 7. + GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d_causal_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation); + + // Supertonic fused channel-axis layer norm. Normalises across the + // channel dim (ne[1]) of a [L, C, 1, 1] tensor and applies an affine + // scale + shift, all in one Metal dispatch. Replaces the + // permute + cont + ggml_norm + mul + add + permute + cont chain that + // stock ggml_norm requires (since it normalises along ne[0]). + // + // y[t, c] = ((a[t, c] - mean_t) / sqrt(var_t + eps)) * g[c] + b[c] + // + // a: [L, C, 1, 1] f32 contiguous + // g: [C] f32 contiguous (scale) + // b: [C] f32 contiguous (shift) + // eps: numerical epsilon (passed as float op param) + GGML_API struct ggml_tensor * ggml_supertonic_layer_norm_channel( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * g, + struct ggml_tensor * b, + float eps); + + // [C, T]-layout variant: a is [C, T, 1, 1] with C inner-most. g and b + // still have length C (== a->ne[0] here, vs == a->ne[1] in the [T, C] + // variant). Same op + Metal kernel under the hood — the kernel reads + // per-axis strides from kargs, so this variant just sets a layout + // flag so the dispatch passes the right strides. Phase B2 path. + GGML_API struct ggml_tensor * ggml_supertonic_layer_norm_channel_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * g, + struct ggml_tensor * b, + float eps); + + // Supertonic fused (x + bias) * gamma + residual. Channel-axis + // broadcasts for `bias` and `gamma` (both [C]); `x` and `residual` + // are [L, C, 1, 1] f32 contiguous. Output ne matches `x`. + // + // y[t, c] = residual[t, c] + (x[t, c] + bias[c]) * gamma[c] + GGML_API struct ggml_tensor * ggml_supertonic_pw2_residual( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + struct ggml_tensor * gamma, + struct ggml_tensor * residual); + + // [C, T]-layout variant. Bias/gamma still have length C (== x->ne[0] + // here, vs == x->ne[1] in the [T, C] variant). Full B2 path. + GGML_API struct ggml_tensor * ggml_supertonic_pw2_residual_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + struct ggml_tensor * gamma, + struct ggml_tensor * residual); + + // Supertonic fused bias-add + GELU (erf form, the gelu_erf in ggml). + // Channel-axis broadcasts for `bias` ([C]); `x` is [L, C, 1, 1] f32 + // contiguous. Output ne matches `x`. + // + // y[t, c] = gelu_erf(x[t, c] + bias[c]) + // = 0.5 * v * (1 + erf(v * 1/sqrt(2))) where v = x + bias + GGML_API struct ggml_tensor * ggml_supertonic_bias_gelu( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias); + + // [C, T]-layout variant. Bias still has length C (== x->ne[0] here). + GGML_API struct ggml_tensor * ggml_supertonic_bias_gelu_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias); + + // Supertonic edge-replicate padding for 1D conv inputs. Combines + // the view + repeat_4d + concat sequence used by vocoder's causal + // padding (pad_left only) and vector_estimator / text_encoder's + // symmetric edge_clamp padding (pad_left + pad_right) into one + // dispatch. Input `x` is `[L_in, C, 1, 1]` f32 contiguous; the + // output has `ne = [L_in + pad_left + pad_right, C, 1, 1]` where: + // + // y[t, c] = x[clamp(t - pad_left, 0, L_in - 1), c] + // + // (Replicate / edge-clamp semantics — the leftmost row of `x` fills + // the left pad, the rightmost row fills the right pad.) + GGML_API struct ggml_tensor * ggml_supertonic_edge_pad_1d( + struct ggml_context * ctx, + struct ggml_tensor * x, + int pad_left, + int pad_right); + + // [C, T]-layout variant. Input x is [C, L_in, 1, 1]; output is + // [C, L_in + pad_left + pad_right, 1, 1]. Full B2 path. + GGML_API struct ggml_tensor * ggml_supertonic_edge_pad_1d_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + int pad_left, + int pad_right); + // Move tensor elements by an offset given for each dimension. Elements that // are shifted beyond the last position are wrapped around to the beginning. GGML_API struct ggml_tensor * ggml_roll( diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c index 2b3eb5b5ce..1a6b14297d 100644 --- a/src/ggml-cpu/ggml-cpu.c +++ b/src/ggml-cpu/ggml-cpu.c @@ -1942,6 +1942,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; + case GGML_OP_SUPERTONIC_DEPTHWISE_1D: + { + ggml_compute_forward_supertonic_depthwise_1d(params, tensor); + } break; + case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL: + { + ggml_compute_forward_supertonic_layer_norm_channel(params, tensor); + } break; + case GGML_OP_SUPERTONIC_PW2_RESIDUAL: + { + ggml_compute_forward_supertonic_pw2_residual(params, tensor); + } break; + case GGML_OP_SUPERTONIC_BIAS_GELU: + { + ggml_compute_forward_supertonic_bias_gelu(params, tensor); + } break; + case GGML_OP_SUPERTONIC_EDGE_PAD_1D: + { + ggml_compute_forward_supertonic_edge_pad_1d(params, tensor); + } break; case GGML_OP_ROLL: { ggml_compute_forward_roll(params, tensor); @@ -2347,6 +2367,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: + case GGML_OP_SUPERTONIC_DEPTHWISE_1D: + case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL: + case GGML_OP_SUPERTONIC_PW2_RESIDUAL: + case GGML_OP_SUPERTONIC_BIAS_GELU: + case GGML_OP_SUPERTONIC_EDGE_PAD_1D: case GGML_OP_ROLL: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp index 0b5d6c6df8..332f9dd3c9 100644 --- a/src/ggml-cpu/ops.cpp +++ b/src/ggml-cpu/ops.cpp @@ -7863,6 +7863,274 @@ void ggml_compute_forward_pad_reflect_1d( } } +// ggml_compute_forward_supertonic_depthwise_1d + +void ggml_compute_forward_supertonic_depthwise_1d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * x = dst->src[0]; // [L, C, 1, 1] + const ggml_tensor * w = dst->src[1]; // [K, 1, C, 1] + const ggml_tensor * bias = dst->src[2]; // [C] or NULL + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(w->type == GGML_TYPE_F32); + GGML_ASSERT(bias == NULL || bias->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int32_t * opts = (const int32_t *) dst->op_params; + const int K = opts[0]; + const int dilation = opts[1]; + const int32_t layout = opts[2]; + const int32_t causal = opts[3]; + const int k_off = (causal != 0) ? -(K - 1) : -(K / 2); + + int L, C, sxt, sxc, syt, syc; + if (layout == 0) { + L = (int) x->ne[0]; + C = (int) x->ne[1]; + sxt = 1; sxc = L; + syt = 1; syc = L; + } else { + C = (int) x->ne[0]; + L = (int) x->ne[1]; + sxt = C; sxc = 1; + syt = C; syc = 1; + } + + const int ith = params->ith; + const int nth = params->nth; + + const float * x_data = (const float *) x->data; + const float * w_data = (const float *) w->data; + const float * b_data = bias ? (const float *) bias->data : NULL; + float * y_data = (float *) dst->data; + + for (int c = ith; c < C; c += nth) { + const float bias_v = b_data ? b_data[c] : 0.0f; + const float * w_c = w_data + (size_t) c * K; + for (int t = 0; t < L; ++t) { + float sum = bias_v; + for (int k = 0; k < K; ++k) { + int s = t + (k + k_off) * dilation; + if (s < 0) s = 0; + else if (s >= L) s = L - 1; + sum += x_data[(size_t) s * sxt + (size_t) c * sxc] * w_c[k]; + } + y_data[(size_t) t * syt + (size_t) c * syc] = sum; + } + } +} + +// ggml_compute_forward_supertonic_layer_norm_channel + +void ggml_compute_forward_supertonic_layer_norm_channel( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * x = dst->src[0]; // [L, C, 1, 1] + const ggml_tensor * g = dst->src[1]; // [C] + const ggml_tensor * b = dst->src[2]; // [C] + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(g->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float eps; + memcpy(&eps, dst->op_params, sizeof(eps)); + const int32_t layout = ((const int32_t *) dst->op_params)[1]; + + int L, C, sxt, sxc, syt, syc; + if (layout == 0) { + // [T, C]: T inner. + L = (int) x->ne[0]; + C = (int) x->ne[1]; + sxt = 1; sxc = L; + syt = 1; syc = L; + } else { + // [C, T]: C inner. + C = (int) x->ne[0]; + L = (int) x->ne[1]; + sxt = C; sxc = 1; + syt = C; syc = 1; + } + + const int ith = params->ith; + const int nth = params->nth; + + const float * x_data = (const float *) x->data; + const float * g_data = (const float *) g->data; + const float * b_data = (const float *) b->data; + float * y_data = (float *) dst->data; + + // Layout-agnostic indexing via element strides. + for (int t = ith; t < L; t += nth) { + double mean = 0.0; + for (int c = 0; c < C; ++c) mean += x_data[(size_t) t * sxt + (size_t) c * sxc]; + mean /= (double) C; + double var = 0.0; + for (int c = 0; c < C; ++c) { + const double d = (double) x_data[(size_t) t * sxt + (size_t) c * sxc] - mean; + var += d * d; + } + const float inv = 1.0f / sqrtf((float) (var / (double) C) + eps); + for (int c = 0; c < C; ++c) { + const float xv = x_data[(size_t) t * sxt + (size_t) c * sxc]; + y_data[(size_t) t * syt + (size_t) c * syc] = (xv - (float) mean) * inv * g_data[c] + b_data[c]; + } + } +} + +// ggml_compute_forward_supertonic_pw2_residual + +void ggml_compute_forward_supertonic_pw2_residual( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * x = dst->src[0]; // [L, C, 1, 1] + const ggml_tensor * bias = dst->src[1]; // [C] + const ggml_tensor * gamma = dst->src[2]; // [C] + const ggml_tensor * residual = dst->src[3]; // [L, C, 1, 1] + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(gamma->type == GGML_TYPE_F32); + GGML_ASSERT(residual->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int32_t layout = ((const int32_t *) dst->op_params)[0]; + + int L, C, sxt, sxc, syt, syc, srt, src; + if (layout == 0) { + L = (int) x->ne[0]; + C = (int) x->ne[1]; + sxt = 1; sxc = L; + syt = 1; syc = L; + srt = 1; src = L; + } else { + C = (int) x->ne[0]; + L = (int) x->ne[1]; + sxt = C; sxc = 1; + syt = C; syc = 1; + srt = C; src = 1; + } + + const int ith = params->ith; + const int nth = params->nth; + + const float * x_data = (const float *) x->data; + const float * b_data = (const float *) bias->data; + const float * g_data = (const float *) gamma->data; + const float * r_data = (const float *) residual->data; + float * y_data = (float *) dst->data; + + // Stripe over channels. For each channel c, bias and gamma are read + // once and applied across all L timesteps; layout flag flips x/y/r index + // strides between [T, C] and [C, T]. + for (int c = ith; c < C; c += nth) { + const float bv = b_data[c]; + const float gv = g_data[c]; + for (int t = 0; t < L; ++t) { + const float xv = x_data[(size_t) t * sxt + (size_t) c * sxc]; + const float rv = r_data[(size_t) t * srt + (size_t) c * src]; + y_data[(size_t) t * syt + (size_t) c * syc] = rv + (xv + bv) * gv; + } + } +} + +// ggml_compute_forward_supertonic_bias_gelu + +void ggml_compute_forward_supertonic_bias_gelu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * x = dst->src[0]; // [L, C, 1, 1] + const ggml_tensor * bias = dst->src[1]; // [C] + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int32_t layout = ((const int32_t *) dst->op_params)[0]; + + int L, C, sxt, sxc, syt, syc; + if (layout == 0) { + L = (int) x->ne[0]; + C = (int) x->ne[1]; + sxt = 1; sxc = L; + syt = 1; syc = L; + } else { + C = (int) x->ne[0]; + L = (int) x->ne[1]; + sxt = C; sxc = 1; + syt = C; syc = 1; + } + + const int ith = params->ith; + const int nth = params->nth; + + const float * x_data = (const float *) x->data; + const float * b_data = (const float *) bias->data; + float * y_data = (float *) dst->data; + + static const float inv_sqrt_2 = 0.7071067811865475f; + + for (int c = ith; c < C; c += nth) { + const float bv = b_data[c]; + for (int t = 0; t < L; ++t) { + const float v = x_data[(size_t) t * sxt + (size_t) c * sxc] + bv; + y_data[(size_t) t * syt + (size_t) c * syc] = 0.5f * v * (1.0f + erff(v * inv_sqrt_2)); + } + } +} + +// ggml_compute_forward_supertonic_edge_pad_1d + +void ggml_compute_forward_supertonic_edge_pad_1d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * x = dst->src[0]; // [L_in, C, 1, 1] or [C, L_in, 1, 1] + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int pad_left = ggml_get_op_params_i32(dst, 0); + const int32_t layout = ggml_get_op_params_i32(dst, 2); + + int L_in, L_out, C, sxt, sxc, syt, syc; + if (layout == 0) { + L_in = (int) x->ne[0]; + C = (int) x->ne[1]; + L_out = (int) dst->ne[0]; + sxt = 1; sxc = L_in; syt = 1; syc = L_out; + } else { + C = (int) x->ne[0]; + L_in = (int) x->ne[1]; + L_out = (int) dst->ne[1]; + sxt = C; sxc = 1; syt = C; syc = 1; + } + + const int ith = params->ith; + const int nth = params->nth; + + const float * x_data = (const float *) x->data; + float * y_data = (float *) dst->data; + + // Stripe over channels. For each output time t, read from + // clamp(t - pad_left, 0, L_in - 1); layout flag picks strides. + for (int c = ith; c < C; c += nth) { + for (int t = 0; t < L_out; ++t) { + int src_t = t - pad_left; + if (src_t < 0) src_t = 0; + if (src_t >= L_in) src_t = L_in - 1; + y_data[(size_t) t * syt + (size_t) c * syc] = + x_data[(size_t) src_t * sxt + (size_t) c * sxc]; + } + } +} + // ggml_compute_forward_roll static int64_t ggml_wrap_index(int64_t i, int64_t ne) { diff --git a/src/ggml-cpu/ops.h b/src/ggml-cpu/ops.h index 3fa1443abc..40d7026f9a 100644 --- a/src/ggml-cpu/ops.h +++ b/src/ggml-cpu/ops.h @@ -77,6 +77,11 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_supertonic_depthwise_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_supertonic_layer_norm_channel(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_supertonic_pw2_residual(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_supertonic_bias_gelu(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_supertonic_edge_pad_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/src/ggml-metal/ggml-metal-device.cpp b/src/ggml-metal/ggml-metal-device.cpp index 5d9336314d..14a63e0643 100644 --- a/src/ggml-metal/ggml-metal-device.cpp +++ b/src/ggml-metal/ggml-metal-device.cpp @@ -1856,6 +1856,40 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d(g return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_depthwise_1d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_SUPERTONIC_DEPTHWISE_1D); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_supertonic_depthwise_1d_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_supertonic_layer_norm_channel_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_DIAG_MASK_INF); GGML_UNUSED(op); @@ -1870,6 +1904,57 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf(gg return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_pw2_residual(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_SUPERTONIC_PW2_RESIDUAL); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_supertonic_pw2_residual_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_bias_gelu(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_SUPERTONIC_BIAS_GELU); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_supertonic_bias_gelu_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_SUPERTONIC_EDGE_PAD_1D); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_supertonic_edge_pad_1d_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_ARANGE); diff --git a/src/ggml-metal/ggml-metal-device.h b/src/ggml-metal/ggml-metal-device.h index b6d6b3d517..8bf3603af6 100644 --- a/src/ggml-metal/ggml-metal-device.h +++ b/src/ggml-metal/ggml-metal-device.h @@ -152,6 +152,11 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_depthwise_1d(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_pw2_residual(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_bias_gelu(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/src/ggml-metal/ggml-metal-device.m b/src/ggml-metal/ggml-metal-device.m index 089854e828..21e71c37e0 100644 --- a/src/ggml-metal/ggml-metal-device.m +++ b/src/ggml-metal/ggml-metal-device.m @@ -1134,6 +1134,28 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SUPERTONIC_DEPTHWISE_1D: + { + const int K = ggml_get_op_params_i32(op, 0); + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + (op->src[2] == NULL || op->src[2]->type == GGML_TYPE_F32) && + (K == 3 || K == 5 || K == 7); + } + case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->src[2]->type == GGML_TYPE_F32; + case GGML_OP_SUPERTONIC_PW2_RESIDUAL: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->src[2]->type == GGML_TYPE_F32 && + op->src[3]->type == GGML_TYPE_F32; + case GGML_OP_SUPERTONIC_BIAS_GELU: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_SUPERTONIC_EDGE_PAD_1D: + return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_DIAG_MASK_INF: return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && diff --git a/src/ggml-metal/ggml-metal-impl.h b/src/ggml-metal/ggml-metal-impl.h index 384db806ef..56d9a4538e 100644 --- a/src/ggml-metal/ggml-metal-impl.h +++ b/src/ggml-metal/ggml-metal-impl.h @@ -1027,6 +1027,62 @@ typedef struct { int32_t p1; } ggml_metal_kargs_pad_reflect_1d; +typedef struct { + int32_t L; + int32_t C; + int32_t K; + int32_t dilation; + int32_t has_bias; + int32_t causal; // 0 = symmetric edge-clamp (vector_estimator), 1 = causal-left (vocoder) + int32_t sxt; + int32_t sxc; + int32_t syt; + int32_t syc; +} ggml_metal_kargs_supertonic_depthwise_1d; + +typedef struct { + int32_t L; + int32_t C; + float eps; + // Per-axis element strides for x and y. Lets the same kernel handle + // both [T, C] (sxt=1, sxc=L) and [C, T] (sxt=C, sxc=1) layouts. + int32_t sxt; // x stride per time step (in elements) + int32_t sxc; // x stride per channel (in elements) + int32_t syt; // y stride per time step (in elements) + int32_t syc; // y stride per channel (in elements) +} ggml_metal_kargs_supertonic_layer_norm_channel; + +typedef struct { + int32_t L; + int32_t C; + int32_t sxt; + int32_t sxc; + int32_t syt; + int32_t syc; + int32_t srt; + int32_t src; +} ggml_metal_kargs_supertonic_pw2_residual; + +typedef struct { + int32_t L; + int32_t C; + int32_t sxt; + int32_t sxc; + int32_t syt; + int32_t syc; +} ggml_metal_kargs_supertonic_bias_gelu; + +typedef struct { + int32_t L_in; + int32_t L_out; + int32_t C; + int32_t pad_left; + int32_t sxt; + int32_t sxc; + int32_t syt; + int32_t syc; +} ggml_metal_kargs_supertonic_edge_pad_1d; + typedef struct { uint64_t nb1; int dim; diff --git a/src/ggml-metal/ggml-metal-ops.cpp b/src/ggml-metal/ggml-metal-ops.cpp index fef6bbdffd..ddad31e120 100644 --- a/src/ggml-metal/ggml-metal-ops.cpp +++ b/src/ggml-metal/ggml-metal-ops.cpp @@ -411,6 +411,26 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx); } break; + case GGML_OP_SUPERTONIC_DEPTHWISE_1D: + { + n_fuse = ggml_metal_op_supertonic_depthwise_1d(ctx, idx); + } break; + case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL: + { + n_fuse = ggml_metal_op_supertonic_layer_norm_channel(ctx, idx); + } break; + case GGML_OP_SUPERTONIC_PW2_RESIDUAL: + { + n_fuse = ggml_metal_op_supertonic_pw2_residual(ctx, idx); + } break; + case GGML_OP_SUPERTONIC_BIAS_GELU: + { + n_fuse = ggml_metal_op_supertonic_bias_gelu(ctx, idx); + } break; + case GGML_OP_SUPERTONIC_EDGE_PAD_1D: + { + n_fuse = ggml_metal_op_supertonic_edge_pad_1d(ctx, idx); + } break; case GGML_OP_DIAG_MASK_INF: { n_fuse = ggml_metal_op_diag_mask_inf(ctx, idx); @@ -4183,6 +4203,285 @@ int ggml_metal_op_diag_mask_inf(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_supertonic_depthwise_1d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS( int32_t, ne, op, ne); + + const int32_t * opts = (const int32_t *) op->op_params; + const int K = opts[0]; + const int dilation = opts[1]; + // opts[2]: layout flag (0 = [T, C] default, 1 = [C, T] for full B2). + // opts[3]: causal flag (0 = symmetric edge-clamp, 1 = causal-left pad). + const int32_t layout = opts[2]; + const int32_t causal = opts[3]; + + int L, C, sxt, sxc, syt, syc; + if (layout == 0) { + L = ne0; C = ne1; + sxt = 1; sxc = L; syt = 1; syc = L; + } else { + C = ne0; L = ne1; + sxt = C; sxc = 1; syt = C; syc = 1; + } + + ggml_metal_kargs_supertonic_depthwise_1d args = { + /*.L =*/ L, + /*.C =*/ C, + /*.K =*/ K, + /*.dilation =*/ dilation, + /*.has_bias =*/ (op->src[2] != nullptr) ? 1 : 0, + /*.causal =*/ causal, + /*.sxt =*/ sxt, + /*.sxc =*/ sxc, + /*.syt =*/ syt, + /*.syc =*/ syc, + }; + + auto pipeline = ggml_metal_library_get_pipeline_supertonic_depthwise_1d(lib, op); + + // Cap threads-per-threadgroup at min(L, 1024). One threadgroup per + // channel; threads stride over time. + int nth = L; + if (nth > 1024) nth = 1024; + if (nth < 1) nth = 1; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // w + if (op->src[2] != nullptr) { + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3); // bias + } else { + // Bind src[0] as a harmless placeholder; the kernel won't read it. + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3); + } + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4); // y + + ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_supertonic_layer_norm_channel(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne, op, ne); + + // op_params layout: [eps (f32), layout_flag (i32)]. layout_flag is + // 0 (default) for [T, C] activations (T inner-most, the historical + // contract), 1 for [C, T] (C inner-most — full B2 path). + float eps; + memcpy(&eps, op->op_params, sizeof(eps)); + const int32_t layout = ((const int32_t *) op->op_params)[1]; + + int L, C; + int sxt, sxc, syt, syc; + if (layout == 0) { + // [T, C]: ne0 = T, ne1 = C. T inner-most (stride 1), C outer (stride L). + L = ne0; + C = ne1; + sxt = 1; sxc = L; + syt = 1; syc = L; + } else { + // [C, T]: ne0 = C, ne1 = T. C inner-most (stride 1), T outer (stride C). + C = ne0; + L = ne1; + sxt = C; sxc = 1; + syt = C; syc = 1; + } + + ggml_metal_kargs_supertonic_layer_norm_channel args = { + /*.L =*/ L, + /*.C =*/ C, + /*.eps =*/ eps, + /*.sxt =*/ sxt, + /*.sxc =*/ sxc, + /*.syt =*/ syt, + /*.syc =*/ syc, + }; + + auto pipeline = ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(lib, op); + + // Threads-per-threadgroup: round up to a multiple of 32 (Apple GPU + // simdgroup size). Cap at 256 to limit register pressure. + int nth = 32; + while (nth < C && nth < 256) nth *= 2; + if (nth > C) nth = ((C + 31) / 32) * 32; + if (nth > 256) nth = 256; + if (nth < 32) nth = 32; + + // shared scratch: one float per simdgroup, max 8 simdgroups (256/32). + const size_t shared_bytes = 8 * sizeof(float); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // g + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3); // b + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4); // y + ggml_metal_encoder_set_threadgroup_memory_size(enc, shared_bytes, 0); + + ggml_metal_encoder_dispatch_threadgroups(enc, L, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_supertonic_pw2_residual(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne, op, ne); + + // op_params[0]: layout flag. 0 = [T, C] default, 1 = [C, T] (full B2). + const int32_t layout = ((const int32_t *) op->op_params)[0]; + + int L, C, sxt, sxc, syt, syc, srt, src; + if (layout == 0) { + L = ne0; C = ne1; + sxt = 1; sxc = L; syt = 1; syc = L; srt = 1; src = L; + } else { + C = ne0; L = ne1; + sxt = C; sxc = 1; syt = C; syc = 1; srt = C; src = 1; + } + + ggml_metal_kargs_supertonic_pw2_residual args = { + /*.L =*/ L, + /*.C =*/ C, + /*.sxt =*/ sxt, + /*.sxc =*/ sxc, + /*.syt =*/ syt, + /*.syc =*/ syc, + /*.srt =*/ srt, + /*.src =*/ src, + }; + + auto pipeline = ggml_metal_library_get_pipeline_supertonic_pw2_residual(lib, op); + + int nth = L; + if (nth > 256) nth = 256; + if (nth < 1) nth = 1; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // bias + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3); // gamma + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), 4); // residual + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 5); // y + + ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_supertonic_bias_gelu(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne, op, ne); + + // op_params[0]: layout flag. 0 = [T, C] default, 1 = [C, T] (full B2). + const int32_t layout = ((const int32_t *) op->op_params)[0]; + + int L, C, sxt, sxc, syt, syc; + if (layout == 0) { + L = ne0; C = ne1; + sxt = 1; sxc = L; syt = 1; syc = L; + } else { + C = ne0; L = ne1; + sxt = C; sxc = 1; syt = C; syc = 1; + } + + ggml_metal_kargs_supertonic_bias_gelu args = { + /*.L =*/ L, + /*.C =*/ C, + /*.sxt =*/ sxt, + /*.sxc =*/ sxc, + /*.syt =*/ syt, + /*.syc =*/ syc, + }; + + auto pipeline = ggml_metal_library_get_pipeline_supertonic_bias_gelu(lib, op); + + int nth = L; + if (nth > 256) nth = 256; + if (nth < 1) nth = 1; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // bias + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); // y + + ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_supertonic_edge_pad_1d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne, op, ne); + + const int pad_left = ((const int32_t *) op->op_params)[0]; + // opts[2]: layout flag (opts[1] = pad_right). 0 = [T, C], 1 = [C, T]. + const int32_t layout = ((const int32_t *) op->op_params)[2]; + + int L_in, L_out, C, sxt, sxc, syt, syc; + if (layout == 0) { + L_in = (int) op->src[0]->ne[0]; + C = (int) op->src[0]->ne[1]; + L_out = ne0; + sxt = 1; sxc = L_in; syt = 1; syc = L_out; + } else { + C = (int) op->src[0]->ne[0]; + L_in = (int) op->src[0]->ne[1]; + L_out = ne1; + sxt = C; sxc = 1; syt = C; syc = 1; + } + + ggml_metal_kargs_supertonic_edge_pad_1d args = { + /*.L_in =*/ L_in, + /*.L_out =*/ L_out, + /*.C =*/ C, + /*.pad_left =*/ pad_left, + /*.sxt =*/ sxt, + /*.sxc =*/ sxc, + /*.syt =*/ syt, + /*.syc =*/ syc, + }; + + auto pipeline = ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(lib, op); + + int nth = L_out; + if (nth > 256) nth = 256; + if (nth < 1) nth = 1; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); // y + + ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1); + + return 1; +} + + int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/src/ggml-metal/ggml-metal-ops.h b/src/ggml-metal/ggml-metal-ops.h index a3aa05c1d9..d4bc1655eb 100644 --- a/src/ggml-metal/ggml-metal-ops.h +++ b/src/ggml-metal/ggml-metal-ops.h @@ -81,6 +81,11 @@ int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_supertonic_depthwise_1d(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_supertonic_layer_norm_channel(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_supertonic_pw2_residual(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_supertonic_bias_gelu(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_supertonic_edge_pad_1d(ggml_metal_op_t ctx, int idx); int ggml_metal_op_diag_mask_inf (ggml_metal_op_t ctx, int idx); int ggml_metal_op_arange (ggml_metal_op_t ctx, int idx); int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx); diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index 006e1b93a3..b61f1c1faf 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -5480,6 +5480,284 @@ kernel void kernel_pad_reflect_1d_f32( } } +// Supertonic fused depthwise-1D conv with edge-clamp padding + bias add. +// Replaces the edge_clamp_pad_1d + im2col + mul_mat + add sequence the +// stock depthwise_same_ggml graph fallback emits. One threadgroup per +// channel; threads iterate over the time dimension. +// +// Layout (all f32 contiguous): +// x: [L, C, 1, 1] memory offset (t, c) -> c*L + t +// w: [K, 1, C, 1] memory offset (k, 0, c, 0) -> c*K + k +// bias: [C] memory offset c -> c (omitted when has_bias = 0) +// y: [L, C, 1, 1] same layout as x +kernel void kernel_supertonic_depthwise_1d_f32( + constant ggml_metal_kargs_supertonic_depthwise_1d & args, + device const float * x, + device const float * w, + device const float * bias, + device float * y, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int c = (int) tgpig.x; + if (c >= args.C) return; + + const int L = args.L; + const int K = args.K; + const int dilation = args.dilation; + const int causal = args.causal; + const int sxt = args.sxt, sxc = args.sxc; + const int syt = args.syt, syc = args.syc; + + // Layout-agnostic per-channel base pointers: x[t, c] = x[t*sxt + c*sxc]. + // w is stored as [K, 1, C] f32 contiguous, so w_c = w + c*K stays. + device const float * x_c = x + (size_t) c * sxc; + device const float * w_c = w + (size_t) c * K; + device float * y_c = y + (size_t) c * syc; + + const float bias_v = (args.has_bias != 0) ? bias[c] : 0.0f; + + // The k-offset selects the kernel-centre vs causal-left convolution + // semantic: symmetric edge-clamp (causal=0) uses offset = -K/2 so the + // tap at k = K/2 lands at t; causal (causal=1) uses offset = -(K-1) so + // the last tap at k = K-1 lands at t and earlier taps look strictly + // left. + const int k_off = (causal != 0) ? -(K - 1) : -(K / 2); + + for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) { + float sum = bias_v; + // Compile-time peeled inner loop for K in {3, 5, 7}. K=3/5 is the + // vector_estimator's symmetric ConvNeXt; K=7 is the vocoder's causal + // ConvNeXt. Right-clamp `s >= L` is required for the symmetric path + // only — in causal mode all taps satisfy s ≤ t < L by construction. + if (K == 7) { + int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1; + int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1; + int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1; + int s3 = t + (3 + k_off)*dilation; if (s3 < 0) s3 = 0; else if (s3 >= L) s3 = L - 1; + int s4 = t + (4 + k_off)*dilation; if (s4 < 0) s4 = 0; else if (s4 >= L) s4 = L - 1; + int s5 = t + (5 + k_off)*dilation; if (s5 < 0) s5 = 0; else if (s5 >= L) s5 = L - 1; + int s6 = t + (6 + k_off)*dilation; if (s6 < 0) s6 = 0; else if (s6 >= L) s6 = L - 1; + sum += x_c[(size_t) s0 * sxt] * w_c[0] + + x_c[(size_t) s1 * sxt] * w_c[1] + + x_c[(size_t) s2 * sxt] * w_c[2] + + x_c[(size_t) s3 * sxt] * w_c[3] + + x_c[(size_t) s4 * sxt] * w_c[4] + + x_c[(size_t) s5 * sxt] * w_c[5] + + x_c[(size_t) s6 * sxt] * w_c[6]; + } else if (K == 5) { + int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1; + int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1; + int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1; + int s3 = t + (3 + k_off)*dilation; if (s3 < 0) s3 = 0; else if (s3 >= L) s3 = L - 1; + int s4 = t + (4 + k_off)*dilation; if (s4 < 0) s4 = 0; else if (s4 >= L) s4 = L - 1; + sum += x_c[(size_t) s0 * sxt] * w_c[0] + + x_c[(size_t) s1 * sxt] * w_c[1] + + x_c[(size_t) s2 * sxt] * w_c[2] + + x_c[(size_t) s3 * sxt] * w_c[3] + + x_c[(size_t) s4 * sxt] * w_c[4]; + } else { // K == 3 + int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1; + int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1; + int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1; + sum += x_c[(size_t) s0 * sxt] * w_c[0] + + x_c[(size_t) s1 * sxt] * w_c[1] + + x_c[(size_t) s2 * sxt] * w_c[2]; + } + y_c[(size_t) t * syt] = sum; + } +} + +// Supertonic fused channel-axis layer norm. Replaces the +// permute + cont + ggml_norm + mul + add + permute + cont chain that +// stock ggml_norm requires (since ggml_norm normalises along ne[0]). +// One threadgroup per timestep; threads stripe over channels. +// +// Layout: x and y are f32 contiguous [L, C, 1, 1] with element (t, c) at +// offset c*L + t (channel slow, time fast). Affine params g, b are f32 [C]. +// +// Reduction uses simdgroup_sum + threadgroup memory across simdgroups. +// Assumes threads_per_threadgroup is a multiple of 32 (simdgroup size on +// Apple GPUs). Caller picks nth = min(L, 32 * ceil(C / 32)) up to 256. +kernel void kernel_supertonic_layer_norm_channel_f32( + constant ggml_metal_kargs_supertonic_layer_norm_channel & args, + device const float * x, + device const float * g, + device const float * b, + device float * y, + threadgroup float * shared [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]], + uint sgitg [[simdgroup_index_in_threadgroup]], + uint tiisg [[thread_index_in_simdgroup]]) { + + const int t = (int) tgpig.x; + if (t >= args.L) return; + + const int C = args.C; + // Element strides — let the same kernel handle both + // [T, C] (sxt=1, sxc=L) and [C, T] (sxt=C, sxc=1) layouts. + const int sxt = args.sxt, sxc = args.sxc; + const int syt = args.syt, syc = args.syc; + + device const float * x_t = x + (size_t) t * sxt; + device float * y_t = y + (size_t) t * syt; + + // ---- Mean ---- + float my_sum = 0.0f; + for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) { + my_sum += x_t[(size_t) c * sxc]; + } + // Simdgroup reduce within the simdgroup. + my_sum = simd_sum(my_sum); + if (tiisg == 0) { + shared[sgitg] = my_sum; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + // First simdgroup reduces partial sums. + if (sgitg == 0) { + const uint n_sg = (ntg.x + 31) / 32; + float total = (tiisg < n_sg) ? shared[tiisg] : 0.0f; + total = simd_sum(total); + if (tiisg == 0) shared[0] = total; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float mean = shared[0] / (float) C; + + // ---- Variance ---- + float my_sq = 0.0f; + for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) { + const float d = x_t[(size_t) c * sxc] - mean; + my_sq += d * d; + } + my_sq = simd_sum(my_sq); + if (tiisg == 0) { + shared[sgitg] = my_sq; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (sgitg == 0) { + const uint n_sg = (ntg.x + 31) / 32; + float total = (tiisg < n_sg) ? shared[tiisg] : 0.0f; + total = simd_sum(total); + if (tiisg == 0) shared[0] = total; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float inv_std = rsqrt(shared[0] / (float) C + args.eps); + + // ---- Apply affine ---- + for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) { + const float xv = x_t[(size_t) c * sxc]; + y_t[(size_t) c * syc] = (xv - mean) * inv_std * g[c] + b[c]; + } +} + +// Supertonic fused (x + bias) * gamma + residual. Replaces the +// ggml_add → ggml_mul → ggml_add chain at the end of every ConvNeXt +// block. One threadgroup per channel; threads stride over the time +// dim. bias and gamma are constants for the channel — load once into +// registers and broadcast across timesteps. +// +// Layout: x, residual, y are f32 contiguous [L, C, 1, 1] with element +// (t, c) at offset c*L + t. bias and gamma are f32 [C]. +kernel void kernel_supertonic_pw2_residual_f32( + constant ggml_metal_kargs_supertonic_pw2_residual & args, + device const float * x, + device const float * bias, + device const float * gamma, + device const float * residual, + device float * y, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int c = (int) tgpig.x; + if (c >= args.C) return; + + const int L = args.L; + const int sxt = args.sxt, sxc = args.sxc; + const int syt = args.syt, syc = args.syc; + const int srt = args.srt, src = args.src; + const float bv = bias[c]; + const float gv = gamma[c]; + + device const float * x_c = x + (size_t) c * sxc; + device const float * r_c = residual + (size_t) c * src; + device float * y_c = y + (size_t) c * syc; + + for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) { + y_c[(size_t) t * syt] = r_c[(size_t) t * srt] + (x_c[(size_t) t * sxt] + bv) * gv; + } +} + +// Fused bias + GELU (erf form): y[t, c] = gelu_erf(x[t, c] + bias[c]) +// = 0.5 * v * (1 + erf(v / sqrt(2))) +// Same channel-major contiguous layout as pw2_residual. One threadgroup +// per channel; threads in the threadgroup stride over the time axis so +// the per-channel bias load lives in a register for the whole row. +kernel void kernel_supertonic_bias_gelu_f32( + constant ggml_metal_kargs_supertonic_bias_gelu & args, + device const float * x, + device const float * bias, + device float * y, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int c = (int) tgpig.x; + if (c >= args.C) return; + + const int L = args.L; + const int sxt = args.sxt, sxc = args.sxc; + const int syt = args.syt, syc = args.syc; + const float bv = bias[c]; + + device const float * x_c = x + (size_t) c * sxc; + device float * y_c = y + (size_t) c * syc; + + // Use the same erf_approx template as kernel_gelu_erf_f32 above + // (Abramowitz & Stegun 7.1.26 / Hastings approximation) so the fused + // bias_gelu output is bit-identical to the unfused add + gelu_erf path. + for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) { + const float v = x_c[(size_t) t * sxt] + bv; + y_c[(size_t) t * syt] = 0.5f * v * (1.0f + erf_approx(v * SQRT_2_INV)); + } +} + +// Replicate-pad on the time dimension (ne[0]) of a [L_in, C, 1, 1] f32 +// tensor. Output is [L_in + pad_left + pad_right, C, 1, 1]; for each +// output time `t`, we copy from `x[clamp(t - pad_left, 0, L_in - 1), c]`. +// One threadgroup per channel; threads in the threadgroup stride over +// the (longer) output time axis. +kernel void kernel_supertonic_edge_pad_1d_f32( + constant ggml_metal_kargs_supertonic_edge_pad_1d & args, + device const float * x, + device float * y, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int c = (int) tgpig.x; + if (c >= args.C) return; + + const int L_in = args.L_in; + const int L_out = args.L_out; + const int pad = args.pad_left; + const int sxt = args.sxt, sxc = args.sxc; + const int syt = args.syt, syc = args.syc; + + device const float * x_c = x + (size_t) c * sxc; + device float * y_c = y + (size_t) c * syc; + + for (int t = (int) tpitg.x; t < L_out; t += (int) ntg.x) { + int src_t = t - pad; + if (src_t < 0) src_t = 0; + if (src_t >= L_in) src_t = L_in - 1; + y_c[(size_t) t * syt] = x_c[(size_t) src_t * sxt]; + } +} + kernel void kernel_arange_f32( constant ggml_metal_kargs_arange & args, device char * dst, diff --git a/src/ggml.c b/src/ggml.c index 0142498d96..5aabdcd016 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -1063,9 +1063,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_SGD", "GLU", + + "SUPERTONIC_DEPTHWISE_1D", + "SUPERTONIC_LAYER_NORM_CHANNEL", + "SUPERTONIC_PW2_RESIDUAL", + "SUPERTONIC_BIAS_GELU", + "SUPERTONIC_EDGE_PAD_1D", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1173,9 +1179,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "sgd(x)", "glu(x)", + + "supertonic_depthwise_1d(x,w,b)", + "supertonic_layer_norm_channel(x,g,b)", + "supertonic_pw2_residual(x,b,gamma,res)", + "supertonic_bias_gelu(x,b)", + "supertonic_edge_pad_1d(x,pad_l,pad_r)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5123,6 +5135,295 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } +// ggml_supertonic_depthwise_1d + +// Internal helper: depthwise_1d with explicit layout + causal flags. +// layout = 0 → [T, C] (T inner) ; layout = 1 → [C, T] (C inner). +// causal = 0 → symmetric edge-clamp (vector_estimator); +// causal = 1 → causal-left padding (vocoder). +static struct ggml_tensor * ggml_supertonic_depthwise_1d_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation, + int layout, + int causal) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(w->type == GGML_TYPE_F32); + GGML_ASSERT(bias == NULL || bias->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(w)); + GGML_ASSERT(bias == NULL || ggml_is_contiguous(bias)); + GGML_ASSERT(dilation >= 1); + + // a: [L, C, 1, 1] (layout=0) or [C, L, 1, 1] (layout=1). + // w in conv kernel layout [K, 1, C, 1] (ggml_im2col-consumable), + // K in {3, 5, 7} (3/5 for vector_estimator symmetric, 7 for vocoder causal). + // bias: [C]. + const int K = (int) w->ne[0]; + GGML_ASSERT(K == 3 || K == 5 || K == 7); + GGML_ASSERT(w->ne[1] == 1); + const int64_t C_dim = (layout == 0) ? a->ne[1] : a->ne[0]; + GGML_ASSERT(w->ne[2] == C_dim); + GGML_ASSERT(bias == NULL || bias->ne[0] == C_dim); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + + int32_t params[4] = { K, dilation, layout, causal }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SUPERTONIC_DEPTHWISE_1D; + result->src[0] = a; + result->src[1] = w; + result->src[2] = bias; + + return result; +} + +struct ggml_tensor * ggml_supertonic_depthwise_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation) { + return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation, + /*layout=*/0, /*causal=*/0); +} + +struct ggml_tensor * ggml_supertonic_depthwise_1d_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation) { + return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation, + /*layout=*/1, /*causal=*/0); +} + +struct ggml_tensor * ggml_supertonic_depthwise_1d_causal_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * w, + struct ggml_tensor * bias, + int dilation) { + return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation, + /*layout=*/1, /*causal=*/1); +} + +// ggml_supertonic_layer_norm_channel + +// Internal helper: layer_norm_channel with explicit layout flag. +// layout = 0 → [T, C] (T inner) ; layout = 1 → [C, T] (C inner). +static struct ggml_tensor * ggml_supertonic_layer_norm_channel_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * g, + struct ggml_tensor * b, + float eps, + int layout) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(g->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(g)); + GGML_ASSERT(ggml_is_contiguous(b)); + + GGML_ASSERT(a->ne[2] == 1 && a->ne[3] == 1); + // The "channel" dimension is the OUTER one for [T, C] (ne[1]) and the + // INNER one for [C, T] (ne[0]). gamma/beta have length == C. + const int64_t C_dim = (layout == 0) ? a->ne[1] : a->ne[0]; + GGML_ASSERT(g->ne[0] == C_dim); + GGML_ASSERT(b->ne[0] == C_dim); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + + // op_params: [eps (f32), layout (i32)]. + float params[2]; + memcpy(¶ms[0], &eps, sizeof(float)); + ((int32_t *) ¶ms[1])[0] = layout; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL; + result->src[0] = a; + result->src[1] = g; + result->src[2] = b; + + return result; +} + +struct ggml_tensor * ggml_supertonic_layer_norm_channel( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * g, + struct ggml_tensor * b, + float eps) { + return ggml_supertonic_layer_norm_channel_impl(ctx, a, g, b, eps, /*layout=*/0); +} + +struct ggml_tensor * ggml_supertonic_layer_norm_channel_ct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * g, + struct ggml_tensor * b, + float eps) { + return ggml_supertonic_layer_norm_channel_impl(ctx, a, g, b, eps, /*layout=*/1); +} + +// ggml_supertonic_pw2_residual + +static struct ggml_tensor * ggml_supertonic_pw2_residual_impl( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + struct ggml_tensor * gamma, + struct ggml_tensor * residual, + int layout) { + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(gamma->type == GGML_TYPE_F32); + GGML_ASSERT(residual->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(ggml_is_contiguous(bias)); + GGML_ASSERT(ggml_is_contiguous(gamma)); + GGML_ASSERT(ggml_is_contiguous(residual)); + GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1); + GGML_ASSERT(residual->ne[0] == x->ne[0]); + GGML_ASSERT(residual->ne[1] == x->ne[1]); + const int64_t C_dim = (layout == 0) ? x->ne[1] : x->ne[0]; + GGML_ASSERT(bias->ne[0] == C_dim); + GGML_ASSERT(gamma->ne[0] == C_dim); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type, + x->ne[0], x->ne[1], x->ne[2], x->ne[3]); + + int32_t params[1] = { layout }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SUPERTONIC_PW2_RESIDUAL; + result->src[0] = x; + result->src[1] = bias; + result->src[2] = gamma; + result->src[3] = residual; + + return result; +} + +struct ggml_tensor * ggml_supertonic_pw2_residual( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + struct ggml_tensor * gamma, + struct ggml_tensor * residual) { + return ggml_supertonic_pw2_residual_impl(ctx, x, bias, gamma, residual, /*layout=*/0); +} + +struct ggml_tensor * ggml_supertonic_pw2_residual_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + struct ggml_tensor * gamma, + struct ggml_tensor * residual) { + return ggml_supertonic_pw2_residual_impl(ctx, x, bias, gamma, residual, /*layout=*/1); +} + +// ggml_supertonic_bias_gelu + +static struct ggml_tensor * ggml_supertonic_bias_gelu_impl( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias, + int layout) { + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(ggml_is_contiguous(bias)); + GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1); + const int64_t C_dim = (layout == 0) ? x->ne[1] : x->ne[0]; + GGML_ASSERT(bias->ne[0] == C_dim); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type, + x->ne[0], x->ne[1], x->ne[2], x->ne[3]); + + int32_t params[1] = { layout }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SUPERTONIC_BIAS_GELU; + result->src[0] = x; + result->src[1] = bias; + + return result; +} + +struct ggml_tensor * ggml_supertonic_bias_gelu( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias) { + return ggml_supertonic_bias_gelu_impl(ctx, x, bias, /*layout=*/0); +} + +struct ggml_tensor * ggml_supertonic_bias_gelu_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * bias) { + return ggml_supertonic_bias_gelu_impl(ctx, x, bias, /*layout=*/1); +} + +// ggml_supertonic_edge_pad_1d + +static struct ggml_tensor * ggml_supertonic_edge_pad_1d_impl( + struct ggml_context * ctx, + struct ggml_tensor * x, + int pad_left, + int pad_right, + int layout) { + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1); + GGML_ASSERT(pad_left >= 0 && pad_right >= 0); + + int64_t out_ne0, out_ne1; + if (layout == 0) { + const int64_t L_in = x->ne[0]; + const int64_t C = x->ne[1]; + out_ne0 = L_in + pad_left + pad_right; + out_ne1 = C; + } else { + const int64_t C = x->ne[0]; + const int64_t L_in = x->ne[1]; + out_ne0 = C; + out_ne1 = L_in + pad_left + pad_right; + } + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type, + out_ne0, out_ne1, x->ne[2], x->ne[3]); + + int32_t params[3] = { pad_left, pad_right, layout }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_SUPERTONIC_EDGE_PAD_1D; + result->src[0] = x; + + return result; +} + +struct ggml_tensor * ggml_supertonic_edge_pad_1d( + struct ggml_context * ctx, + struct ggml_tensor * x, + int pad_left, + int pad_right) { + return ggml_supertonic_edge_pad_1d_impl(ctx, x, pad_left, pad_right, /*layout=*/0); +} + +struct ggml_tensor * ggml_supertonic_edge_pad_1d_ct( + struct ggml_context * ctx, + struct ggml_tensor * x, + int pad_left, + int pad_right) { + return ggml_supertonic_edge_pad_1d_impl(ctx, x, pad_left, pad_right, /*layout=*/1); +} + // ggml_roll struct ggml_tensor * ggml_roll(