diff --git a/include/ggml.h b/include/ggml.h
index 11d3e8a816..75710dbe39 100644
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -577,6 +577,17 @@ extern "C" {
 
         GGML_OP_GLU,
 
+        // Supertonic-specific fused ops (QVAC overlay).  These collapse
+        // multi-op sub-graphs that the Supertonic ggml port emits per
+        // ConvNeXt block / attention block, reducing per-step Metal
+        // command-buffer encode overhead.  See
+        // tts-cpp/cmake/vcpkg-overlay-ports/ggml/ggml-supertonic-ops.patch.
+        GGML_OP_SUPERTONIC_DEPTHWISE_1D,
+        GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL,
+        GGML_OP_SUPERTONIC_PW2_RESIDUAL,
+        GGML_OP_SUPERTONIC_BIAS_GELU,
+        GGML_OP_SUPERTONIC_EDGE_PAD_1D,
+
         GGML_OP_COUNT,
     };
 
@@ -2252,6 +2263,138 @@ extern "C" {
             int                   p0,
             int                   p1);
 
+    // Supertonic fused depthwise 1D convolution with edge-clamp (replicate)
+    // padding and bias add.  Per-channel filter of width K applied to the
+    // time dim of a (ne0 = L, ne1 = C, ne2 = 1, ne3 = 1).
+    //   y[t, c] = bias[c]
+    //           + sum_{k=0..K-1} a[clamp(t + (k - K/2)*dilation, 0, L-1), c]
+    //                          * w[k, c]
+    //
+    // a:    [L, C]  f32 contiguous
+    // w:    [K, C]  f32 contiguous (or [K, 1, C, 1] from a depthwise conv weight)
+    // bias: [C]     f32 contiguous (may be NULL — pass GGML_NULL_TENSOR to skip)
+    // dilation: positive integer
+    //
+    // Output ne=[L, C] matching a's shape.  Currently supports K in {3, 5}.
+    GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * bias,
+            int                   dilation);
+
+    // [C, T]-layout variant: a is [C, L, 1, 1] with C inner-most.  Same kernel,
+    // strides flipped via a layout flag in op_params.  Full B2 path.
+    GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * bias,
+            int                   dilation);
+
+    // [C, T]-layout + causal-left padding variant.  Used by the vocoder
+    // ConvNeXt chain.  K may be 3, 5, or 7.
+    GGML_API struct ggml_tensor * ggml_supertonic_depthwise_1d_causal_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * bias,
+            int                   dilation);
+
+    // Supertonic fused channel-axis layer norm.  Normalises across the
+    // channel dim (ne[1]) of a [L, C, 1, 1] tensor and applies an affine
+    // scale + shift, all in one Metal dispatch.  Replaces the
+    // permute + cont + ggml_norm + mul + add + permute + cont chain that
+    // stock ggml_norm requires (since it normalises along ne[0]).
+    //
+    //   y[t, c] = ((a[t, c] - mean_t) / sqrt(var_t + eps)) * g[c] + b[c]
+    //
+    // a: [L, C, 1, 1]  f32 contiguous
+    // g: [C]           f32 contiguous (scale)
+    // b: [C]           f32 contiguous (shift)
+    // eps: numerical epsilon (passed as float op param)
+    GGML_API struct ggml_tensor * ggml_supertonic_layer_norm_channel(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * g,
+            struct ggml_tensor  * b,
+            float                 eps);
+
+    // [C, T]-layout variant: a is [C, T, 1, 1] with C inner-most.  g and b
+    // still have length C (== a->ne[0] here, vs == a->ne[1] in the [T, C]
+    // variant).  Same op + Metal kernel under the hood — the kernel reads
+    // per-axis strides from kargs, so this variant just sets a layout
+    // flag so the dispatch passes the right strides.  Phase B2 path.
+    GGML_API struct ggml_tensor * ggml_supertonic_layer_norm_channel_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * g,
+            struct ggml_tensor  * b,
+            float                 eps);
+
+    // Supertonic fused (x + bias) * gamma + residual.  Channel-axis
+    // broadcasts for `bias` and `gamma` (both [C]); `x` and `residual`
+    // are [L, C, 1, 1] f32 contiguous.  Output ne matches `x`.
+    //
+    //   y[t, c] = residual[t, c] + (x[t, c] + bias[c]) * gamma[c]
+    GGML_API struct ggml_tensor * ggml_supertonic_pw2_residual(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * bias,
+            struct ggml_tensor  * gamma,
+            struct ggml_tensor  * residual);
+
+    // [C, T]-layout variant.  Bias/gamma still have length C (== x->ne[0]
+    // here, vs == x->ne[1] in the [T, C] variant).  Full B2 path.
+    GGML_API struct ggml_tensor * ggml_supertonic_pw2_residual_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * bias,
+            struct ggml_tensor  * gamma,
+            struct ggml_tensor  * residual);
+
+    // Supertonic fused bias-add + GELU (erf form, the gelu_erf in ggml).
+    // Channel-axis broadcasts for `bias` ([C]); `x` is [L, C, 1, 1] f32
+    // contiguous.  Output ne matches `x`.
+    //
+    //   y[t, c] = gelu_erf(x[t, c] + bias[c])
+    //           = 0.5 * v * (1 + erf(v * 1/sqrt(2)))  where v = x + bias
+    GGML_API struct ggml_tensor * ggml_supertonic_bias_gelu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * bias);
+
+    // [C, T]-layout variant.  Bias still has length C (== x->ne[0] here).
+    GGML_API struct ggml_tensor * ggml_supertonic_bias_gelu_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * bias);
+
+    // Supertonic edge-replicate padding for 1D conv inputs.  Combines
+    // the view + repeat_4d + concat sequence used by vocoder's causal
+    // padding (pad_left only) and vector_estimator / text_encoder's
+    // symmetric edge_clamp padding (pad_left + pad_right) into one
+    // dispatch.  Input `x` is `[L_in, C, 1, 1]` f32 contiguous; the
+    // output has `ne = [L_in + pad_left + pad_right, C, 1, 1]` where:
+    //
+    //   y[t, c] = x[clamp(t - pad_left, 0, L_in - 1), c]
+    //
+    // (Replicate / edge-clamp semantics — the leftmost row of `x` fills
+    // the left pad, the rightmost row fills the right pad.)
+    GGML_API struct ggml_tensor * ggml_supertonic_edge_pad_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            int                   pad_left,
+            int                   pad_right);
+
+    // [C, T]-layout variant.  Input x is [C, L_in, 1, 1]; output is
+    // [C, L_in + pad_left + pad_right, 1, 1].  Full B2 path.
+    GGML_API struct ggml_tensor * ggml_supertonic_edge_pad_1d_ct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            int                   pad_left,
+            int                   pad_right);
+
     // Move tensor elements by an offset given for each dimension. Elements that
     // are shifted beyond the last position are wrapped around to the beginning.
     GGML_API struct ggml_tensor * ggml_roll(
diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c
index 2b3eb5b5ce..1a6b14297d 100644
--- a/src/ggml-cpu/ggml-cpu.c
+++ b/src/ggml-cpu/ggml-cpu.c
@@ -1942,6 +1942,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad_reflect_1d(params, tensor);
             } break;
+        case GGML_OP_SUPERTONIC_DEPTHWISE_1D:
+            {
+                ggml_compute_forward_supertonic_depthwise_1d(params, tensor);
+            } break;
+        case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL:
+            {
+                ggml_compute_forward_supertonic_layer_norm_channel(params, tensor);
+            } break;
+        case GGML_OP_SUPERTONIC_PW2_RESIDUAL:
+            {
+                ggml_compute_forward_supertonic_pw2_residual(params, tensor);
+            } break;
+        case GGML_OP_SUPERTONIC_BIAS_GELU:
+            {
+                ggml_compute_forward_supertonic_bias_gelu(params, tensor);
+            } break;
+        case GGML_OP_SUPERTONIC_EDGE_PAD_1D:
+            {
+                ggml_compute_forward_supertonic_edge_pad_1d(params, tensor);
+            } break;
         case GGML_OP_ROLL:
             {
                 ggml_compute_forward_roll(params, tensor);
@@ -2347,6 +2367,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_SUPERTONIC_DEPTHWISE_1D:
+        case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL:
+        case GGML_OP_SUPERTONIC_PW2_RESIDUAL:
+        case GGML_OP_SUPERTONIC_BIAS_GELU:
+        case GGML_OP_SUPERTONIC_EDGE_PAD_1D:
         case GGML_OP_ROLL:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp
index 0b5d6c6df8..332f9dd3c9 100644
--- a/src/ggml-cpu/ops.cpp
+++ b/src/ggml-cpu/ops.cpp
@@ -7863,6 +7863,274 @@ void ggml_compute_forward_pad_reflect_1d(
     }
 }
 
+// ggml_compute_forward_supertonic_depthwise_1d
+
+void ggml_compute_forward_supertonic_depthwise_1d(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * x    = dst->src[0]; // [L, C, 1, 1]
+    const ggml_tensor * w    = dst->src[1]; // [K, 1, C, 1]
+    const ggml_tensor * bias = dst->src[2]; // [C] or NULL
+
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(w->type == GGML_TYPE_F32);
+    GGML_ASSERT(bias == NULL || bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int K        = opts[0];
+    const int dilation = opts[1];
+    const int32_t layout = opts[2];
+    const int32_t causal = opts[3];
+    const int k_off = (causal != 0) ? -(K - 1) : -(K / 2);
+
+    int L, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L = (int) x->ne[0];
+        C = (int) x->ne[1];
+        sxt = 1;  sxc = L;
+        syt = 1;  syc = L;
+    } else {
+        C = (int) x->ne[0];
+        L = (int) x->ne[1];
+        sxt = C;  sxc = 1;
+        syt = C;  syc = 1;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float * x_data = (const float *) x->data;
+    const float * w_data = (const float *) w->data;
+    const float * b_data = bias ? (const float *) bias->data : NULL;
+          float * y_data = (float *) dst->data;
+
+    for (int c = ith; c < C; c += nth) {
+        const float bias_v = b_data ? b_data[c] : 0.0f;
+        const float * w_c = w_data + (size_t) c * K;
+        for (int t = 0; t < L; ++t) {
+            float sum = bias_v;
+            for (int k = 0; k < K; ++k) {
+                int s = t + (k + k_off) * dilation;
+                if (s < 0) s = 0;
+                else if (s >= L) s = L - 1;
+                sum += x_data[(size_t) s * sxt + (size_t) c * sxc] * w_c[k];
+            }
+            y_data[(size_t) t * syt + (size_t) c * syc] = sum;
+        }
+    }
+}
+
+// ggml_compute_forward_supertonic_layer_norm_channel
+
+void ggml_compute_forward_supertonic_layer_norm_channel(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * x = dst->src[0]; // [L, C, 1, 1]
+    const ggml_tensor * g = dst->src[1]; // [C]
+    const ggml_tensor * b = dst->src[2]; // [C]
+
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(g->type == GGML_TYPE_F32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(eps));
+    const int32_t layout = ((const int32_t *) dst->op_params)[1];
+
+    int L, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        // [T, C]: T inner.
+        L = (int) x->ne[0];
+        C = (int) x->ne[1];
+        sxt = 1;  sxc = L;
+        syt = 1;  syc = L;
+    } else {
+        // [C, T]: C inner.
+        C = (int) x->ne[0];
+        L = (int) x->ne[1];
+        sxt = C;  sxc = 1;
+        syt = C;  syc = 1;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float * x_data = (const float *) x->data;
+    const float * g_data = (const float *) g->data;
+    const float * b_data = (const float *) b->data;
+          float * y_data = (float *) dst->data;
+
+    // Layout-agnostic indexing via element strides.
+    for (int t = ith; t < L; t += nth) {
+        double mean = 0.0;
+        for (int c = 0; c < C; ++c) mean += x_data[(size_t) t * sxt + (size_t) c * sxc];
+        mean /= (double) C;
+        double var = 0.0;
+        for (int c = 0; c < C; ++c) {
+            const double d = (double) x_data[(size_t) t * sxt + (size_t) c * sxc] - mean;
+            var += d * d;
+        }
+        const float inv = 1.0f / sqrtf((float) (var / (double) C) + eps);
+        for (int c = 0; c < C; ++c) {
+            const float xv = x_data[(size_t) t * sxt + (size_t) c * sxc];
+            y_data[(size_t) t * syt + (size_t) c * syc] = (xv - (float) mean) * inv * g_data[c] + b_data[c];
+        }
+    }
+}
+
+// ggml_compute_forward_supertonic_pw2_residual
+
+void ggml_compute_forward_supertonic_pw2_residual(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * x        = dst->src[0]; // [L, C, 1, 1]
+    const ggml_tensor * bias     = dst->src[1]; // [C]
+    const ggml_tensor * gamma    = dst->src[2]; // [C]
+    const ggml_tensor * residual = dst->src[3]; // [L, C, 1, 1]
+
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(gamma->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int32_t layout = ((const int32_t *) dst->op_params)[0];
+
+    int L, C, sxt, sxc, syt, syc, srt, src;
+    if (layout == 0) {
+        L = (int) x->ne[0];
+        C = (int) x->ne[1];
+        sxt = 1;  sxc = L;
+        syt = 1;  syc = L;
+        srt = 1;  src = L;
+    } else {
+        C = (int) x->ne[0];
+        L = (int) x->ne[1];
+        sxt = C;  sxc = 1;
+        syt = C;  syc = 1;
+        srt = C;  src = 1;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float * x_data   = (const float *) x->data;
+    const float * b_data   = (const float *) bias->data;
+    const float * g_data   = (const float *) gamma->data;
+    const float * r_data   = (const float *) residual->data;
+          float * y_data   = (float *) dst->data;
+
+    // Stripe over channels.  For each channel c, bias and gamma are read
+    // once and applied across all L timesteps; layout flag flips x/y/r index
+    // strides between [T, C] and [C, T].
+    for (int c = ith; c < C; c += nth) {
+        const float bv = b_data[c];
+        const float gv = g_data[c];
+        for (int t = 0; t < L; ++t) {
+            const float xv = x_data[(size_t) t * sxt + (size_t) c * sxc];
+            const float rv = r_data[(size_t) t * srt + (size_t) c * src];
+            y_data[(size_t) t * syt + (size_t) c * syc] = rv + (xv + bv) * gv;
+        }
+    }
+}
+
+// ggml_compute_forward_supertonic_bias_gelu
+
+void ggml_compute_forward_supertonic_bias_gelu(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * x    = dst->src[0]; // [L, C, 1, 1]
+    const ggml_tensor * bias = dst->src[1]; // [C]
+
+    GGML_ASSERT(x->type    == GGML_TYPE_F32);
+    GGML_ASSERT(bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    const int32_t layout = ((const int32_t *) dst->op_params)[0];
+
+    int L, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L = (int) x->ne[0];
+        C = (int) x->ne[1];
+        sxt = 1;  sxc = L;
+        syt = 1;  syc = L;
+    } else {
+        C = (int) x->ne[0];
+        L = (int) x->ne[1];
+        sxt = C;  sxc = 1;
+        syt = C;  syc = 1;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float * x_data = (const float *) x->data;
+    const float * b_data = (const float *) bias->data;
+          float * y_data = (float *)       dst->data;
+
+    static const float inv_sqrt_2 = 0.7071067811865475f;
+
+    for (int c = ith; c < C; c += nth) {
+        const float bv = b_data[c];
+        for (int t = 0; t < L; ++t) {
+            const float v = x_data[(size_t) t * sxt + (size_t) c * sxc] + bv;
+            y_data[(size_t) t * syt + (size_t) c * syc] = 0.5f * v * (1.0f + erff(v * inv_sqrt_2));
+        }
+    }
+}
+
+// ggml_compute_forward_supertonic_edge_pad_1d
+
+void ggml_compute_forward_supertonic_edge_pad_1d(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * x = dst->src[0]; // [L_in, C, 1, 1] or [C, L_in, 1, 1]
+
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int pad_left = ggml_get_op_params_i32(dst, 0);
+    const int32_t layout = ggml_get_op_params_i32(dst, 2);
+
+    int L_in, L_out, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L_in  = (int) x->ne[0];
+        C     = (int) x->ne[1];
+        L_out = (int) dst->ne[0];
+        sxt = 1;     sxc = L_in;   syt = 1;     syc = L_out;
+    } else {
+        C     = (int) x->ne[0];
+        L_in  = (int) x->ne[1];
+        L_out = (int) dst->ne[1];
+        sxt = C;     sxc = 1;      syt = C;     syc = 1;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const float * x_data = (const float *) x->data;
+          float * y_data = (float *)       dst->data;
+
+    // Stripe over channels.  For each output time t, read from
+    // clamp(t - pad_left, 0, L_in - 1); layout flag picks strides.
+    for (int c = ith; c < C; c += nth) {
+        for (int t = 0; t < L_out; ++t) {
+            int src_t = t - pad_left;
+            if (src_t < 0)       src_t = 0;
+            if (src_t >= L_in)   src_t = L_in - 1;
+            y_data[(size_t) t * syt + (size_t) c * syc] =
+                x_data[(size_t) src_t * sxt + (size_t) c * sxc];
+        }
+    }
+}
+
 // ggml_compute_forward_roll
 
 static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
diff --git a/src/ggml-cpu/ops.h b/src/ggml-cpu/ops.h
index 3fa1443abc..40d7026f9a 100644
--- a/src/ggml-cpu/ops.h
+++ b/src/ggml-cpu/ops.h
@@ -77,6 +77,11 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_supertonic_depthwise_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_supertonic_layer_norm_channel(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_supertonic_pw2_residual(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_supertonic_bias_gelu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_supertonic_edge_pad_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/src/ggml-metal/ggml-metal-device.cpp b/src/ggml-metal/ggml-metal-device.cpp
index 5d9336314d..14a63e0643 100644
--- a/src/ggml-metal/ggml-metal-device.cpp
+++ b/src/ggml-metal/ggml-metal-device.cpp
@@ -1856,6 +1856,40 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d(g
     return res;
 }
 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_depthwise_1d(ggml_metal_library_t lib, const ggml_tensor * op) {
+    assert(op->op == GGML_OP_SUPERTONIC_DEPTHWISE_1D);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_supertonic_depthwise_1d_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(ggml_metal_library_t lib, const ggml_tensor * op) {
+    assert(op->op == GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_supertonic_layer_norm_channel_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf(ggml_metal_library_t lib, const ggml_tensor * op) {
     assert(op->op == GGML_OP_DIAG_MASK_INF);
     GGML_UNUSED(op);
@@ -1870,6 +1904,57 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf(gg
     return res;
 }
 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_pw2_residual(ggml_metal_library_t lib, const ggml_tensor * op) {
+    assert(op->op == GGML_OP_SUPERTONIC_PW2_RESIDUAL);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_supertonic_pw2_residual_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_bias_gelu(ggml_metal_library_t lib, const ggml_tensor * op) {
+    assert(op->op == GGML_OP_SUPERTONIC_BIAS_GELU);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_supertonic_bias_gelu_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(ggml_metal_library_t lib, const ggml_tensor * op) {
+    assert(op->op == GGML_OP_SUPERTONIC_EDGE_PAD_1D);
+
+    char base[256];
+    char name[256];
+
+    snprintf(base, 256, "kernel_supertonic_edge_pad_1d_%s", ggml_type_name(op->src[0]->type));
+    snprintf(name, 256, "%s", base);
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange(ggml_metal_library_t lib, const ggml_tensor * op) {
     assert(op->op == GGML_OP_ARANGE);
 
diff --git a/src/ggml-metal/ggml-metal-device.h b/src/ggml-metal/ggml-metal-device.h
index b6d6b3d517..8bf3603af6 100644
--- a/src/ggml-metal/ggml-metal-device.h
+++ b/src/ggml-metal/ggml-metal-device.h
@@ -152,6 +152,11 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale           (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d    (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_depthwise_1d(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_pw2_residual(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_bias_gelu(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag_mask_inf     (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange            (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
diff --git a/src/ggml-metal/ggml-metal-device.m b/src/ggml-metal/ggml-metal-device.m
index 089854e828..21e71c37e0 100644
--- a/src/ggml-metal/ggml-metal-device.m
+++ b/src/ggml-metal/ggml-metal-device.m
@@ -1134,6 +1134,28 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SUPERTONIC_DEPTHWISE_1D:
+            {
+                const int K = ggml_get_op_params_i32(op, 0);
+                return op->src[0]->type == GGML_TYPE_F32 &&
+                       op->src[1]->type == GGML_TYPE_F32 &&
+                       (op->src[2] == NULL || op->src[2]->type == GGML_TYPE_F32) &&
+                       (K == 3 || K == 5 || K == 7);
+            }
+        case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   op->src[2]->type == GGML_TYPE_F32;
+        case GGML_OP_SUPERTONIC_PW2_RESIDUAL:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   op->src[2]->type == GGML_TYPE_F32 &&
+                   op->src[3]->type == GGML_TYPE_F32;
+        case GGML_OP_SUPERTONIC_BIAS_GELU:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32;
+        case GGML_OP_SUPERTONIC_EDGE_PAD_1D:
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_DIAG_MASK_INF:
             return op->src[0]->type == GGML_TYPE_F32 &&
                    op->type == GGML_TYPE_F32 &&
diff --git a/src/ggml-metal/ggml-metal-impl.h b/src/ggml-metal/ggml-metal-impl.h
index 384db806ef..56d9a4538e 100644
--- a/src/ggml-metal/ggml-metal-impl.h
+++ b/src/ggml-metal/ggml-metal-impl.h
@@ -1027,6 +1027,62 @@ typedef struct {
     int32_t  p1;
 } ggml_metal_kargs_pad_reflect_1d;
 
+typedef struct {
+    int32_t L;
+    int32_t C;
+    int32_t K;
+    int32_t dilation;
+    int32_t has_bias;
+    int32_t causal;   // 0 = symmetric edge-clamp (vector_estimator), 1 = causal-left (vocoder)
+    int32_t sxt;
+    int32_t sxc;
+    int32_t syt;
+    int32_t syc;
+} ggml_metal_kargs_supertonic_depthwise_1d;
+
+typedef struct {
+    int32_t L;
+    int32_t C;
+    float   eps;
+    // Per-axis element strides for x and y.  Lets the same kernel handle
+    // both [T, C] (sxt=1, sxc=L) and [C, T] (sxt=C, sxc=1) layouts.
+    int32_t sxt;  // x stride per time step (in elements)
+    int32_t sxc;  // x stride per channel  (in elements)
+    int32_t syt;  // y stride per time step (in elements)
+    int32_t syc;  // y stride per channel  (in elements)
+} ggml_metal_kargs_supertonic_layer_norm_channel;
+
+typedef struct {
+    int32_t L;
+    int32_t C;
+    int32_t sxt;
+    int32_t sxc;
+    int32_t syt;
+    int32_t syc;
+    int32_t srt;
+    int32_t src;
+} ggml_metal_kargs_supertonic_pw2_residual;
+
+typedef struct {
+    int32_t L;
+    int32_t C;
+    int32_t sxt;
+    int32_t sxc;
+    int32_t syt;
+    int32_t syc;
+} ggml_metal_kargs_supertonic_bias_gelu;
+
+typedef struct {
+    int32_t L_in;
+    int32_t L_out;
+    int32_t C;
+    int32_t pad_left;
+    int32_t sxt;
+    int32_t sxc;
+    int32_t syt;
+    int32_t syc;
+} ggml_metal_kargs_supertonic_edge_pad_1d;
+
 typedef struct {
     uint64_t nb1;
     int      dim;
diff --git a/src/ggml-metal/ggml-metal-ops.cpp b/src/ggml-metal/ggml-metal-ops.cpp
index fef6bbdffd..ddad31e120 100644
--- a/src/ggml-metal/ggml-metal-ops.cpp
+++ b/src/ggml-metal/ggml-metal-ops.cpp
@@ -411,6 +411,26 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx);
             } break;
+        case GGML_OP_SUPERTONIC_DEPTHWISE_1D:
+            {
+                n_fuse = ggml_metal_op_supertonic_depthwise_1d(ctx, idx);
+            } break;
+        case GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL:
+            {
+                n_fuse = ggml_metal_op_supertonic_layer_norm_channel(ctx, idx);
+            } break;
+        case GGML_OP_SUPERTONIC_PW2_RESIDUAL:
+            {
+                n_fuse = ggml_metal_op_supertonic_pw2_residual(ctx, idx);
+            } break;
+        case GGML_OP_SUPERTONIC_BIAS_GELU:
+            {
+                n_fuse = ggml_metal_op_supertonic_bias_gelu(ctx, idx);
+            } break;
+        case GGML_OP_SUPERTONIC_EDGE_PAD_1D:
+            {
+                n_fuse = ggml_metal_op_supertonic_edge_pad_1d(ctx, idx);
+            } break;
         case GGML_OP_DIAG_MASK_INF:
             {
                 n_fuse = ggml_metal_op_diag_mask_inf(ctx, idx);
@@ -4183,6 +4203,285 @@ int ggml_metal_op_diag_mask_inf(ggml_metal_op_t ctx, int idx) {
 
     return 1;
 }
+int ggml_metal_op_supertonic_depthwise_1d(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+
+    const int32_t * opts = (const int32_t *) op->op_params;
+    const int K        = opts[0];
+    const int dilation = opts[1];
+    // opts[2]: layout flag (0 = [T, C] default, 1 = [C, T] for full B2).
+    // opts[3]: causal flag (0 = symmetric edge-clamp, 1 = causal-left pad).
+    const int32_t layout = opts[2];
+    const int32_t causal = opts[3];
+
+    int L, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L = ne0; C = ne1;
+        sxt = 1; sxc = L; syt = 1; syc = L;
+    } else {
+        C = ne0; L = ne1;
+        sxt = C; sxc = 1; syt = C; syc = 1;
+    }
+
+    ggml_metal_kargs_supertonic_depthwise_1d args = {
+        /*.L        =*/ L,
+        /*.C        =*/ C,
+        /*.K        =*/ K,
+        /*.dilation =*/ dilation,
+        /*.has_bias =*/ (op->src[2] != nullptr) ? 1 : 0,
+        /*.causal   =*/ causal,
+        /*.sxt      =*/ sxt,
+        /*.sxc      =*/ sxc,
+        /*.syt      =*/ syt,
+        /*.syc      =*/ syc,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_supertonic_depthwise_1d(lib, op);
+
+    // Cap threads-per-threadgroup at min(L, 1024).  One threadgroup per
+    // channel; threads stride over time.
+    int nth = L;
+    if (nth > 1024) nth = 1024;
+    if (nth < 1)    nth = 1;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // w
+    if (op->src[2] != nullptr) {
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3); // bias
+    } else {
+        // Bind src[0] as a harmless placeholder; the kernel won't read it.
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3);
+    }
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         4); // y
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_supertonic_layer_norm_channel(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS(int32_t, ne, op, ne);
+
+    // op_params layout: [eps (f32), layout_flag (i32)].  layout_flag is
+    // 0 (default) for [T, C] activations (T inner-most, the historical
+    // contract), 1 for [C, T] (C inner-most — full B2 path).
+    float eps;
+    memcpy(&eps, op->op_params, sizeof(eps));
+    const int32_t layout = ((const int32_t *) op->op_params)[1];
+
+    int L, C;
+    int sxt, sxc, syt, syc;
+    if (layout == 0) {
+        // [T, C]: ne0 = T, ne1 = C.  T inner-most (stride 1), C outer (stride L).
+        L = ne0;
+        C = ne1;
+        sxt = 1;       sxc = L;
+        syt = 1;       syc = L;
+    } else {
+        // [C, T]: ne0 = C, ne1 = T.  C inner-most (stride 1), T outer (stride C).
+        C = ne0;
+        L = ne1;
+        sxt = C;       sxc = 1;
+        syt = C;       syc = 1;
+    }
+
+    ggml_metal_kargs_supertonic_layer_norm_channel args = {
+        /*.L   =*/ L,
+        /*.C   =*/ C,
+        /*.eps =*/ eps,
+        /*.sxt =*/ sxt,
+        /*.sxc =*/ sxc,
+        /*.syt =*/ syt,
+        /*.syc =*/ syc,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_supertonic_layer_norm_channel(lib, op);
+
+    // Threads-per-threadgroup: round up to a multiple of 32 (Apple GPU
+    // simdgroup size).  Cap at 256 to limit register pressure.
+    int nth = 32;
+    while (nth < C && nth < 256) nth *= 2;
+    if (nth > C) nth = ((C + 31) / 32) * 32;
+    if (nth > 256) nth = 256;
+    if (nth < 32) nth = 32;
+
+    // shared scratch: one float per simdgroup, max 8 simdgroups (256/32).
+    const size_t shared_bytes = 8 * sizeof(float);
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // g
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[2]), 3); // b
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         4); // y
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, shared_bytes, 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, L, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_supertonic_pw2_residual(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS(int32_t, ne, op, ne);
+
+    // op_params[0]: layout flag.  0 = [T, C] default, 1 = [C, T] (full B2).
+    const int32_t layout = ((const int32_t *) op->op_params)[0];
+
+    int L, C, sxt, sxc, syt, syc, srt, src;
+    if (layout == 0) {
+        L = ne0; C = ne1;
+        sxt = 1; sxc = L; syt = 1; syc = L; srt = 1; src = L;
+    } else {
+        C = ne0; L = ne1;
+        sxt = C; sxc = 1; syt = C; syc = 1; srt = C; src = 1;
+    }
+
+    ggml_metal_kargs_supertonic_pw2_residual args = {
+        /*.L   =*/ L,
+        /*.C   =*/ C,
+        /*.sxt =*/ sxt,
+        /*.sxc =*/ sxc,
+        /*.syt =*/ syt,
+        /*.syc =*/ syc,
+        /*.srt =*/ srt,
+        /*.src =*/ src,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_supertonic_pw2_residual(lib, op);
+
+    int nth = L;
+    if (nth > 256) nth = 256;
+    if (nth < 1)   nth = 1;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // bias
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[2]), 3); // gamma
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[3]), 4); // residual
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         5); // y
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_supertonic_bias_gelu(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS(int32_t, ne, op, ne);
+
+    // op_params[0]: layout flag.  0 = [T, C] default, 1 = [C, T] (full B2).
+    const int32_t layout = ((const int32_t *) op->op_params)[0];
+
+    int L, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L = ne0; C = ne1;
+        sxt = 1; sxc = L; syt = 1; syc = L;
+    } else {
+        C = ne0; L = ne1;
+        sxt = C; sxc = 1; syt = C; syc = 1;
+    }
+
+    ggml_metal_kargs_supertonic_bias_gelu args = {
+        /*.L   =*/ L,
+        /*.C   =*/ C,
+        /*.sxt =*/ sxt,
+        /*.sxc =*/ sxc,
+        /*.syt =*/ syt,
+        /*.syc =*/ syc,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_supertonic_bias_gelu(lib, op);
+
+    int nth = L;
+    if (nth > 256) nth = 256;
+    if (nth < 1)   nth = 1;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2); // bias
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3); // y
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_supertonic_edge_pad_1d(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS(int32_t, ne, op, ne);
+
+    const int pad_left = ((const int32_t *) op->op_params)[0];
+    // opts[2]: layout flag (opts[1] = pad_right).  0 = [T, C], 1 = [C, T].
+    const int32_t layout = ((const int32_t *) op->op_params)[2];
+
+    int L_in, L_out, C, sxt, sxc, syt, syc;
+    if (layout == 0) {
+        L_in  = (int) op->src[0]->ne[0];
+        C     = (int) op->src[0]->ne[1];
+        L_out = ne0;
+        sxt = 1;     sxc = L_in;   syt = 1;     syc = L_out;
+    } else {
+        C     = (int) op->src[0]->ne[0];
+        L_in  = (int) op->src[0]->ne[1];
+        L_out = ne1;
+        sxt = C;     sxc = 1;      syt = C;     syc = 1;
+    }
+
+    ggml_metal_kargs_supertonic_edge_pad_1d args = {
+        /*.L_in     =*/ L_in,
+        /*.L_out    =*/ L_out,
+        /*.C        =*/ C,
+        /*.pad_left =*/ pad_left,
+        /*.sxt      =*/ sxt,
+        /*.sxc      =*/ sxc,
+        /*.syt      =*/ syt,
+        /*.syc      =*/ syc,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_supertonic_edge_pad_1d(lib, op);
+
+    int nth = L_out;
+    if (nth > 256) nth = 256;
+    if (nth < 1)   nth = 1;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1); // x
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2); // y
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, C, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+
 
 int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {
     ggml_tensor * op = ctx->node(idx);
diff --git a/src/ggml-metal/ggml-metal-ops.h b/src/ggml-metal/ggml-metal-ops.h
index a3aa05c1d9..d4bc1655eb 100644
--- a/src/ggml-metal/ggml-metal-ops.h
+++ b/src/ggml-metal/ggml-metal-ops.h
@@ -81,6 +81,11 @@ int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_upscale           (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_pad               (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_pad_reflect_1d    (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_supertonic_depthwise_1d(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_supertonic_layer_norm_channel(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_supertonic_pw2_residual(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_supertonic_bias_gelu(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_supertonic_edge_pad_1d(ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_diag_mask_inf     (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_arange            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal
index 006e1b93a3..b61f1c1faf 100644
--- a/src/ggml-metal/ggml-metal.metal
+++ b/src/ggml-metal/ggml-metal.metal
@@ -5480,6 +5480,284 @@ kernel void kernel_pad_reflect_1d_f32(
     }
 }
 
+// Supertonic fused depthwise-1D conv with edge-clamp padding + bias add.
+// Replaces the edge_clamp_pad_1d + im2col + mul_mat + add sequence the
+// stock depthwise_same_ggml graph fallback emits.  One threadgroup per
+// channel; threads iterate over the time dimension.
+//
+// Layout (all f32 contiguous):
+//   x:    [L, C, 1, 1] memory offset (t, c) -> c*L + t
+//   w:    [K, 1, C, 1] memory offset (k, 0, c, 0) -> c*K + k
+//   bias: [C]          memory offset c -> c     (omitted when has_bias = 0)
+//   y:    [L, C, 1, 1] same layout as x
+kernel void kernel_supertonic_depthwise_1d_f32(
+    constant   ggml_metal_kargs_supertonic_depthwise_1d & args,
+    device  const float * x,
+    device  const float * w,
+    device  const float * bias,
+    device        float * y,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int c = (int) tgpig.x;
+    if (c >= args.C) return;
+
+    const int L        = args.L;
+    const int K        = args.K;
+    const int dilation = args.dilation;
+    const int causal   = args.causal;
+    const int sxt = args.sxt, sxc = args.sxc;
+    const int syt = args.syt, syc = args.syc;
+
+    // Layout-agnostic per-channel base pointers: x[t, c] = x[t*sxt + c*sxc].
+    // w is stored as [K, 1, C] f32 contiguous, so w_c = w + c*K stays.
+    device const float * x_c = x + (size_t) c * sxc;
+    device const float * w_c = w + (size_t) c * K;
+    device       float * y_c = y + (size_t) c * syc;
+
+    const float bias_v = (args.has_bias != 0) ? bias[c] : 0.0f;
+
+    // The k-offset selects the kernel-centre vs causal-left convolution
+    // semantic: symmetric edge-clamp (causal=0) uses offset = -K/2 so the
+    // tap at k = K/2 lands at t; causal (causal=1) uses offset = -(K-1) so
+    // the last tap at k = K-1 lands at t and earlier taps look strictly
+    // left.
+    const int k_off = (causal != 0) ? -(K - 1) : -(K / 2);
+
+    for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) {
+        float sum = bias_v;
+        // Compile-time peeled inner loop for K in {3, 5, 7}.  K=3/5 is the
+        // vector_estimator's symmetric ConvNeXt; K=7 is the vocoder's causal
+        // ConvNeXt.  Right-clamp `s >= L` is required for the symmetric path
+        // only — in causal mode all taps satisfy s ≤ t < L by construction.
+        if (K == 7) {
+            int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1;
+            int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1;
+            int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1;
+            int s3 = t + (3 + k_off)*dilation; if (s3 < 0) s3 = 0; else if (s3 >= L) s3 = L - 1;
+            int s4 = t + (4 + k_off)*dilation; if (s4 < 0) s4 = 0; else if (s4 >= L) s4 = L - 1;
+            int s5 = t + (5 + k_off)*dilation; if (s5 < 0) s5 = 0; else if (s5 >= L) s5 = L - 1;
+            int s6 = t + (6 + k_off)*dilation; if (s6 < 0) s6 = 0; else if (s6 >= L) s6 = L - 1;
+            sum += x_c[(size_t) s0 * sxt] * w_c[0]
+                 + x_c[(size_t) s1 * sxt] * w_c[1]
+                 + x_c[(size_t) s2 * sxt] * w_c[2]
+                 + x_c[(size_t) s3 * sxt] * w_c[3]
+                 + x_c[(size_t) s4 * sxt] * w_c[4]
+                 + x_c[(size_t) s5 * sxt] * w_c[5]
+                 + x_c[(size_t) s6 * sxt] * w_c[6];
+        } else if (K == 5) {
+            int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1;
+            int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1;
+            int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1;
+            int s3 = t + (3 + k_off)*dilation; if (s3 < 0) s3 = 0; else if (s3 >= L) s3 = L - 1;
+            int s4 = t + (4 + k_off)*dilation; if (s4 < 0) s4 = 0; else if (s4 >= L) s4 = L - 1;
+            sum += x_c[(size_t) s0 * sxt] * w_c[0]
+                 + x_c[(size_t) s1 * sxt] * w_c[1]
+                 + x_c[(size_t) s2 * sxt] * w_c[2]
+                 + x_c[(size_t) s3 * sxt] * w_c[3]
+                 + x_c[(size_t) s4 * sxt] * w_c[4];
+        } else { // K == 3
+            int s0 = t + (0 + k_off)*dilation; if (s0 < 0) s0 = 0; else if (s0 >= L) s0 = L - 1;
+            int s1 = t + (1 + k_off)*dilation; if (s1 < 0) s1 = 0; else if (s1 >= L) s1 = L - 1;
+            int s2 = t + (2 + k_off)*dilation; if (s2 < 0) s2 = 0; else if (s2 >= L) s2 = L - 1;
+            sum += x_c[(size_t) s0 * sxt] * w_c[0]
+                 + x_c[(size_t) s1 * sxt] * w_c[1]
+                 + x_c[(size_t) s2 * sxt] * w_c[2];
+        }
+        y_c[(size_t) t * syt] = sum;
+    }
+}
+
+// Supertonic fused channel-axis layer norm.  Replaces the
+// permute + cont + ggml_norm + mul + add + permute + cont chain that
+// stock ggml_norm requires (since ggml_norm normalises along ne[0]).
+// One threadgroup per timestep; threads stripe over channels.
+//
+// Layout: x and y are f32 contiguous [L, C, 1, 1] with element (t, c) at
+// offset c*L + t (channel slow, time fast).  Affine params g, b are f32 [C].
+//
+// Reduction uses simdgroup_sum + threadgroup memory across simdgroups.
+// Assumes threads_per_threadgroup is a multiple of 32 (simdgroup size on
+// Apple GPUs).  Caller picks nth = min(L, 32 * ceil(C / 32)) up to 256.
+kernel void kernel_supertonic_layer_norm_channel_f32(
+    constant   ggml_metal_kargs_supertonic_layer_norm_channel & args,
+    device  const float * x,
+    device  const float * g,
+    device  const float * b,
+    device        float * y,
+    threadgroup    float * shared [[threadgroup(0)]],
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]],
+    uint  sgitg [[simdgroup_index_in_threadgroup]],
+    uint  tiisg [[thread_index_in_simdgroup]]) {
+
+    const int t = (int) tgpig.x;
+    if (t >= args.L) return;
+
+    const int C = args.C;
+    // Element strides — let the same kernel handle both
+    // [T, C] (sxt=1, sxc=L) and [C, T] (sxt=C, sxc=1) layouts.
+    const int sxt = args.sxt, sxc = args.sxc;
+    const int syt = args.syt, syc = args.syc;
+
+    device const float * x_t = x + (size_t) t * sxt;
+    device       float * y_t = y + (size_t) t * syt;
+
+    // ---- Mean ----
+    float my_sum = 0.0f;
+    for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) {
+        my_sum += x_t[(size_t) c * sxc];
+    }
+    // Simdgroup reduce within the simdgroup.
+    my_sum = simd_sum(my_sum);
+    if (tiisg == 0) {
+        shared[sgitg] = my_sum;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    // First simdgroup reduces partial sums.
+    if (sgitg == 0) {
+        const uint n_sg = (ntg.x + 31) / 32;
+        float total = (tiisg < n_sg) ? shared[tiisg] : 0.0f;
+        total = simd_sum(total);
+        if (tiisg == 0) shared[0] = total;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float mean = shared[0] / (float) C;
+
+    // ---- Variance ----
+    float my_sq = 0.0f;
+    for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) {
+        const float d = x_t[(size_t) c * sxc] - mean;
+        my_sq += d * d;
+    }
+    my_sq = simd_sum(my_sq);
+    if (tiisg == 0) {
+        shared[sgitg] = my_sq;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (sgitg == 0) {
+        const uint n_sg = (ntg.x + 31) / 32;
+        float total = (tiisg < n_sg) ? shared[tiisg] : 0.0f;
+        total = simd_sum(total);
+        if (tiisg == 0) shared[0] = total;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float inv_std = rsqrt(shared[0] / (float) C + args.eps);
+
+    // ---- Apply affine ----
+    for (int c = (int) tpitg.x; c < C; c += (int) ntg.x) {
+        const float xv = x_t[(size_t) c * sxc];
+        y_t[(size_t) c * syc] = (xv - mean) * inv_std * g[c] + b[c];
+    }
+}
+
+// Supertonic fused (x + bias) * gamma + residual.  Replaces the
+// ggml_add → ggml_mul → ggml_add chain at the end of every ConvNeXt
+// block.  One threadgroup per channel; threads stride over the time
+// dim.  bias and gamma are constants for the channel — load once into
+// registers and broadcast across timesteps.
+//
+// Layout: x, residual, y are f32 contiguous [L, C, 1, 1] with element
+// (t, c) at offset c*L + t.  bias and gamma are f32 [C].
+kernel void kernel_supertonic_pw2_residual_f32(
+    constant   ggml_metal_kargs_supertonic_pw2_residual & args,
+    device  const float * x,
+    device  const float * bias,
+    device  const float * gamma,
+    device  const float * residual,
+    device        float * y,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int c = (int) tgpig.x;
+    if (c >= args.C) return;
+
+    const int L = args.L;
+    const int sxt = args.sxt, sxc = args.sxc;
+    const int syt = args.syt, syc = args.syc;
+    const int srt = args.srt, src = args.src;
+    const float bv = bias[c];
+    const float gv = gamma[c];
+
+    device const float * x_c = x        + (size_t) c * sxc;
+    device const float * r_c = residual + (size_t) c * src;
+    device       float * y_c = y        + (size_t) c * syc;
+
+    for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) {
+        y_c[(size_t) t * syt] = r_c[(size_t) t * srt] + (x_c[(size_t) t * sxt] + bv) * gv;
+    }
+}
+
+// Fused bias + GELU (erf form):  y[t, c] = gelu_erf(x[t, c] + bias[c])
+//                                        = 0.5 * v * (1 + erf(v / sqrt(2)))
+// Same channel-major contiguous layout as pw2_residual.  One threadgroup
+// per channel; threads in the threadgroup stride over the time axis so
+// the per-channel bias load lives in a register for the whole row.
+kernel void kernel_supertonic_bias_gelu_f32(
+    constant   ggml_metal_kargs_supertonic_bias_gelu & args,
+    device  const float * x,
+    device  const float * bias,
+    device        float * y,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int c = (int) tgpig.x;
+    if (c >= args.C) return;
+
+    const int L = args.L;
+    const int sxt = args.sxt, sxc = args.sxc;
+    const int syt = args.syt, syc = args.syc;
+    const float bv = bias[c];
+
+    device const float * x_c = x + (size_t) c * sxc;
+    device       float * y_c = y + (size_t) c * syc;
+
+    // Use the same erf_approx template as kernel_gelu_erf_f32 above
+    // (Abramowitz & Stegun 7.1.26 / Hastings approximation) so the fused
+    // bias_gelu output is bit-identical to the unfused add + gelu_erf path.
+    for (int t = (int) tpitg.x; t < L; t += (int) ntg.x) {
+        const float v = x_c[(size_t) t * sxt] + bv;
+        y_c[(size_t) t * syt] = 0.5f * v * (1.0f + erf_approx<float>(v * SQRT_2_INV));
+    }
+}
+
+// Replicate-pad on the time dimension (ne[0]) of a [L_in, C, 1, 1] f32
+// tensor.  Output is [L_in + pad_left + pad_right, C, 1, 1]; for each
+// output time `t`, we copy from `x[clamp(t - pad_left, 0, L_in - 1), c]`.
+// One threadgroup per channel; threads in the threadgroup stride over
+// the (longer) output time axis.
+kernel void kernel_supertonic_edge_pad_1d_f32(
+    constant   ggml_metal_kargs_supertonic_edge_pad_1d & args,
+    device  const float * x,
+    device        float * y,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int c = (int) tgpig.x;
+    if (c >= args.C) return;
+
+    const int L_in  = args.L_in;
+    const int L_out = args.L_out;
+    const int pad   = args.pad_left;
+    const int sxt = args.sxt, sxc = args.sxc;
+    const int syt = args.syt, syc = args.syc;
+
+    device const float * x_c = x + (size_t) c * sxc;
+    device       float * y_c = y + (size_t) c * syc;
+
+    for (int t = (int) tpitg.x; t < L_out; t += (int) ntg.x) {
+        int src_t = t - pad;
+        if (src_t < 0)      src_t = 0;
+        if (src_t >= L_in)  src_t = L_in - 1;
+        y_c[(size_t) t * syt] = x_c[(size_t) src_t * sxt];
+    }
+}
+
 kernel void kernel_arange_f32(
     constant   ggml_metal_kargs_arange & args,
     device        char * dst,
diff --git a/src/ggml.c b/src/ggml.c
index 0142498d96..5aabdcd016 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -1063,9 +1063,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_SGD",
 
     "GLU",
+
+    "SUPERTONIC_DEPTHWISE_1D",
+    "SUPERTONIC_LAYER_NORM_CHANNEL",
+    "SUPERTONIC_PW2_RESIDUAL",
+    "SUPERTONIC_BIAS_GELU",
+    "SUPERTONIC_EDGE_PAD_1D",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1173,9 +1179,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "sgd(x)",
 
     "glu(x)",
+
+    "supertonic_depthwise_1d(x,w,b)",
+    "supertonic_layer_norm_channel(x,g,b)",
+    "supertonic_pw2_residual(x,b,gamma,res)",
+    "supertonic_bias_gelu(x,b)",
+    "supertonic_edge_pad_1d(x,pad_l,pad_r)",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -5123,6 +5135,295 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     return result;
 }
 
+// ggml_supertonic_depthwise_1d
+
+// Internal helper: depthwise_1d with explicit layout + causal flags.
+// layout = 0 → [T, C] (T inner) ; layout = 1 → [C, T] (C inner).
+// causal = 0 → symmetric edge-clamp (vector_estimator);
+// causal = 1 → causal-left padding (vocoder).
+static struct ggml_tensor * ggml_supertonic_depthwise_1d_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * bias,
+        int                   dilation,
+        int                   layout,
+        int                   causal) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(w->type == GGML_TYPE_F32);
+    GGML_ASSERT(bias == NULL || bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(w));
+    GGML_ASSERT(bias == NULL || ggml_is_contiguous(bias));
+    GGML_ASSERT(dilation >= 1);
+
+    // a: [L, C, 1, 1] (layout=0) or [C, L, 1, 1] (layout=1).
+    // w in conv kernel layout [K, 1, C, 1] (ggml_im2col-consumable),
+    // K in {3, 5, 7} (3/5 for vector_estimator symmetric, 7 for vocoder causal).
+    // bias: [C].
+    const int K = (int) w->ne[0];
+    GGML_ASSERT(K == 3 || K == 5 || K == 7);
+    GGML_ASSERT(w->ne[1] == 1);
+    const int64_t C_dim = (layout == 0) ? a->ne[1] : a->ne[0];
+    GGML_ASSERT(w->ne[2] == C_dim);
+    GGML_ASSERT(bias == NULL || bias->ne[0] == C_dim);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
+
+    int32_t params[4] = { K, dilation, layout, causal };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SUPERTONIC_DEPTHWISE_1D;
+    result->src[0] = a;
+    result->src[1] = w;
+    result->src[2] = bias;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_supertonic_depthwise_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * bias,
+        int                   dilation) {
+    return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation,
+                                             /*layout=*/0, /*causal=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_depthwise_1d_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * bias,
+        int                   dilation) {
+    return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation,
+                                             /*layout=*/1, /*causal=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_depthwise_1d_causal_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * bias,
+        int                   dilation) {
+    return ggml_supertonic_depthwise_1d_impl(ctx, a, w, bias, dilation,
+                                             /*layout=*/1, /*causal=*/1);
+}
+
+// ggml_supertonic_layer_norm_channel
+
+// Internal helper: layer_norm_channel with explicit layout flag.
+// layout = 0 → [T, C] (T inner) ; layout = 1 → [C, T] (C inner).
+static struct ggml_tensor * ggml_supertonic_layer_norm_channel_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * b,
+        float                 eps,
+        int                   layout) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(g->type == GGML_TYPE_F32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(b));
+
+    GGML_ASSERT(a->ne[2] == 1 && a->ne[3] == 1);
+    // The "channel" dimension is the OUTER one for [T, C] (ne[1]) and the
+    // INNER one for [C, T] (ne[0]).  gamma/beta have length == C.
+    const int64_t C_dim = (layout == 0) ? a->ne[1] : a->ne[0];
+    GGML_ASSERT(g->ne[0] == C_dim);
+    GGML_ASSERT(b->ne[0] == C_dim);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
+
+    // op_params: [eps (f32), layout (i32)].
+    float params[2];
+    memcpy(&params[0], &eps, sizeof(float));
+    ((int32_t *) &params[1])[0] = layout;
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SUPERTONIC_LAYER_NORM_CHANNEL;
+    result->src[0] = a;
+    result->src[1] = g;
+    result->src[2] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_supertonic_layer_norm_channel(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * b,
+        float                 eps) {
+    return ggml_supertonic_layer_norm_channel_impl(ctx, a, g, b, eps, /*layout=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_layer_norm_channel_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * b,
+        float                 eps) {
+    return ggml_supertonic_layer_norm_channel_impl(ctx, a, g, b, eps, /*layout=*/1);
+}
+
+// ggml_supertonic_pw2_residual
+
+static struct ggml_tensor * ggml_supertonic_pw2_residual_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias,
+        struct ggml_tensor  * gamma,
+        struct ggml_tensor  * residual,
+        int                   layout) {
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(gamma->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(x));
+    GGML_ASSERT(ggml_is_contiguous(bias));
+    GGML_ASSERT(ggml_is_contiguous(gamma));
+    GGML_ASSERT(ggml_is_contiguous(residual));
+    GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1);
+    GGML_ASSERT(residual->ne[0] == x->ne[0]);
+    GGML_ASSERT(residual->ne[1] == x->ne[1]);
+    const int64_t C_dim = (layout == 0) ? x->ne[1] : x->ne[0];
+    GGML_ASSERT(bias->ne[0] == C_dim);
+    GGML_ASSERT(gamma->ne[0] == C_dim);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type,
+            x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
+
+    int32_t params[1] = { layout };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SUPERTONIC_PW2_RESIDUAL;
+    result->src[0] = x;
+    result->src[1] = bias;
+    result->src[2] = gamma;
+    result->src[3] = residual;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_supertonic_pw2_residual(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias,
+        struct ggml_tensor  * gamma,
+        struct ggml_tensor  * residual) {
+    return ggml_supertonic_pw2_residual_impl(ctx, x, bias, gamma, residual, /*layout=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_pw2_residual_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias,
+        struct ggml_tensor  * gamma,
+        struct ggml_tensor  * residual) {
+    return ggml_supertonic_pw2_residual_impl(ctx, x, bias, gamma, residual, /*layout=*/1);
+}
+
+// ggml_supertonic_bias_gelu
+
+static struct ggml_tensor * ggml_supertonic_bias_gelu_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias,
+        int                   layout) {
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(bias->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(x));
+    GGML_ASSERT(ggml_is_contiguous(bias));
+    GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1);
+    const int64_t C_dim = (layout == 0) ? x->ne[1] : x->ne[0];
+    GGML_ASSERT(bias->ne[0] == C_dim);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type,
+            x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
+
+    int32_t params[1] = { layout };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SUPERTONIC_BIAS_GELU;
+    result->src[0] = x;
+    result->src[1] = bias;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_supertonic_bias_gelu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias) {
+    return ggml_supertonic_bias_gelu_impl(ctx, x, bias, /*layout=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_bias_gelu_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * bias) {
+    return ggml_supertonic_bias_gelu_impl(ctx, x, bias, /*layout=*/1);
+}
+
+// ggml_supertonic_edge_pad_1d
+
+static struct ggml_tensor * ggml_supertonic_edge_pad_1d_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        int                   pad_left,
+        int                   pad_right,
+        int                   layout) {
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(x));
+    GGML_ASSERT(x->ne[2] == 1 && x->ne[3] == 1);
+    GGML_ASSERT(pad_left >= 0 && pad_right >= 0);
+
+    int64_t out_ne0, out_ne1;
+    if (layout == 0) {
+        const int64_t L_in = x->ne[0];
+        const int64_t C    = x->ne[1];
+        out_ne0 = L_in + pad_left + pad_right;
+        out_ne1 = C;
+    } else {
+        const int64_t C    = x->ne[0];
+        const int64_t L_in = x->ne[1];
+        out_ne0 = C;
+        out_ne1 = L_in + pad_left + pad_right;
+    }
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, x->type,
+            out_ne0, out_ne1, x->ne[2], x->ne[3]);
+
+    int32_t params[3] = { pad_left, pad_right, layout };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_SUPERTONIC_EDGE_PAD_1D;
+    result->src[0] = x;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_supertonic_edge_pad_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        int                   pad_left,
+        int                   pad_right) {
+    return ggml_supertonic_edge_pad_1d_impl(ctx, x, pad_left, pad_right, /*layout=*/0);
+}
+
+struct ggml_tensor * ggml_supertonic_edge_pad_1d_ct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        int                   pad_left,
+        int                   pad_right) {
+    return ggml_supertonic_edge_pad_1d_impl(ctx, x, pad_left, pad_right, /*layout=*/1);
+}
+
 // ggml_roll
 
 struct ggml_tensor * ggml_roll(