diff --git a/cpp/common/build-info.cpp b/cpp/common/build-info.cpp
index 201406bd..085332fc 100644
--- a/cpp/common/build-info.cpp
+++ b/cpp/common/build-info.cpp
@@ -3,8 +3,8 @@
 #include <cstdio>
 #include <string>
 
-int LLAMA_BUILD_NUMBER = 9204;
-char const * LLAMA_COMMIT = "726704a";
+int LLAMA_BUILD_NUMBER = 9222;
+char const * LLAMA_COMMIT = "9a532ae";
 char const * LLAMA_COMPILER = "unknown";
 char const * LLAMA_BUILD_TARGET = "unknown";
 
diff --git a/cpp/ggml-ext.h b/cpp/ggml-ext.h
index 04af5cda..321210bc 100644
--- a/cpp/ggml-ext.h
+++ b/cpp/ggml-ext.h
@@ -12,45 +12,45 @@
 // Meta backend
 //
 
-#define LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES 16
+#define LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES 16
 
-enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis {
+enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis {
     // tensor split by tensor dimensions:
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_0   =  0,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_1   =  1,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_2   =  2,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_3   =  3,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_0   =  0,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_1   =  1,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_2   =  2,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_3   =  3,
 
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
 
     // for internal bookkeeping only:
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
+    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
 };
-LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API const char * lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis_name(enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis split_axis);
+LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API const char * lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis_name(enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis split_axis);
 
-struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state {
-    enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis axis;
+struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state {
+    enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis axis;
 
-    // for tensors with axis >= 0 && axis < LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_MAX_DIMS:
+    // for tensors with axis >= 0 && axis < LM_GGML_MAX_DIMS:
     //   - each device has a slice of the tensor along the split axis
     //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
     //   - some tensors have an inhomogenenous data layout along the split axis,
     //     those tensors are divided into segments which are each individually split across devices
-    //   - ne has one entry per segment and device that add up to lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor::ne for that axis,
+    //   - ne has one entry per segment and device that add up to lm_ggml_tensor::ne for that axis,
     //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
     //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
     //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
-    int64_t  ne[16*LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES];
+    int64_t  ne[16*LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES];
     uint32_t n_segments;
 };
 
 // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
-typedef struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state(*lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t)(const struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor * tensor, void * userdata);
+typedef struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state(*lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t)(const struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor * tensor, void * userdata);
 
 // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
 // TODO: this looks a bit strange - a backend API creates a device. I think we should try
 //       express this as a backend registry functionality instead
-LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_device(
-    lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t * devs, size_t n_devs, lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_device(
+    lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t * devs, size_t n_devs, lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
diff --git a/cpp/ggml-hexagon/ggml-hexagon.cpp b/cpp/ggml-hexagon/ggml-hexagon.cpp
index 320c9b4d..a4cb5fc5 100644
--- a/cpp/ggml-hexagon/ggml-hexagon.cpp
+++ b/cpp/ggml-hexagon/ggml-hexagon.cpp
@@ -2744,6 +2744,18 @@ static bool lm_ggml_hexagon_supported_ssm_conv(const struct lm_ggml_hexagon_sess
     return true;
 }
 
+static bool lm_ggml_hexagon_supported_pad(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
+    // PAD is offloaded only when both the input (src[0]) and the output tensor are F32.
+    LM_GGML_UNUSED(sess);  // the session does not influence the decision
+
+    const struct lm_ggml_tensor * input  = op->src[0];
+    const struct lm_ggml_tensor * output = op;
+
+    const bool input_is_f32  = input->type  == LM_GGML_TYPE_F32;
+    const bool output_is_f32 = output->type == LM_GGML_TYPE_F32;
+    return input_is_f32 && output_is_f32;
+}
+
 static bool lm_ggml_hexagon_supported_cumsum(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
     const struct lm_ggml_tensor * src0 = op->src[0];
     const struct lm_ggml_tensor * dst  = op;
@@ -2816,6 +2828,21 @@ static bool lm_ggml_hexagon_supported_solve_tri(const struct lm_ggml_hexagon_ses
     return true;
 }
 
+static bool lm_ggml_hexagon_supported_tri(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
+    // TRI is offloaded only for contiguous F32 tensors whose input and output shapes match.
+    LM_GGML_UNUSED(sess);
+
+    const struct lm_ggml_tensor * input  = op->src[0];
+    const struct lm_ggml_tensor * output = op;
+
+    const bool both_f32 = input->type == LM_GGML_TYPE_F32 && output->type == LM_GGML_TYPE_F32;
+    if (!both_f32) {
+        return false;
+    }
+
+    return lm_ggml_are_same_shape(input, output) && lm_ggml_is_contiguous(input) && lm_ggml_is_contiguous(output);
+}
+
 static const char * lm_ggml_backend_hexagon_name(lm_ggml_backend_t backend) {
     auto sess = static_cast<lm_ggml_hexagon_session *>(backend->context);
     return sess->c_name();
@@ -2857,6 +2884,9 @@ static htp_op_code op_remap_to_htp(const lm_ggml_tensor * t) {
         case LM_GGML_OP_FILL:            return HTP_OP_FILL;
         case LM_GGML_OP_DIAG:            return HTP_OP_DIAG;
         case LM_GGML_OP_SOLVE_TRI:       return HTP_OP_SOLVE_TRI;
+        case LM_GGML_OP_TRI:             return HTP_OP_TRI;
+        case LM_GGML_OP_PAD:             return HTP_OP_PAD;
+
         case LM_GGML_OP_UNARY:
             switch (lm_ggml_get_unary_op(t)) {
                 case LM_GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -3416,6 +3446,14 @@ static bool lm_ggml_backend_hexagon_device_supports_op(lm_ggml_backend_dev_t dev
             supp = lm_ggml_hexagon_supported_solve_tri(sess, op);
             break;
 
+        case LM_GGML_OP_TRI:
+            supp = lm_ggml_hexagon_supported_tri(sess, op);
+            break;
+
+        case LM_GGML_OP_PAD:
+            supp = lm_ggml_hexagon_supported_pad(sess, op);
+            break;
+
         default:
             break;
     }
diff --git a/cpp/ggml-hexagon/htp/CMakeLists.txt b/cpp/ggml-hexagon/htp/CMakeLists.txt
index bcadac11..36f92324 100644
--- a/cpp/ggml-hexagon/htp/CMakeLists.txt
+++ b/cpp/ggml-hexagon/htp/CMakeLists.txt
@@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED
     diag-ops.c
     solve-tri-ops.c
     gated-delta-net-ops.c
+    pad-ops.c
 )
 
 target_compile_definitions(${HTP_LIB} PRIVATE
diff --git a/cpp/ggml-hexagon/htp/htp-ctx.h b/cpp/ggml-hexagon/htp/htp-ctx.h
index 92f02eac..6fe3e6c7 100644
--- a/cpp/ggml-hexagon/htp/htp-ctx.h
+++ b/cpp/ggml-hexagon/htp/htp-ctx.h
@@ -107,5 +107,7 @@ int op_fill(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
 int op_gated_delta_net(struct htp_ops_context * octx);
+int op_tri(struct htp_ops_context * octx);
+int op_pad(struct htp_ops_context * octx);
 
 #endif /* HTP_CTX_H */
diff --git a/cpp/ggml-hexagon/htp/htp-msg.h b/cpp/ggml-hexagon/htp/htp-msg.h
index f5b31a29..9ef4889f 100644
--- a/cpp/ggml-hexagon/htp/htp-msg.h
+++ b/cpp/ggml-hexagon/htp/htp-msg.h
@@ -28,7 +28,7 @@ enum htp_status {
     HTP_STATUS_VTCM_TOO_SMALL = 5,
 };
 
-// The values must match the lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_type.
+// The values must match the lm_ggml_type.
 // Duplicated here because we can't include full ggml.h in the htp build.
 // We have some static_asserts in the cpp code to ensure things are in sync.
 enum htp_data_type {
@@ -130,7 +130,7 @@ struct htp_tensor {
     uint32_t data;                // Buffer offset in the messages, and data pointer on the NSP
     uint32_t type;                // Data type
     uint32_t ne[HTP_MAX_DIMS];    // Number of elements
-    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor)
+    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h lm_ggml_tensor)
 };
 
 #define HTP_MAX_OP_PARAMS 64
diff --git a/cpp/ggml-hexagon/htp/htp-ops.h b/cpp/ggml-hexagon/htp/htp-ops.h
index fe2bcfd5..010cd706 100644
--- a/cpp/ggml-hexagon/htp/htp-ops.h
+++ b/cpp/ggml-hexagon/htp/htp-ops.h
@@ -86,6 +86,8 @@ enum htp_op_code {
     HTP_OP_SOLVE_TRI,
     HTP_OP_L2_NORM,
     HTP_OP_GATED_DELTA_NET,
+    HTP_OP_TRI,
+    HTP_OP_PAD,
 
     HTP_OP_INVALID
 };
diff --git a/cpp/ggml-hexagon/htp/main.c b/cpp/ggml-hexagon/htp/main.c
index 83baf230..868867b1 100644
--- a/cpp/ggml-hexagon/htp/main.c
+++ b/cpp/ggml-hexagon/htp/main.c
@@ -595,9 +595,15 @@ static int execute_op(struct htp_ops_context * octx) {
         case HTP_OP_SOLVE_TRI:
             return op_solve_tri(octx);
 
+        case HTP_OP_PAD:
+            return op_pad(octx);
+
         case HTP_OP_GATED_DELTA_NET:
             return op_gated_delta_net(octx);
 
+        case HTP_OP_TRI:
+            return op_tri(octx);
+
         case HTP_OP_INVALID:
             break;
 
diff --git a/cpp/ggml-hexagon/htp/pad-ops.c b/cpp/ggml-hexagon/htp/pad-ops.c
new file mode 100644
index 00000000..01d8550a
--- /dev/null
+++ b/cpp/ggml-hexagon/htp/pad-ops.c
@@ -0,0 +1,545 @@
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <HAP_farf.h>
+#include <HAP_perf.h>
+
+#include <string.h>
+
+#include "hex-dma.h"
+#include "hvx-utils.h"
+
+#define LM_GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-ops.h"
+
+/* Circular wrap: maps any integer x into [0, n); n must be non-zero. Evaluated in 64 bits so that n above 2^30 cannot overflow the intermediate sum. */
+static inline uint32_t wrap_around(int32_t x, uint32_t n) {
+    return (uint32_t)((((int64_t)x % (int64_t)n) + (int64_t)n) % (int64_t)n);
+}
+
+/* Split a flat dst row index ir into (i1, i2, i3), i1 varying fastest; ne1 and ne2 are the extents of dims 1 and 2 and must be non-zero */
+static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2, uint32_t *i1, uint32_t *i2, uint32_t *i3) {
+    const uint32_t q = ir / ne1;  /* number of complete dim-1 runs that precede ir */
+    *i1 = ir - q * ne1;
+    *i2 = q % ne2;
+    *i3 = ir / (ne1 * ne2);
+}
+
+/* Return 1 when row (i1,i2,i3) lies in [lpK, neK - rpK) for every dim K = 1..3 (the non-padded interior), otherwise 0 */
+static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3,
+                                   int32_t lp1, int32_t rp1, uint32_t ne1,
+                                   int32_t lp2, int32_t rp2, uint32_t ne2,
+                                   int32_t lp3, int32_t rp3, uint32_t ne3) {
+    if ((int32_t)i1 < lp1 || (int32_t)i1 >= (int32_t)ne1 - rp1) { return 0; }
+    if ((int32_t)i2 < lp2 || (int32_t)i2 >= (int32_t)ne2 - rp2) { return 0; }
+    return (int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3;
+}
+
+/* Compute the DDR src row pointer for a zero-pad interior row: each dst index is shifted back by its left pad and scaled by src->nb[K]. Callers must pass interior rows only, otherwise the unsigned subtraction wraps. */
+static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src,
+                                               uint32_t i1, uint32_t i2, uint32_t i3,
+                                               int32_t lp1, int32_t lp2, int32_t lp3) {
+    return (const uint8_t *) src->data
+        + (i1 - (uint32_t)lp1) * src->nb[1]
+        + (i2 - (uint32_t)lp2) * src->nb[2]
+        + (i3 - (uint32_t)lp3) * src->nb[3];
+}
+
+/* Compute the DDR src row pointer for a circular row (wrap-around indexing): (iK - lpK) is wrapped into [0, src->ne[K]) before being scaled by src->nb[K], so any dst row maps onto a src row */
+static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src,
+                                                    uint32_t i1, uint32_t i2, uint32_t i3,
+                                                    int32_t lp1, int32_t lp2, int32_t lp3) {
+    return (const uint8_t *) src->data
+        + wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1]
+        + wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2]
+        + wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3];
+}
+
+struct htp_pad_context {  // parameters of one PAD op, read by the pad_job_per_thread_* workers
+    struct htp_ops_context * octx;  // op context: gives access to src[0], dst, the scratchpads and the DMA queues
+    // lpK / rpK: left / right padding (in elements) along dst dimension K
+    int32_t  lp0, rp0;
+    int32_t  lp1, rp1;
+    int32_t  lp2, rp2;
+    int32_t  lp3, rp3;
+
+    uint32_t nrows_per_thread;  // rows per worker; thread ith starts at nrows_per_thread * ith
+    uint32_t total_dst_rows;    // upper bound used to clamp each worker's row range
+
+    size_t   type_size;         // size of one element in bytes
+
+    // Row sizes for DMA kernel (populated when VTCM is available)
+    size_t   src_row_size;
+    size_t   src_row_size_aligned;
+    size_t   dst_row_size;
+    size_t   dst_row_size_aligned;
+};
+// Declares the locals shared by every PAD worker; expects `octx`, `pctx` and `ith` in scope. Thread ith handles dst rows [row_start, row_end).
+#define htp_pad_preamble                            \
+    const struct htp_tensor * src = octx->src[0];   \
+    const struct htp_tensor * dst = octx->dst;      \
+                                                    \
+    const uint32_t ne00 = src->ne[0];               \
+    const uint32_t nb00 = src->nb[0];               \
+                                                    \
+    const uint32_t ne0 = dst->ne[0];                \
+    const uint32_t ne1 = dst->ne[1];                \
+    const uint32_t ne2 = dst->ne[2];                \
+    const uint32_t ne3 = dst->ne[3];                \
+                                                    \
+    const uint32_t nb1 = dst->nb[1];                \
+    const uint32_t nb2 = dst->nb[2];                \
+    const uint32_t nb3 = dst->nb[3];                \
+                                                    \
+    const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \
+    const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \
+    const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \
+    const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \
+                                                    \
+    const size_t type_size = pctx->type_size;       \
+                                                    \
+    const uint32_t row_start = pctx->nrows_per_thread * ith;                                 \
+    const uint32_t row_end   = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows);
+
+// Extra locals for the DMA kernels: cached row sizes, this thread's slice of the src/dst scratchpads and its DMA queue; expects `octx`, `pctx` and `ith` in scope.
+#define htp_pad_dma_preamble                                        \
+    const size_t src_row_size         = pctx->src_row_size;         \
+    const size_t src_row_size_aligned = pctx->src_row_size_aligned; \
+    const size_t dst_row_size         = pctx->dst_row_size;         \
+    const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \
+                                                                    \
+    uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \
+    uint8_t * dst_spad_base = octx->dst_spad.data  + ith * octx->dst_spad.size_per_thread;  \
+                                                                                            \
+    dma_queue * dma = octx->ctx->dma[ith];
+
+// ---------------------------------------------------------------------------
+// HVX zero-PAD worker (no scratchpad staging): border rows are zero-filled, interior rows become [lp0 zeros | src row | rp0 zeros]
+// ---------------------------------------------------------------------------
+
+static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) {
+    const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
+    struct htp_ops_context * octx = pctx->octx;
+    htp_pad_preamble;
+
+    const uint64_t t_begin = HAP_perf_get_qtimer_count();
+
+    // Walk the flat dst rows assigned to this thread.
+    for (uint32_t ir = row_start; ir < row_end; ir++) {
+        uint32_t i1, i2, i3;
+        pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
+
+        uint8_t * const row_out = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
+
+        // Rows inside the padding of dims 1..3 have no source row: write zeros only.
+        if (!pad_is_interior(i1, i2, i3, lp1, rp1, ne1, lp2, rp2, ne2, lp3, rp3, ne3)) {
+            hvx_splat_f32_u(row_out, 0.0f, ne0);
+            continue;
+        }
+
+        const uint8_t * const row_in = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
+
+        // Left padding along dim 0.
+        if (lp0 > 0) {
+            hvx_splat_f32_u(row_out, 0.0f, (uint32_t)lp0);
+        }
+
+        // Payload: bulk HVX copy when src elements are densely packed, otherwise gather element by element.
+        uint8_t * const body_out = row_out + (size_t)lp0 * type_size;
+        if (nb00 == type_size) {
+            hvx_copy_f32_uu(body_out, row_in, ne00);
+        } else {
+            for (uint32_t i = 0; i < ne00; i++) {
+                memcpy(body_out + i * type_size,
+                       row_in + (size_t)i * nb00, type_size);
+            }
+        }
+
+        // Right padding along dim 0.
+        if (rp0 > 0) {
+            hvx_splat_f32_u(row_out + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0);
+        }
+    }
+
+    const uint64_t t_end = HAP_perf_get_qtimer_count();
+
+    // Per-thread trace with tensor shapes, row range and elapsed time.
+    FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
+         ith, nth,
+         src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         row_start, row_end,
+         (unsigned) HAP_perf_qtimer_count_to_us(t_end - t_begin));
+}
+
+// ---------------------------------------------------------------------------
+// HVX + DMA PAD kernel — aligned, double-buffered: each dst row is assembled in a per-thread scratchpad slot and written out through the thread's DMA queue
+// ---------------------------------------------------------------------------
+
+static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) {
+    const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
+    struct htp_ops_context * octx = pctx->octx;
+    htp_pad_preamble;
+    htp_pad_dma_preamble;
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    // -----------------------------------------------------------------------
+    // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
+    // double-buffer pipeline before the main loop begins.
+    // -----------------------------------------------------------------------
+    for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
+        uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
+        uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
+
+        dma_queue_push_vtcm_to_ddr(dma,
+            dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
+            dst_row_size, dst_row_size_aligned, 0);  // last argument 0: dummy entry that only carries the dst slot pointer
+
+        uint32_t i1, i2, i3;
+        pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
+        const int interior = pad_is_interior(i1, i2, i3,
+                                             lp1, rp1, ne1,
+                                             lp2, rp2, ne2,
+                                             lp3, rp3, ne3);
+
+        const uint8_t * src_ptr = interior
+            ? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL;
+
+        // Interior row: real DMA (1 row) from DDR to VTCM.
+        // Border row: null DMA (nrows=0)
+        dma_queue_push_ddr_to_vtcm(dma,
+            dma_make_ptr(src_spad_cur,
+                         src_ptr ? src_ptr : (const uint8_t *)src_spad_cur),
+            src_row_size_aligned, src_row_size, src_ptr ? 1 : 0);
+    }
+
+    // -----------------------------------------------------------------------
+    // Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops,
+    // push dst DMA and prefetch src for the next+1 row.
+    // -----------------------------------------------------------------------
+    for (uint32_t ir = row_start; ir < row_end; ir++) {
+        uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;  // dst-write entry: its source side is the dst slot
+        uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;  // src-read entry: its destination side is the src slot
+
+        uint32_t i1, i2, i3;
+        pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
+
+        uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
+
+        const int interior = pad_is_interior(i1, i2, i3,
+                                             lp1, rp1, ne1,
+                                             lp2, rp2, ne2,
+                                             lp3, rp3, ne3);
+
+        if (!interior) {
+            hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);
+        } else {
+            hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);  // zero the whole row first so the lp0 / rp0 margins are cleared
+            // NOTE(review): the copy below reads ne00 packed floats from the staged row, i.e. it presumes nb00 == type_size — confirm the caller only selects this kernel then
+            uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size;
+
+            if ((uintptr_t)dst_interior % VLEN == 0) {  // lp0 keeps the payload on a VLEN boundary: aligned-dst copy
+                hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00);
+            } else {
+                hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00);
+            }
+        }
+
+        dma_queue_push_vtcm_to_ddr(dma,
+            dma_make_ptr(dst_ptr, dst_spad_cur),
+            dst_row_size, dst_row_size_aligned, 1);
+
+        const uint32_t next_row = ir + 2;  // two rows ahead: reuses the src slot that was just consumed
+        if (next_row < row_end) {
+            uint32_t ni1, ni2, ni3;
+            pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3);
+            const int next_interior = pad_is_interior(ni1, ni2, ni3,
+                                                      lp1, rp1, ne1,
+                                                      lp2, rp2, ne2,
+                                                      lp3, rp3, ne3);
+            const uint8_t * next_src_ptr = next_interior
+                ? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL;
+
+            dma_queue_push_ddr_to_vtcm(dma,
+                dma_make_ptr(src_spad_cur,
+                             next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur),
+                src_row_size_aligned, src_row_size, next_src_ptr ? 1 : 0);
+        }
+    }
+
+    dma_queue_flush(dma);  // presumably waits for the outstanding dst writes — confirm against hex-dma.h
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
+         ith, nth,
+         src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         row_start, row_end,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+// ---------------------------------------------------------------------------
+// HVX circular PAD kernel (no scratchpad staging): each dst row is [last lp0 src elements | src row | first rp0 src elements] of the wrapped src row
+// ---------------------------------------------------------------------------
+
+static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) {
+    const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
+    struct htp_ops_context * octx = pctx->octx;
+    htp_pad_preamble;
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
+        uint32_t i1, i2, i3;
+        pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3);
+
+        uint8_t       * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
+        const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
+
+        if (nb00 == type_size) {
+            // NOTE(review): the (ne00 - lp0) offset assumes lp0 <= ne00, and the tail copy assumes rp0 <= ne00; larger pads would read outside the src row — confirm the caller rejects them
+            if (lp0 > 0) {
+                if ((uint32_t)lp0 < 32) {  // short runs (< 32 elements) use memcpy instead of the HVX copy
+                    memcpy(dst_ptr,
+                           src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
+                           (size_t)lp0 * type_size);
+                } else {
+                    hvx_copy_f32_uu(dst_ptr,
+                                    src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
+                                    (uint32_t)lp0);
+                }
+            }
+            hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00);  // payload: the full src row
+            if (rp0 > 0) {
+                if ((uint32_t)rp0 < 32) {
+                    memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size,
+                           src_row,
+                           (size_t)rp0 * type_size);
+                } else {
+                    hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size,
+                                    src_row,
+                                    (uint32_t)rp0);
+                }
+            }
+        } else {  // src elements are strided (nb00 != type_size): copy one float at a time
+            for (uint32_t i = 0; i < (uint32_t)lp0; i++) {
+                *(float *)(dst_ptr + i * type_size) =
+                    *(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00);
+            }
+            for (uint32_t i = 0; i < ne00; i++) {
+                *(float *)(dst_ptr + ((size_t)lp0 + i) * type_size) =
+                    *(const float *)(src_row + (size_t)i * nb00);
+            }
+            for (uint32_t i = 0; i < (uint32_t)rp0; i++) {
+                *(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) =
+                    *(const float *)(src_row + (size_t)i * nb00);
+            }
+        }
+    }
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
+         ith, nth,
+         src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         row_start, row_end,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+// ---------------------------------------------------------------------------
+// HVX + DMA circular PAD kernel — aligned, double-buffered: the wrapped src row is staged in a scratchpad slot, the padded dst row is assembled next to it and written out by DMA
+// ---------------------------------------------------------------------------
+
+static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) {
+    const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
+    struct htp_ops_context * octx = pctx->octx;
+    htp_pad_preamble;
+    htp_pad_dma_preamble;
+
+    uint64_t t1, t2;
+    t1 = HAP_perf_get_qtimer_count();
+
+    // -----------------------------------------------------------------------
+    // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
+    // double-buffer pipeline.  Every row is a real src DMA (no null DMAs).
+    // -----------------------------------------------------------------------
+    for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
+        uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
+        uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
+
+        dma_queue_push_vtcm_to_ddr(dma,
+            dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
+            dst_row_size, dst_row_size_aligned, 0);  // last argument 0: dummy entry that only carries the dst slot pointer
+
+        uint32_t pi1, pi2, pi3;
+        pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3);
+        dma_queue_push_ddr_to_vtcm(dma,
+            dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)),
+            src_row_size_aligned, src_row_size, 1);
+    }
+
+    // -----------------------------------------------------------------------
+    // Main loop: pop completed DMAs, assemble circular row in VTCM with
+    // aligned HVX ops, push dst DMA and prefetch src for the next+1 row.
+    // -----------------------------------------------------------------------
+    for (uint32_t ir = row_start; ir < row_end; ir++) {
+        uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;  // dst-write entry: its source side is the dst slot
+        uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;  // src-read entry: its destination side is the src slot
+
+        uint32_t i1, i2, i3;
+        pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
+        uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
+
+        // Left margin: the last lp0 elements of the staged src row. NOTE(review): assumes lp0 <= ne00 (and rp0 <= ne00 below) — confirm the caller rejects larger pads
+        if (lp0 > 0) {
+            uint8_t * dst_left       = dst_spad_cur;
+            const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size;
+            if ((uint32_t)lp0 < 32) {  // short runs (< 32 elements) use memcpy instead of the HVX copy
+                memcpy(dst_left, src_left, (size_t)lp0 * type_size);
+            } else {
+                hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0);
+            }
+        }
+
+        {  // payload: the full staged src row, placed right after the left margin
+            uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size;
+            if ((uintptr_t)dst_mid % VLEN == 0) {
+                hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00);
+            } else {
+                hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00);
+            }
+        }
+
+        if (rp0 > 0) {  // right margin: the first rp0 elements of the staged src row
+            uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size;
+            if ((uint32_t)rp0 < 32) {
+                memcpy(dst_right, src_spad_cur, (size_t)rp0 * type_size);
+            } else {
+                if ((uintptr_t)dst_right % VLEN == 0) {
+                    hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0);
+                } else {
+                    hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0);
+                }
+            }
+        }
+
+        dma_queue_push_vtcm_to_ddr(dma,
+            dma_make_ptr(dst_ptr, dst_spad_cur),
+            dst_row_size, dst_row_size_aligned, 1);
+
+        const uint32_t next_row = ir + 2;  // two rows ahead: reuses the src slot that was just consumed
+        if (next_row < row_end) {
+            uint32_t nri1, nri2, nri3;
+            pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3);
+            dma_queue_push_ddr_to_vtcm(dma,
+                dma_make_ptr(src_spad_cur,
+                             pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)),
+                src_row_size_aligned, src_row_size, 1);
+        }
+    }
+
+    dma_queue_flush(dma);  // presumably waits for the outstanding dst writes — confirm against hex-dma.h
+
+    t2 = HAP_perf_get_qtimer_count();
+
+    FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
+         ith, nth,
+         src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+         dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+         row_start, row_end,
+         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+}
+
+int op_pad(struct htp_ops_context * octx) {  // PAD op entry point; returns an HTP_STATUS_* code
+    const struct htp_tensor * src0 = octx->src[0];
+    const struct htp_tensor * dst  = octx->dst;
+
+    // Only F32 is supported; any other source type is logged and rejected.
+    size_t type_size;
+    switch (src0->type) {
+        case HTP_TYPE_F32: type_size = 4; break;
+        default:
+            FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type);
+            return HTP_STATUS_NO_SUPPORT;
+    }
+    // The type check above still runs when compute is skipped; nothing below does.
+    if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
+        return HTP_STATUS_OK;
+    }
+    // op_params[0..7] are (lp, rp) padding pairs for dims 0..3; op_params[8] selects the circular workers.
+    const int32_t lp0 = octx->op_params[0];
+    const int32_t rp0 = octx->op_params[1];
+    const int32_t lp1 = octx->op_params[2];
+    const int32_t rp1 = octx->op_params[3];
+    const int32_t lp2 = octx->op_params[4];
+    const int32_t rp2 = octx->op_params[5];
+    const int32_t lp3 = octx->op_params[6];
+    const int32_t rp3 = octx->op_params[7];
+    const int32_t circular = octx->op_params[8];
+
+    const uint32_t ne0  = dst->ne[0];  // elements per dst row
+    const uint32_t ne00 = src0->ne[0];  // elements per src row
+    // A "row" is one dim-0 line of dst; the thread count is capped at the row count (zero rows counts as one).
+    const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3];
+    const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1);
+
+    const size_t src_row_size         = (size_t)ne00 * type_size;
+    const size_t dst_row_size         = (size_t)ne0  * type_size;
+    const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN);  // presumably rounded up to a multiple of VLEN - confirm
+    const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
+
+    // Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread
+    const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned);
+    // DMA is used only if src dim-0 stride equals the element size, rows have >= 512 elements, and VTCM is present and large enough.
+    const int use_dma = (src0->nb[0] == (uint32_t)type_size) &&
+                        (ne00 >= 512) &&
+                        (octx->ctx->vtcm_base != NULL) &&
+                        (octx->ctx->vtcm_size >= vtcm_needed);
+
+    if (use_dma) {  // lay out the scratchpads from vtcm_base: all src buffers first, then all dst buffers
+        octx->src0_spad.size_per_thread = 2 * src_row_size_aligned;
+        octx->dst_spad.size_per_thread  = 2 * dst_row_size_aligned;
+        octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+        octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
+        octx->src0_spad.data = octx->ctx->vtcm_base;
+        octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
+    }
+
+    struct htp_pad_context pctx = {  // lives on this stack frame and is passed to the workers by address
+        .octx             = octx,
+        .lp0 = lp0, .rp0 = rp0,
+        .lp1 = lp1, .rp1 = rp1,
+        .lp2 = lp2, .rp2 = rp2,
+        .lp3 = lp3, .rp3 = rp3,
+        .nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads,  // ceil(total_dst_rows / n_threads)
+        .total_dst_rows   = total_dst_rows,
+        .type_size        = type_size,
+        .src_row_size         = src_row_size,
+        .src_row_size_aligned = src_row_size_aligned,
+        .dst_row_size         = dst_row_size,
+        .dst_row_size_aligned = dst_row_size_aligned,
+    };
+
+    FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n",
+         circular ? "-circ" : "",
+         use_dma   ? "-dma"  : "",
+         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+         dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
+         lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
+    // Pick one of four workers from (circular, use_dma) and hand it to the worker pool.
+    if      (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); }
+    else if (circular)            { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular,     &pctx, n_threads); }
+    else if (use_dma)             { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma,          &pctx, n_threads); }
+    else                          { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx,              &pctx, n_threads); }
+
+    return HTP_STATUS_OK;  // NOTE(review): any result from worker_pool_run_func is not inspected here
+}
+
diff --git a/cpp/ggml-hexagon/htp/unary-ops.c b/cpp/ggml-hexagon/htp/unary-ops.c
index e43bad90..079c545f 100644
--- a/cpp/ggml-hexagon/htp/unary-ops.c
+++ b/cpp/ggml-hexagon/htp/unary-ops.c
@@ -17,7 +17,6 @@
 #include "ggml-common.h"
 #include "htp-ctx.h"
 #include "htp-ops.h"
-#include "htp-ops.h"
 
 struct htp_unary_context {
     struct htp_ops_context * octx;
@@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src,
     }
 }
 
+static void tri_f32(const float * restrict src,  // input rows, row_size bytes apart
+                    float * restrict dst,  // output rows, same stride as src
+                    uint8_t * restrict spad,  // not used by this function
+                    const uint32_t num_rows,  // rows to process in this call
+                    const uint32_t row_elems,  // elements per row
+                    const size_t   row_size,  // byte stride between consecutive rows
+                    int32_t *      op_params,  // op_params[0] is the triangle type (0..3)
+                    const uint32_t ir,  // absolute index of the first row in this call
+                    const struct htp_unary_context * uctx) {  // read only for src[0]->ne[1]
+    // Per row, zero the columns on one side of a row-dependent boundary and copy the rest from src.
+    const int32_t ttype = op_params[0];
+    const HVX_Vector zero = hvx_vec_splat_f32(0.0f);
+    const uint32_t nvec  = row_elems / VLEN_FP32;  // whole vectors per row
+    const uint32_t nloe  = row_elems % VLEN_FP32;  // leftover elements after the whole vectors
+
+    const uint32_t ne01 = uctx->octx->src[0]->ne[1];  // dim-1 extent of src; the row index wraps at this value
+
+    for (uint32_t b = 0; b < num_rows; b++) {
+        const uint32_t abs_row = ir + b;
+        const uint32_t i01     = abs_row % ne01;  // row position within its dim-1 slice
+
+        const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size);
+        HVX_Vector * restrict v_dst       = (HVX_Vector *) ((uint8_t *) dst + b * row_size);
+
+        uint32_t boundary;  // columns [0, boundary) form the left part, [boundary, row_elems) the right part
+        int      keep_left;  // 1: keep the left part and zero the right; 0: the opposite
+        switch (ttype) {
+            case 0: boundary = i01;     keep_left = 0; break;  // keep col >= row
+            case 1: boundary = i01 + 1; keep_left = 0; break;  // keep col > row
+            case 2: boundary = i01 + 1; keep_left = 1; break;  // keep col <= row
+            case 3: boundary = i01;     keep_left = 1; break;  // keep col < row
+            default: boundary = 0; keep_left = 0; break;  // unknown type: the whole row is copied unchanged
+        }
+        if (boundary > row_elems) boundary = row_elems;  // clamp so the boundary never lies past the row end
+
+        // Full HVX vectors — each starts at a 128-byte aligned offset; copy, zero, or blend per vector
+        for (uint32_t i = 0; i < nvec; i++) {
+            const uint32_t vec_start = i * VLEN_FP32;
+            const uint32_t vec_end   = vec_start + VLEN_FP32;
+            if (keep_left) {
+                if (vec_end <= boundary) {
+                    v_dst[i] = v_src[i];
+                } else if (vec_start >= boundary) {
+                    v_dst[i] = zero;
+                } else {
+                    HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));  // byte count of the left part inside this vector
+                    v_dst[i]            = Q6_V_vmux_QVV(mask, v_src[i], zero);
+                }
+            } else {
+                if (vec_end <= boundary) {
+                    v_dst[i] = zero;
+                } else if (vec_start >= boundary) {
+                    v_dst[i] = v_src[i];
+                } else {
+                    HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
+                    v_dst[i]            = Q6_V_vmux_QVV(mask, zero, v_src[i]);  // same mask, operands swapped: zero left, keep right
+                }
+            }
+        }
+
+        // Tail elements (row_elems not a multiple of VLEN_FP32): a whole vector is read from src, nloe floats' worth is passed to the store
+        if (nloe > 0) {
+            const uint32_t vec_start = nvec * VLEN_FP32;
+            const uint32_t vec_end   = vec_start + nloe;
+            HVX_Vector     tail_val;
+            if (keep_left) {
+                if (vec_end <= boundary) {
+                    tail_val = v_src[nvec];
+                } else if (vec_start >= boundary) {
+                    tail_val = zero;
+                } else {
+                    HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
+                    tail_val            = Q6_V_vmux_QVV(mask, v_src[nvec], zero);
+                }
+            } else {
+                if (vec_end <= boundary) {
+                    tail_val = zero;
+                } else if (vec_start >= boundary) {
+                    tail_val = v_src[nvec];
+                } else {
+                    HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
+                    tail_val            = Q6_V_vmux_QVV(mask, zero, v_src[nvec]);
+                }
+            }
+            hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val);  // presumably writes only the first nloe floats - confirm against the helper
+        }
+    }
+}
+
 static void softplus_f32(const float * restrict src,
                          float * restrict dst,
                          uint8_t * restrict spad,
@@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
             case HTP_OP_L2_NORM:
                 l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
+            case HTP_OP_TRI:
+                tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx);
+                break;
             default:
                 break;
         }
@@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
         case HTP_OP_L2_NORM:
             op_type = "l2norm-f32";
             break;
+        case HTP_OP_TRI:
+            op_type = "tri-f32";
+            break;
+
         default:
             FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
             return HTP_STATUS_NO_SUPPORT;
@@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     return err;
 }
 
+int op_tri(struct htp_ops_context * octx) {  // TRI op entry point; returns an HTP_STATUS_* code
+    int err = HTP_STATUS_OK;
+    // Dispatch on the source tensor type; only F32 has an implementation.
+    switch (octx->src[0]->type) {
+        case HTP_TYPE_F32:
+            err = execute_op_unary_f32(octx);  // same F32 executor that handles the other unary ops
+            break;
+
+        default:
+            err = HTP_STATUS_NO_SUPPORT;  // every other type is rejected without logging
+            break;
+    }
+
+    return err;
+}
+
 int op_unary(struct htp_ops_context * octx) {
     int err = HTP_STATUS_OK;
 
diff --git a/cpp/llama-context.cpp b/cpp/llama-context.cpp
index 539bfea4..09ecb6e4 100644
--- a/cpp/llama-context.cpp
+++ b/cpp/llama-context.cpp
@@ -64,8 +64,9 @@ llama_context::llama_context(
     cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
     cparams.yarn_beta_fast   = params.yarn_beta_fast   >= 0.0f ? params.yarn_beta_fast   : hparams.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow   >= 0.0f ? params.yarn_beta_slow   : hparams.yarn_beta_slow;
-    cparams.embeddings       = params.embeddings;
-    cparams.embeddings_pre_norm = false;
+    cparams.embeddings                  = params.embeddings;
+    cparams.embeddings_pre_norm         = false;
+    cparams.embeddings_pre_norm_masked  = false;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.no_perf          = params.no_perf;
     cparams.pooling_type     = params.pooling_type;
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index f9479f2d..5574a847 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -8,7 +8,7 @@ PODS:
   - hermes-engine (0.82.0):
     - hermes-engine/Pre-built (= 0.82.0)
   - hermes-engine/Pre-built (0.82.0)
-  - llama-rn (0.12.0):
+  - llama-rn (0.12.1):
     - boost
     - DoubleConversion
     - fast_float
@@ -3026,7 +3026,7 @@ SPEC CHECKSUMS:
   fmt: bf3b0f2427f5c78a3d39ac34a7dbe72faabf986d
   glog: 5683914934d5b6e4240e497e0f4a3b42d1854183
   hermes-engine: 8642d8f14a548ab718ec112e9bebdfdd154138b5
-  llama-rn: c2fcbfb8b8ab124f1a2eda0036ac18d9ac19ac64
+  llama-rn: 088e2777302f9e857c465baad78fafece3787064
   RCT-Folly: 846fda9475e61ec7bcbf8a3fe81edfcaeb090669
   RCTDeprecation: 22bf66112da540a7d40e536366ddd8557934fca1
   RCTRequired: a0ed4dc41b35f79fbb6d8ba320e06882a8c792cf
@@ -3097,7 +3097,7 @@ SPEC CHECKSUMS:
   ReactAppDependencyProvider: c5c4f5280e4ae0f9f4a739c64c4260fe0b3edaf1
   ReactCodegen: 374f1c9242fbdd673b460d358b33860c0cc9d926
   ReactCommon: 25c7f94aee74ddd93a8287756a8ac0830a309544
-  RNAudioAPI: 8f309254a527a858541a692c2ef2db606ad44c14
+  RNAudioAPI: 8a9d346fac228321993ac9016b6a6c70fa9cf9fb
   RNCAsyncStorage: 29f0230e1a25f36c20b05f65e2eb8958d6526e82
   RNCClipboard: f538e2ba34c187a6597c2f17c4faa4e1cafae97c
   RNGestureHandler: f1dd7f92a0faa2868a919ab53bb9d66eb4ebfcf5
@@ -3107,4 +3107,4 @@ SPEC CHECKSUMS:
 
 PODFILE CHECKSUM: f32e4f0da8b7e7c7c4fcb98f38febf7145eee1d9
 
-COCOAPODS: 1.15.2
+COCOAPODS: 1.16.2
diff --git a/src/version.ts b/src/version.ts
index f6292555..41ec1cec 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1,2 +1,2 @@
-export const BUILD_NUMBER = '9204'
-export const BUILD_COMMIT = '726704a'
+export const BUILD_NUMBER = '9222'
+export const BUILD_COMMIT = '9a532ae'
diff --git a/third_party/llama.cpp b/third_party/llama.cpp
index 726704a1..9a532ae4 160000
--- a/third_party/llama.cpp
+++ b/third_party/llama.cpp
@@ -1 +1 @@
-Subproject commit 726704a160c7d86acff1ff11e04b8316cf69d951
+Subproject commit 9a532ae4bab1b164052ce60a738f78538b421c66