mybigday · jhen0409 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/cpp/common/build-info.cpp b/cpp/common/build-info.cpp
@@ -3,8 +3,8 @@
 #include <cstdio>
 #include <string>
 
-int LLAMA_BUILD_NUMBER = 9204;
-char const * LLAMA_COMMIT = "726704a";
+int LLAMA_BUILD_NUMBER = 9222;
+char const * LLAMA_COMMIT = "9a532ae";
 char const * LLAMA_COMPILER = "unknown";
 char const * LLAMA_BUILD_TARGET = "unknown";
 

diff --git a/cpp/ggml-ext.h b/cpp/ggml-ext.h
@@ -12,45 +12,45 @@
 // Meta backend
 //
 
-#define LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES 16
+#define LM_GGML_BACKEND_META_MAX_DEVICES 16
 
-enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis {
+enum lm_ggml_backend_meta_split_axis {
     // tensor split by tensor dimensions:
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_0   =  0,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_1   =  1,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_2   =  2,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_3   =  3,
+    LM_GGML_BACKEND_SPLIT_AXIS_0   =  0,
+    LM_GGML_BACKEND_SPLIT_AXIS_1   =  1,
+    LM_GGML_BACKEND_SPLIT_AXIS_2   =  2,
+    LM_GGML_BACKEND_SPLIT_AXIS_3   =  3,
 
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+    LM_GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+    LM_GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
 
     // for internal bookkeeping only:
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
-    LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
+    LM_GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
+    LM_GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
 };
-LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API const char * lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis_name(enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis split_axis);
+LM_GGML_API const char * lm_ggml_backend_meta_split_axis_name(enum lm_ggml_backend_meta_split_axis split_axis);
 
-struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state {
-    enum lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_axis axis;
+struct lm_ggml_backend_meta_split_state {
+    enum lm_ggml_backend_meta_split_axis axis;
 
-    // for tensors with axis >= 0 && axis < LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_MAX_DIMS:
+    // for tensors with axis >= 0 && axis < LM_GGML_MAX_DIMS:
     //   - each device has a slice of the tensor along the split axis
     //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
     //   - some tensors have an inhomogenenous data layout along the split axis,
     //     those tensors are divided into segments which are each individually split across devices
-    //   - ne has one entry per segment and device that add up to lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor::ne for that axis,
+    //   - ne has one entry per segment and device that add up to lm_ggml_tensor::ne for that axis,
     //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
     //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
     //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
-    int64_t  ne[16*LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_BACKEND_META_MAX_DEVICES];
+    int64_t  ne[16*LM_GGML_BACKEND_META_MAX_DEVICES];
     uint32_t n_segments;
 };
 
 // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
-typedef struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_split_state(*lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t)(const struct lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor * tensor, void * userdata);
+typedef struct lm_ggml_backend_meta_split_state(*lm_ggml_backend_meta_get_split_state_t)(const struct lm_ggml_tensor * tensor, void * userdata);
 
 // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
 // TODO: this looks a bit strange - a backend API creates a device. I think we should try
 //       express this as a backend registry functionality instead
-LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_LM_GGML_API lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_device(
-    lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_dev_t * devs, size_t n_devs, lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+LM_GGML_API lm_ggml_backend_dev_t lm_ggml_backend_meta_device(
+    lm_ggml_backend_dev_t * devs, size_t n_devs, lm_ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
diff --git a/cpp/ggml-hexagon/ggml-hexagon.cpp b/cpp/ggml-hexagon/ggml-hexagon.cpp
@@ -2744,6 +2744,18 @@ static bool lm_ggml_hexagon_supported_ssm_conv(const struct lm_ggml_hexagon_sess
     return true;
 }
 
+// Reports whether a PAD op is accepted: both op->src[0] and op itself must be F32.
+static bool lm_ggml_hexagon_supported_pad(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
+    // The session is not consulted for this op.
+    LM_GGML_UNUSED(sess);
+
+    const struct lm_ggml_tensor * input  = op->src[0];
+    const struct lm_ggml_tensor * output = op;
+
+    const bool all_f32 = input->type == LM_GGML_TYPE_F32 && output->type == LM_GGML_TYPE_F32;
+    return all_f32;
+}
+
 static bool lm_ggml_hexagon_supported_cumsum(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
     const struct lm_ggml_tensor * src0 = op->src[0];
     const struct lm_ggml_tensor * dst  = op;
@@ -2816,6 +2828,21 @@ static bool lm_ggml_hexagon_supported_solve_tri(const struct lm_ggml_hexagon_ses
     return true;
 }
 
+// Reports whether a TRI op is accepted: op->src[0] and op must both be F32, pass
+// lm_ggml_are_same_shape() against each other, and each pass lm_ggml_is_contiguous().
+static bool lm_ggml_hexagon_supported_tri(const struct lm_ggml_hexagon_session * sess, const struct lm_ggml_tensor * op) {
+    LM_GGML_UNUSED(sess); // session is not consulted; placed before any return so it is reachable
+
+    const struct lm_ggml_tensor * src0 = op->src[0];
+    const struct lm_ggml_tensor * dst  = op;
+
+    if (src0->type != LM_GGML_TYPE_F32) { return false; }
+    if (dst->type  != LM_GGML_TYPE_F32) { return false; }
+    if (!lm_ggml_are_same_shape(src0, dst)) { return false; }
+    if (!lm_ggml_is_contiguous(src0) || !lm_ggml_is_contiguous(dst)) { return false; }
+
+    return true;
+}
+
 static const char * lm_ggml_backend_hexagon_name(lm_ggml_backend_t backend) {
     auto sess = static_cast<lm_ggml_hexagon_session *>(backend->context);
     return sess->c_name();
@@ -2857,6 +2884,9 @@ static htp_op_code op_remap_to_htp(const lm_ggml_tensor * t) {
         case LM_GGML_OP_FILL:            return HTP_OP_FILL;
         case LM_GGML_OP_DIAG:            return HTP_OP_DIAG;
         case LM_GGML_OP_SOLVE_TRI:       return HTP_OP_SOLVE_TRI;
+        case LM_GGML_OP_TRI:             return HTP_OP_TRI;
+        case LM_GGML_OP_PAD:             return HTP_OP_PAD;
+
         case LM_GGML_OP_UNARY:
             switch (lm_ggml_get_unary_op(t)) {
                 case LM_GGML_UNARY_OP_SILU:     return HTP_OP_UNARY_SILU;
@@ -3416,6 +3446,14 @@ static bool lm_ggml_backend_hexagon_device_supports_op(lm_ggml_backend_dev_t dev
             supp = lm_ggml_hexagon_supported_solve_tri(sess, op);
             break;
 
+        case LM_GGML_OP_TRI:
+            supp = lm_ggml_hexagon_supported_tri(sess, op);
+            break;
+
+        case LM_GGML_OP_PAD:
+            supp = lm_ggml_hexagon_supported_pad(sess, op);
+            break;
+
         default:
             break;
     }

diff --git a/cpp/ggml-hexagon/htp/CMakeLists.txt b/cpp/ggml-hexagon/htp/CMakeLists.txt
@@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED
     diag-ops.c
     solve-tri-ops.c
     gated-delta-net-ops.c
+    pad-ops.c
 )
 
 target_compile_definitions(${HTP_LIB} PRIVATE

diff --git a/cpp/ggml-hexagon/htp/htp-ctx.h b/cpp/ggml-hexagon/htp/htp-ctx.h
@@ -107,5 +107,7 @@ int op_fill(struct htp_ops_context * octx);
 int op_diag(struct htp_ops_context * octx);
 int op_solve_tri(struct htp_ops_context * octx);
 int op_gated_delta_net(struct htp_ops_context * octx);
+int op_tri(struct htp_ops_context * octx);
+int op_pad(struct htp_ops_context * octx);
 
 #endif /* HTP_CTX_H */
diff --git a/cpp/ggml-hexagon/htp/htp-msg.h b/cpp/ggml-hexagon/htp/htp-msg.h
@@ -28,7 +28,7 @@ enum htp_status {
     HTP_STATUS_VTCM_TOO_SMALL = 5,
 };
 
-// The values must match the lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_type.
+// The values must match the lm_ggml_type.
 // Duplicated here because we can't include full ggml.h in the htp build.
 // We have some static_asserts in the cpp code to ensure things are in sync.
 enum htp_data_type {
@@ -130,7 +130,7 @@ struct htp_tensor {
     uint32_t data;                // Buffer offset in the messages, and data pointer on the NSP
     uint32_t type;                // Data type
     uint32_t ne[HTP_MAX_DIMS];    // Number of elements
-    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_lm_ggml_tensor)
+    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h lm_ggml_tensor)
 };
 
 #define HTP_MAX_OP_PARAMS 64

diff --git a/cpp/ggml-hexagon/htp/htp-ops.h b/cpp/ggml-hexagon/htp/htp-ops.h
@@ -86,6 +86,8 @@ enum htp_op_code {
     HTP_OP_SOLVE_TRI,
     HTP_OP_L2_NORM,
     HTP_OP_GATED_DELTA_NET,
+    HTP_OP_TRI,
+    HTP_OP_PAD,
 
     HTP_OP_INVALID
 };

diff --git a/cpp/ggml-hexagon/htp/main.c b/cpp/ggml-hexagon/htp/main.c
@@ -595,9 +595,15 @@ static int execute_op(struct htp_ops_context * octx) {
         case HTP_OP_SOLVE_TRI:
             return op_solve_tri(octx);
 
+        case HTP_OP_PAD:
+            return op_pad(octx);
+
         case HTP_OP_GATED_DELTA_NET:
             return op_gated_delta_net(octx);
 
+        case HTP_OP_TRI:
+            return op_tri(octx);
+
         case HTP_OP_INVALID:
             break;