Skip to content
Merged
642 changes: 166 additions & 476 deletions ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c

Large diffs are not rendered by default.

13 changes: 6 additions & 7 deletions ggml/src/ggml-hexagon/htp/hmx-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,27 @@ typedef struct {
size_t src1_nb3;
size_t dst_nb2;
size_t dst_nb3;
} hmx_matmul_w16a32_batched_params_t;
} hmx_matmul_f16_f32_batched_params_t;

// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
// act_stride: activation row stride in elements (= k for contiguous, or
// nb[1]/sizeof(float) for permuted tensors like attention Q).
// weight_stride: weight row stride in elements (= k for compact weights, or
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx,
int hmx_matmul_f16_f32(struct htp_context *ctx,
float *restrict dst,
const float *activation,
const __fp16 *permuted_weight,
int m, int k, int n,
int act_stride,
int weight_stride);

// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32.
// Batched F16 wrapper over hmx_matmul_f16_f32.
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx,
const hmx_matmul_w16a32_batched_params_t *params);
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);

// HMX matrix multiplication — tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL)
int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
// HMX matrix multiplication — quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4)
int hmx_matmul_q_f32(struct htp_context *ctx,
float *restrict dst,
const float *activation,
const uint8_t *permuted_weight,
Expand Down
38 changes: 20 additions & 18 deletions ggml/src/ggml-hexagon/htp/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,35 +87,37 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
}
}

#if __HVX_ARCH__ >= 75
{
// Power on HMX
// Power on HMX and set HMX clock
HAP_power_request_t request;
memset(&request, 0, sizeof(HAP_power_request_t));
request.type = HAP_power_set_HMX;
request.hmx.power_up = TRUE;
FARF(ALWAYS, "Powering HMX on\n");
err = HAP_power_set((void *) &ctx, &request);
request.type = HAP_power_set_HMX_v2;
request.hmx_v2.set_power = TRUE;
request.hmx_v2.power_up = TRUE;
request.hmx_v2.set_clock = TRUE;
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
FARF(ALWAYS, "Setting HMX clock\n");
err = HAP_power_set((void *) ctx, &request);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Error powering on HMX.");
FARF(ERROR, "Error setting HMX clock.");
return err;
}
}

#if __HVX_ARCH__ >= 75
#else
{
// Set HMX clock
// Power on HMX
HAP_power_request_t request;
memset(&request, 0, sizeof(HAP_power_request_t));
request.type = HAP_power_set_HMX_v2;
request.hmx_v2.set_clock = TRUE;
request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX;
request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH;
FARF(ALWAYS, "Setting HMX clock\n");
err = HAP_power_set((void *) &ctx, &request);
request.type = HAP_power_set_HMX;
request.hmx.power_up = TRUE;
FARF(ALWAYS, "Powering HMX on\n");
err = HAP_power_set((void *) ctx, &request);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Error setting HMX clock.");
FARF(ERROR, "Error powering on HMX.");
return err;
}
}
Expand Down
10 changes: 4 additions & 6 deletions ggml/src/ggml-hexagon/htp/matmul-ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -2995,7 +2995,6 @@ int op_matmul(struct htp_ops_context * octx) {
// is handled by HMX itself; when M < 32 fall back to HVX.
const int m_total = (int) src1->ne[1];
const int m_hmx = m_total & ~31; // 0 when M < 32

if (m_hmx == 0) {
return op_matmul_hvx(octx);
}
Expand All @@ -3020,7 +3019,7 @@ int op_matmul(struct htp_ops_context * octx) {

if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {
hmx_matmul_f16_f32_batched_params_t batch_params = {
.dst = (float *) dst->data,
.activation = (float *) src1->data,
.permuted_weight = (const __fp16 *) src0->data,
Expand All @@ -3041,15 +3040,14 @@ int op_matmul(struct htp_ops_context * octx) {
.dst_nb2 = dst->nb[2],
.dst_nb3 = dst->nb[3],
};
ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params);
} else {
ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
ret = hmx_matmul_f16_f32(octx->ctx,
(float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
m_total, k, n, act_stride, wgt_stride);
}
} else {
ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
(float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
m_total, k, n, (int) src0->type);
}

Expand Down
2 changes: 1 addition & 1 deletion scripts/snapdragon/adb/run-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@ adb $adbserial $adbhost shell " \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$ndev $nhvx $opmask $verbose $profile $hb ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ubatch-size 256 -fa 1 -ngl 99 $cli_opts $@ \
--ubatch-size 1024 -fa 1 -ngl 99 $cli_opts $@ \
"
2 changes: 1 addition & 1 deletion scripts/snapdragon/adb/run-cli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,6 @@ adb $adbserial $adbhost shell " \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 256 -fa on \
--ctx-size 8192 --ubatch-size 1024 -fa on \
-ngl 99 --device $device $cli_opts $@ \
"
4 changes: 2 additions & 2 deletions scripts/snapdragon/adb/run-completion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,6 @@ adb $adbserial $adbhost shell " \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 256 -fa on \
-ngl 99 --device $device $cli_opts $@ \
--ctx-size 8192 --ubatch-size 1024 -fa on \
-ngl 99 --device $device $cli_opts $@ \
"
2 changes: 1 addition & 1 deletion scripts/snapdragon/adb/run-mtmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ adb $adbserial $adbhost shell " \
--mmproj $basedir/../gguf/$mmproj \
--image $basedir/../gguf/$image \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 256 -fa on \
--ctx-size 8192 --ubatch-size 1024 -fa on \
-ngl 99 --device $device -v $cli_opts $@ \
"
2 changes: 1 addition & 1 deletion scripts/snapdragon/windows/run-bench.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\llama-bench.exe" `
--mmap 0 -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--batch-size 128 -ngl 99 --device $device $cli_opts
--ubatch-size 1024 -ngl 99 --device $device $cli_opts
2 changes: 1 addition & 1 deletion scripts/snapdragon/windows/run-cli.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\llama-cli.exe" `
--no-mmap -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --ubatch-size 256 -fa on `
--ctx-size 8192 --ubatch-size 1024 -fa on `
-ngl 99 --device $device $cli_opts
2 changes: 1 addition & 1 deletion scripts/snapdragon/windows/run-completion.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\llama-completion.exe" `
--no-mmap -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --batch-size 256 -fa on `
--ctx-size 8192 --ubatch-size 1024 -fa on `
-ngl 99 -no-cnv --device $device $cli_opts
2 changes: 1 addition & 1 deletion scripts/snapdragon/windows/run-mtmd.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib"
--mmproj $basedir\..\..\gguf\$mmproj `
--image $basedir\..\..\gguf\$image `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --ubatch-size 256 -fa on `
--ctx-size 8192 --ubatch-size 1024 -fa on `
-ngl 99 --device $device -v $cli_opts
Loading