feat: Improve llama.cpp argument handling and add device parsing tests (#6041)

qnixsynapse · web-flow · commit 5e533bdedc72 · 2025-08-04T19:47:04.000+05:30
* feat: Improve llama.cpp argument handling and add device parsing tests

This commit refactors how arguments are passed to llama.cpp,
specifically by only adding arguments when their values differ from
their defaults. This reduces the verbosity of the command and prevents
potential conflicts or errors when llama.cpp's default behavior aligns
with the desired setting.

Additionally, new tests have been added for parsing device output from
llama.cpp, ensuring the accurate extraction of GPU information (ID,
name, total memory, and free memory). This improves the robustness of
device detection.

The following changes were made:

* **Remove redundant `--ctx-size` argument:** The separate `-c` push
    (emitted whenever `cfg.ctx_size` was defined) is removed;
    `--ctx-size` is now only added if `cfg.ctx_size` is greater than 0.
* **Conditional argument adding for default values:**
    * `--split-mode` is only added if `cfg.split_mode` is not empty
        and not 'layer'.
    * `--main-gpu` is only added if `cfg.main_gpu` is not undefined
        and not 0.
    * `--cache-type-k` is only added if `cfg.cache_type_k` is not 'f16'.
    * `--cache-type-v` is only added when `flash_attn` is enabled and
        `cfg.cache_type_v` is neither 'f16' nor 'f32'. This also
        corrects the `flash_attn` condition.
    * `--defrag-thold` is only added if `cfg.defrag_thold` is not 0.1.
    * `--rope-scaling` is only added if `cfg.rope_scaling` is not
        'none'.
    * `--rope-scale` is only added if `cfg.rope_scale` is not 1.
    * `--rope-freq-base` is only added if `cfg.rope_freq_base` is not 0.
    * `--rope-freq-scale` is only added if `cfg.rope_freq_scale` is
        not 1.
* **Add `parse_device_output` tests:** Comprehensive unit tests were
    added to `src-tauri/src/core/utils/extensions/inference_llamacpp_extension/server.rs`
    to validate the parsing of llama.cpp device output under various
    scenarios, including multiple devices, single devices, different
    backends (CUDA, Vulkan, SYCL), complex GPU names, and error
    conditions.

* fixup cache_type_v comparison
diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
@@ -1247,11 +1247,6 @@ export default class llamacpp_extension extends AIEngine {
       ])
       args.push('--mmproj', mmprojPath)
     }
-
-    if (cfg.ctx_size !== undefined) {
-      args.push('-c', String(cfg.ctx_size))
-    }
-
     // Add remaining options from the interface
     if (cfg.chat_template) args.push('--chat-template', cfg.chat_template)
     const gpu_layers =
@@ -1263,8 +1258,9 @@ export default class llamacpp_extension extends AIEngine {
     if (cfg.batch_size > 0) args.push('--batch-size', String(cfg.batch_size))
     if (cfg.ubatch_size > 0) args.push('--ubatch-size', String(cfg.ubatch_size))
     if (cfg.device.length > 0) args.push('--device', cfg.device)
-    if (cfg.split_mode.length > 0) args.push('--split-mode', cfg.split_mode)
-    if (cfg.main_gpu !== undefined)
+    if (cfg.split_mode.length > 0 && cfg.split_mode != 'layer')
+      args.push('--split-mode', cfg.split_mode)
+    if (cfg.main_gpu !== undefined && cfg.main_gpu != 0)
       args.push('--main-gpu', String(cfg.main_gpu))
 
     // Boolean flags
@@ -1280,19 +1276,25 @@ export default class llamacpp_extension extends AIEngine {
     } else {
       if (cfg.ctx_size > 0) args.push('--ctx-size', String(cfg.ctx_size))
       if (cfg.n_predict > 0) args.push('--n-predict', String(cfg.n_predict))
-      args.push('--cache-type-k', cfg.cache_type_k)
+      if (cfg.cache_type_k && cfg.cache_type_k != 'f16')
+        args.push('--cache-type-k', cfg.cache_type_k)
       if (
-        (cfg.flash_attn && cfg.cache_type_v != 'f16') ||
-        cfg.cache_type_v != 'f32'
+        cfg.flash_attn &&
+        (cfg.cache_type_v != 'f16' && cfg.cache_type_v != 'f32')
       ) {
         args.push('--cache-type-v', cfg.cache_type_v)
       }
-      args.push('--defrag-thold', String(cfg.defrag_thold))
-
-      args.push('--rope-scaling', cfg.rope_scaling)
-      args.push('--rope-scale', String(cfg.rope_scale))
-      args.push('--rope-freq-base', String(cfg.rope_freq_base))
-      args.push('--rope-freq-scale', String(cfg.rope_freq_scale))
+      if (cfg.defrag_thold && cfg.defrag_thold != 0.1)
+        args.push('--defrag-thold', String(cfg.defrag_thold))
+
+      if (cfg.rope_scaling && cfg.rope_scaling != 'none')
+        args.push('--rope-scaling', cfg.rope_scaling)
+      if (cfg.rope_scale && cfg.rope_scale != 1)
+        args.push('--rope-scale', String(cfg.rope_scale))
+      if (cfg.rope_freq_base && cfg.rope_freq_base != 0)
+        args.push('--rope-freq-base', String(cfg.rope_freq_base))
+      if (cfg.rope_freq_scale && cfg.rope_freq_scale != 1)
+        args.push('--rope-freq-scale', String(cfg.rope_freq_scale))
     }
 
     logger.info('Calling Tauri command llama_load with args:', args)
diff --git a/src-tauri/src/core/utils/extensions/inference_llamacpp_extension/server.rs b/src-tauri/src/core/utils/extensions/inference_llamacpp_extension/server.rs
@@ -708,3 +708,195 @@ pub async fn is_process_running(pid: i32, state: State<'_, AppState>) -> Result<
 pub fn is_port_available(port: u16) -> bool {
     std::net::TcpListener::bind(("127.0.0.1", port)).is_ok()
 }
+
+// tests
+//
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_multiple_devices() {
+        let output = r#"ggml_vulkan: Found 2 Vulkan devices:
+ggml_vulkan: 0 = NVIDIA GeForce RTX 3090 (NVIDIA) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 49152 | int dot: 0 | matrix cores: KHR_coopmat
+ggml_vulkan: 1 = AMD Radeon Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 0 | matrix cores: KHR_coopmat
+Available devices:
+Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)
+Vulkan1: AMD Radeon Graphics (RADV GFX1151) (87722 MiB, 87722 MiB free)
+"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        assert_eq!(devices.len(), 2);
+
+        // Check first device
+        assert_eq!(devices[0].id, "Vulkan0");
+        assert_eq!(devices[0].name, "NVIDIA GeForce RTX 3090");
+        assert_eq!(devices[0].mem, 24576);
+        assert_eq!(devices[0].free, 24576);
+
+        // Check second device
+        assert_eq!(devices[1].id, "Vulkan1");
+        assert_eq!(devices[1].name, "AMD Radeon Graphics (RADV GFX1151)");
+        assert_eq!(devices[1].mem, 87722);
+        assert_eq!(devices[1].free, 87722);
+    }
+
+    #[test]
+    fn test_parse_single_device() {
+        let output = r#"Available devices:
+CUDA0: NVIDIA GeForce RTX 4090 (24576 MiB, 24000 MiB free)"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        assert_eq!(devices.len(), 1);
+        assert_eq!(devices[0].id, "CUDA0");
+        assert_eq!(devices[0].name, "NVIDIA GeForce RTX 4090");
+        assert_eq!(devices[0].mem, 24576);
+        assert_eq!(devices[0].free, 24000);
+    }
+
+    #[test]
+    fn test_parse_with_extra_whitespace_and_empty_lines() {
+        let output = r#"
+Available devices:
+
+Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)
+
+Vulkan1: AMD Radeon Graphics (RADV GFX1151) (87722 MiB, 87722 MiB free)
+
+"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        assert_eq!(devices.len(), 2);
+        assert_eq!(devices[0].id, "Vulkan0");
+        assert_eq!(devices[1].id, "Vulkan1");
+    }
+
+    #[test]
+    fn test_parse_different_backends() {
+        let output = r#"Available devices:
+CUDA0: NVIDIA GeForce RTX 4090 (24576 MiB, 24000 MiB free)
+Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)
+SYCL0: Intel(R) Arc(TM) A750 Graphics (8000 MiB, 7721 MiB free)"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        assert_eq!(devices.len(), 3);
+
+        assert_eq!(devices[0].id, "CUDA0");
+        assert_eq!(devices[0].name, "NVIDIA GeForce RTX 4090");
+
+        assert_eq!(devices[1].id, "Vulkan0");
+        assert_eq!(devices[1].name, "NVIDIA GeForce RTX 3090");
+
+        assert_eq!(devices[2].id, "SYCL0");
+        assert_eq!(devices[2].name, "Intel(R) Arc(TM) A750 Graphics");
+        assert_eq!(devices[2].mem, 8000);
+        assert_eq!(devices[2].free, 7721);
+    }
+
+    #[test]
+    fn test_parse_complex_gpu_names() {
+        let output = r#"Available devices:
+Vulkan0: Intel(R) Arc(tm) A750 Graphics (DG2) (8128 MiB, 8128 MiB free)
+Vulkan1: AMD Radeon RX 7900 XTX (Navi 31) [RDNA 3] (24576 MiB, 24000 MiB free)"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        assert_eq!(devices.len(), 2);
+
+        assert_eq!(devices[0].id, "Vulkan0");
+        assert_eq!(devices[0].name, "Intel(R) Arc(tm) A750 Graphics (DG2)");
+        assert_eq!(devices[0].mem, 8128);
+        assert_eq!(devices[0].free, 8128);
+
+        assert_eq!(devices[1].id, "Vulkan1");
+        assert_eq!(devices[1].name, "AMD Radeon RX 7900 XTX (Navi 31) [RDNA 3]");
+        assert_eq!(devices[1].mem, 24576);
+        assert_eq!(devices[1].free, 24000);
+    }
+
+    #[test]
+    fn test_parse_no_devices() {
+        let output = r#"Available devices:"#;
+
+        let devices = parse_device_output(output).unwrap();
+        assert_eq!(devices.len(), 0);
+    }
+
+    #[test]
+    fn test_parse_missing_header() {
+        let output = r#"Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)"#;
+
+        let result = parse_device_output(output);
+        assert!(result.is_err());
+        assert!(result
+            .unwrap_err()
+            .to_string()
+            .contains("Could not find 'Available devices:' section"));
+    }
+
+    #[test]
+    fn test_parse_malformed_device_line() {
+        let output = r#"Available devices:
+Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)
+Invalid line without colon
+Vulkan1: AMD Radeon Graphics (RADV GFX1151) (87722 MiB, 87722 MiB free)"#;
+
+        let devices = parse_device_output(output).unwrap();
+
+        // Should skip the malformed line and parse the valid ones
+        assert_eq!(devices.len(), 2);
+        assert_eq!(devices[0].id, "Vulkan0");
+        assert_eq!(devices[1].id, "Vulkan1");
+    }
+
+    #[test]
+    fn test_parse_device_line_individual() {
+        // Test the individual line parser
+        let line = "Vulkan0: NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)";
+        let device = parse_device_line(line).unwrap().unwrap();
+
+        assert_eq!(device.id, "Vulkan0");
+        assert_eq!(device.name, "NVIDIA GeForce RTX 3090");
+        assert_eq!(device.mem, 24576);
+        assert_eq!(device.free, 24576);
+    }
+
+    #[test]
+    fn test_memory_pattern_detection() {
+        assert!(is_memory_pattern("24576 MiB, 24576 MiB free"));
+        assert!(is_memory_pattern("8000 MiB, 7721 MiB free"));
+        assert!(!is_memory_pattern("just some text"));
+        assert!(!is_memory_pattern("24576 MiB"));
+        assert!(!is_memory_pattern("24576, 24576"));
+    }
+
+    #[test]
+    fn test_parse_memory_value() {
+        assert_eq!(parse_memory_value("24576 MiB").unwrap(), 24576);
+        assert_eq!(parse_memory_value("7721 MiB free").unwrap(), 7721);
+        assert_eq!(parse_memory_value("8000").unwrap(), 8000);
+
+        assert!(parse_memory_value("").is_err());
+        assert!(parse_memory_value("not_a_number MiB").is_err());
+    }
+
+    #[test]
+    fn test_find_memory_pattern() {
+        let text = "NVIDIA GeForce RTX 3090 (24576 MiB, 24576 MiB free)";
+        let result = find_memory_pattern(text);
+        assert!(result.is_some());
+        let (_start, content) = result.unwrap();
+        assert_eq!(content, "24576 MiB, 24576 MiB free");
+
+        // Test with multiple parentheses
+        let text = "Intel(R) Arc(tm) A750 Graphics (DG2) (8128 MiB, 8128 MiB free)";
+        let result = find_memory_pattern(text);
+        assert!(result.is_some());
+        let (_start, content) = result.unwrap();
+        assert_eq!(content, "8128 MiB, 8128 MiB free");
+    }
+}