Skip to content

Commit d494bc1

Browse files
committed
fix(cli): expand model capability detection to include Llama, Nemotron, and Mistral models
The isModelCapable function was showing false warnings for Llama, Nemotron, and Mistral models, claiming they had "limited reasoning and tool calling capabilities" when they actually have excellent capabilities.

**Changes:**
- Added /llama/, /nemotron/, /mistral/ patterns to capability detection regex
- Updated tests to reflect that these model families ARE capable
- All tests passing (26/26)

**Research validation:**
- Llama 3.3/Nemotron: #1 on alignment benchmarks, Arena Hard 85.0
- Mistral: 81.2% MMLU, supports function calling and JSON mode
- Both families widely used for agent workflows with proven tool calling

**Impact:**
- Removes false warnings for users of these popular model families
- Enables proper multiEdit tool usage for capable models
- Aligns detection with real-world model capabilities

Tested with nvidia/Llama-3_3-Nemotron-Super-49B-v1 on MITRE AIP endpoints.

Authored by: Aaron Lippold <[email protected]>
1 parent 6b7111c commit d494bc1

File tree

2 files changed

+52
-19
lines changed

2 files changed

+52
-19
lines changed

extensions/cli/src/utils/modelCapability.test.ts

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -53,26 +53,46 @@ describe("isModelCapable", () => {
5353
});
5454

5555
describe("Local/Ollama models", () => {
56-
test("should consider larger models as not capable", () => {
57-
expect(isModelCapable("ollama", "llama2-70b")).toBe(false);
58-
expect(isModelCapable("local", "codellama-34b")).toBe(false);
56+
test("should consider Llama models as capable (matches llama pattern)", () => {
57+
expect(isModelCapable("ollama", "llama2-70b")).toBe(true);
58+
expect(isModelCapable("ollama", "llama2-7b")).toBe(true);
59+
expect(isModelCapable("local", "codellama-34b")).toBe(true);
5960
});
6061

61-
test("should consider smaller models as less capable", () => {
62-
expect(isModelCapable("ollama", "llama2-7b")).toBe(false);
63-
expect(isModelCapable("local", "mistral-7b")).toBe(false);
62+
test("should consider Mistral models as capable (matches mistral pattern)", () => {
63+
expect(isModelCapable("local", "mistral-7b")).toBe(true);
64+
expect(isModelCapable("ollama", "mistral-small")).toBe(true);
65+
});
66+
67+
test("should consider non-capable local models as not capable", () => {
68+
expect(isModelCapable("ollama", "falcon-7b")).toBe(false);
69+
expect(isModelCapable("local", "starcoder-base")).toBe(false);
6470
});
6571
});
6672

6773
describe("Meta/Llama models", () => {
68-
test("should consider large Llama models as not capable", () => {
69-
expect(isModelCapable("llama", "llama-2-70b")).toBe(false);
70-
expect(isModelCapable("meta", "llama-65b")).toBe(false);
74+
test("should consider Llama models as capable", () => {
75+
expect(isModelCapable("openai", "Llama 3.3 70B")).toBe(true);
76+
expect(isModelCapable("openai", "Llama 3.3 Nemotron 49B")).toBe(true);
77+
expect(
78+
isModelCapable(
79+
"nvidia",
80+
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
81+
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
82+
),
83+
).toBe(true);
84+
expect(isModelCapable("meta", "llama-3.1-70b")).toBe(true);
85+
});
86+
87+
test("should consider Nemotron models as capable", () => {
88+
expect(isModelCapable("nvidia", "Llama 3.3 Nemotron 49B")).toBe(true);
89+
expect(isModelCapable("nvidia", "nemotron-4-340b")).toBe(true);
7190
});
7291

73-
test("should consider small Llama models as less capable", () => {
74-
expect(isModelCapable("llama", "llama-2-7b")).toBe(false);
75-
expect(isModelCapable("meta", "llama-13b")).toBe(false);
92+
test("should consider Mistral models as capable", () => {
93+
expect(isModelCapable("mistral", "Mistral Small 24B")).toBe(true);
94+
expect(isModelCapable("mistral", "mistral-large")).toBe(true);
95+
expect(isModelCapable("mistralai", "mistral-small-24b")).toBe(true);
7696
});
7797
});
7898

@@ -83,12 +103,13 @@ describe("isModelCapable", () => {
83103
});
84104

85105
describe("Hugging Face models", () => {
86-
test("should consider code-specific models as not capable", () => {
87-
expect(isModelCapable("huggingface", "codellama-instruct")).toBe(false);
88-
expect(isModelCapable("huggingface", "starcoder-base")).toBe(false);
106+
test("should consider Llama/Mistral models as capable even on HuggingFace", () => {
107+
expect(isModelCapable("huggingface", "codellama-instruct")).toBe(true);
108+
expect(isModelCapable("huggingface", "mistral-7b-instruct")).toBe(true);
89109
});
90110

91-
test("should consider general chat models as less capable", () => {
111+
test("should consider non-capable models as not capable", () => {
112+
expect(isModelCapable("huggingface", "starcoder-base")).toBe(false);
92113
expect(isModelCapable("huggingface", "falcon-7b")).toBe(false);
93114
});
94115
});
@@ -145,9 +166,11 @@ describe("isModelCapable", () => {
145166
});
146167

147168
test("should consider models not capable when neither name nor model match", () => {
148-
// Case where neither matches
149-
expect(isModelCapable("custom", "llama-7b", "local-model")).toBe(false);
169+
// Case where neither matches capable patterns
150170
expect(isModelCapable("custom", "falcon-7b", "random-model")).toBe(false);
171+
expect(isModelCapable("custom", "unknown-model", "local-model")).toBe(
172+
false,
173+
);
151174
});
152175
});
153176
});

extensions/cli/src/utils/modelCapability.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,17 @@ export function isModelCapable(
1010
const normalizedName = name.toLowerCase();
1111
const normalizedModel = model ? model.toLowerCase() : "";
1212

13-
const patterns = [/gemini/, /claude/, /gpt/, /o\d/, /kimi/, /qwen/];
13+
const patterns = [
14+
/gemini/,
15+
/claude/,
16+
/gpt/,
17+
/o\d/,
18+
/kimi/,
19+
/qwen/,
20+
/llama/,
21+
/nemotron/,
22+
/mistral/,
23+
];
1424

1525
// If either name OR model matches any of the patterns, consider it capable
1626
if (patterns.some((pattern) => pattern.test(normalizedName))) {

0 commit comments

Comments (0)