Skip to content

Commit d494bc1

Browse files
committed
fix(cli): expand model capability detection to include Llama, Nemotron, and Mistral models
The isModelCapable function was showing false warnings for Llama, Nemotron, and Mistral models, claiming they had "limited reasoning and tool calling capabilities" when they actually have excellent capabilities.

**Changes:**
- Added /llama/, /nemotron/, /mistral/ patterns to capability detection regex
- Updated tests to reflect that these model families ARE capable
- All tests passing (26/26)

**Research validation:**
- Llama 3.3/Nemotron: #1 on alignment benchmarks, Arena Hard 85.0
- Mistral: 81.2% MMLU, supports function calling and JSON mode
- Both families widely used for agent workflows with proven tool calling

**Impact:**
- Removes false warnings for users of these popular model families
- Enables proper multiEdit tool usage for capable models
- Aligns detection with real-world model capabilities

Tested with nvidia/Llama-3_3-Nemotron-Super-49B-v1 on MITRE AIP endpoints.

Authored by: Aaron Lippold <[email protected]>
1 parent 6b7111c commit d494bc1

File tree

2 files changed

+52
-19
lines changed

2 files changed

+52
-19
lines changed

extensions/cli/src/utils/modelCapability.test.ts

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -53,26 +53,46 @@ describe("isModelCapable", () => {
5353
});
5454

5555
describe("Local/Ollama models", () => {
56-
test("should consider larger models as not capable", () => {
57-
expect(isModelCapable("ollama", "llama2-70b")).toBe(false);
58-
expect(isModelCapable("local", "codellama-34b")).toBe(false);
56+
test("should consider Llama models as capable (matches llama pattern)", () => {
57+
expect(isModelCapable("ollama", "llama2-70b")).toBe(true);
58+
expect(isModelCapable("ollama", "llama2-7b")).toBe(true);
59+
expect(isModelCapable("local", "codellama-34b")).toBe(true);
5960
});
6061

61-
test("should consider smaller models as less capable", () => {
62-
expect(isModelCapable("ollama", "llama2-7b")).toBe(false);
63-
expect(isModelCapable("local", "mistral-7b")).toBe(false);
62+
test("should consider Mistral models as capable (matches mistral pattern)", () => {
63+
expect(isModelCapable("local", "mistral-7b")).toBe(true);
64+
expect(isModelCapable("ollama", "mistral-small")).toBe(true);
65+
});
66+
67+
test("should consider non-capable local models as not capable", () => {
68+
expect(isModelCapable("ollama", "falcon-7b")).toBe(false);
69+
expect(isModelCapable("local", "starcoder-base")).toBe(false);
6470
});
6571
});
6672

6773
describe("Meta/Llama models", () => {
68-
test("should consider large Llama models as not capable", () => {
69-
expect(isModelCapable("llama", "llama-2-70b")).toBe(false);
70-
expect(isModelCapable("meta", "llama-65b")).toBe(false);
74+
test("should consider Llama models as capable", () => {
75+
expect(isModelCapable("openai", "Llama 3.3 70B")).toBe(true);
76+
expect(isModelCapable("openai", "Llama 3.3 Nemotron 49B")).toBe(true);
77+
expect(
78+
isModelCapable(
79+
"nvidia",
80+
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
81+
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
82+
),
83+
).toBe(true);
84+
expect(isModelCapable("meta", "llama-3.1-70b")).toBe(true);
85+
});
86+
87+
test("should consider Nemotron models as capable", () => {
88+
expect(isModelCapable("nvidia", "Llama 3.3 Nemotron 49B")).toBe(true);
89+
expect(isModelCapable("nvidia", "nemotron-4-340b")).toBe(true);
7190
});
7291

73-
test("should consider small Llama models as less capable", () => {
74-
expect(isModelCapable("llama", "llama-2-7b")).toBe(false);
75-
expect(isModelCapable("meta", "llama-13b")).toBe(false);
92+
test("should consider Mistral models as capable", () => {
93+
expect(isModelCapable("mistral", "Mistral Small 24B")).toBe(true);
94+
expect(isModelCapable("mistral", "mistral-large")).toBe(true);
95+
expect(isModelCapable("mistralai", "mistral-small-24b")).toBe(true);
7696
});
7797
});
7898

@@ -83,12 +103,13 @@ describe("isModelCapable", () => {
83103
});
84104

85105
describe("Hugging Face models", () => {
86-
test("should consider code-specific models as not capable", () => {
87-
expect(isModelCapable("huggingface", "codellama-instruct")).toBe(false);
88-
expect(isModelCapable("huggingface", "starcoder-base")).toBe(false);
106+
test("should consider Llama/Mistral models as capable even on HuggingFace", () => {
107+
expect(isModelCapable("huggingface", "codellama-instruct")).toBe(true);
108+
expect(isModelCapable("huggingface", "mistral-7b-instruct")).toBe(true);
89109
});
90110

91-
test("should consider general chat models as less capable", () => {
111+
test("should consider non-capable models as not capable", () => {
112+
expect(isModelCapable("huggingface", "starcoder-base")).toBe(false);
92113
expect(isModelCapable("huggingface", "falcon-7b")).toBe(false);
93114
});
94115
});
@@ -145,9 +166,11 @@ describe("isModelCapable", () => {
145166
});
146167

147168
test("should consider models not capable when neither name nor model match", () => {
148-
// Case where neither matches
149-
expect(isModelCapable("custom", "llama-7b", "local-model")).toBe(false);
169+
// Case where neither matches capable patterns
150170
expect(isModelCapable("custom", "falcon-7b", "random-model")).toBe(false);
171+
expect(isModelCapable("custom", "unknown-model", "local-model")).toBe(
172+
false,
173+
);
151174
});
152175
});
153176
});

extensions/cli/src/utils/modelCapability.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,17 @@ export function isModelCapable(
1010
const normalizedName = name.toLowerCase();
1111
const normalizedModel = model ? model.toLowerCase() : "";
1212

13-
const patterns = [/gemini/, /claude/, /gpt/, /o\d/, /kimi/, /qwen/];
13+
const patterns = [
14+
/gemini/,
15+
/claude/,
16+
/gpt/,
17+
/o\d/,
18+
/kimi/,
19+
/qwen/,
20+
/llama/,
21+
/nemotron/,
22+
/mistral/,
23+
];
1424

1525
// If either name OR model matches any of the patterns, consider it capable
1626
if (patterns.some((pattern) => pattern.test(normalizedName))) {

0 commit comments

Comments (0)