Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,28 @@ Different models require different server flags for tool calling. Use the correc
| Model Family | Server Flags |
|-------------|-------------|
| **Qwen / Hermes** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser hermes` |
| **GPT-OSS** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser minimax` |
| **GPT-OSS (Harmony)** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser harmony` |
| **MiniMax** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser minimax` |
| **DeepSeek V3.1 / R1-0528** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser deepseek_v31` |
| **GLM-4** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser glm47` |
| **Qwen3-Coder (XML)** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser qwen3_coder_xml` |
| **Other / No tools** | `vllm-mlx serve <model> --port 8000` |

Then pass the matching `--parser` to the eval script:
```bash
python evals/run_eval.py --model "X" --parser hermes # for Qwen/Hermes models
python evals/run_eval.py --model "X" --parser minimax # for GPT-OSS models
python evals/run_eval.py --model "X" --parser harmony # for GPT-OSS (Harmony) models
python evals/run_eval.py --model "X" --parser minimax # for MiniMax models
python evals/run_eval.py --model "X" --parser deepseek_v31 # for DeepSeek V3.1 / R1-0528
python evals/run_eval.py --model "X" --parser glm47 # for GLM-4 models
python evals/run_eval.py --model "X" --parser qwen3_coder_xml # for Qwen3-Coder (XML)
```

## Eval Suites

| Suite | Items | What it tests | Scoring |
|-------|-------|---------------|---------|
| **Speed** | 4 metrics | TTFT cold/warm, decode tok/s short/long | Absolute numbers |
| **Speed** | 6 metrics | TTFT cold/warm, decode tok/s short/long, RAM active/peak | Absolute numbers |
| **Tool Calling** | 30 scenarios | Tool detection, parallel calls, irrelevance, error recovery | % fully correct |
| **Coding** | 10 tasks | HumanEval+ problems (medium-hard) | % tests pass |
| **Reasoning** | 10 problems | MATH-500 competition math (levels 2-5, fractions + integers) | % correct answer |
Expand Down
126 changes: 126 additions & 0 deletions evals/run_all_models.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash
# Batch eval runner — runs ALL suites for all text LLMs
# Usage: bash evals/run_all_models.sh [suite1 suite2 ...]
# Examples:
#   bash evals/run_all_models.sh                       # all suites
#   bash evals/run_all_models.sh speed tool_calling    # specific suites
# NOTE: Model paths below are machine-specific. Update them to match your
# local model directory before running.
# No set -e: server kill/wait returns non-zero which is expected

# Constants for the server/eval invocations. CLI_CMD re-dispatches argv
# through the vllm_mlx CLI entry point so we can run it as `python -c`.
readonly PYTHON=python3.12
readonly CLI_CMD="from vllm_mlx.cli import main; import sys; sys.argv = ['vllm-mlx'] + sys.argv[1:]; main()"
readonly PORT=8000
readonly EVAL_CMD="$PYTHON evals/run_eval.py"

# Suites to run (all by default, or from command line)
if [[ $# -gt 0 ]]; then
  SUITES="$*"
else
  SUITES="speed tool_calling coding reasoning general"
fi

# Model configs: name|path|parser|quantization
declare -a MODELS=(
  "Qwen3-0.6B-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3-0.6B-MLX-4bit|hermes|4bit"
  "GLM-4.7-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/GLM-4.7-4bit|glm47|4bit"
  "GPT-OSS-20B-mxfp4-q8|/Users/raullenstudio/.lmstudio/models/mlx-community/gpt-oss-20b-MXFP4-Q8|harmony|mxfp4-q8"
  "MiniMax-M2.5-4bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/MiniMax-M2.5-MLX-4bit|minimax|4bit"
  "Qwen3.5-35B-A3B-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-35B-A3B-4bit|hermes|4bit"
  "Qwen3.5-35B-A3B-8bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-35B-A3B-8bit|hermes|8bit"
  "Qwen3-Coder-Next-4bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/Qwen3-Coder-Next-MLX-4bit|hermes|4bit"
  "Qwen3-Coder-Next-6bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/Qwen3-Coder-Next-MLX-6bit|hermes|6bit"
  "Qwen3.5-122B-A10B-mxfp4|/Users/raullenstudio/.lmstudio/models/nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx|hermes|mxfp4"
  "Qwen3.5-122B-A10B-8bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-122B-A10B-8bit|hermes|8bit"
  # Requested by community — download and uncomment to eval:
  # "Mistral-Small-3.2-4bit|<path>|hermes|4bit"
  # "Devstral-Small-4bit|<path>|hermes|4bit"
  # "GLM-4.5-Air-4bit|<path>|glm47|4bit"
  # "Nemotron-Nano-30B-4bit|<path>|hermes|4bit"
  # "Qwen3.5-4B-4bit|<path>|hermes|4bit"
  # "Qwen3.5-9B-4bit|<path>|hermes|4bit"
)

#######################################
# Launch the vllm-mlx server for one model and block until its /health
# endpoint reports healthy, or give up after ~240s (120 polls x 2s).
# Globals:   SERVER_PID (written), PYTHON, CLI_CMD, PORT (read)
# Arguments: $1 - model path, $2 - tool-call parser name
# Returns:   0 when healthy, 1 on early exit or timeout
#######################################
start_server() {
  local model_path="$1"
  local parser="$2"
  echo "  Starting server: $(basename "$model_path") (parser=$parser)..."
  $PYTHON -c "$CLI_CMD" serve "$model_path" --port "$PORT" \
      --enable-auto-tool-choice --tool-call-parser "$parser" &
  SERVER_PID=$!

  local i
  for i in $(seq 1 120); do
    # Bail out immediately if the server process already died —
    # otherwise we would spin the full 240s for nothing.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      echo "  ERROR: Server process exited before becoming healthy"
      return 1
    fi
    if curl -s "http://localhost:$PORT/health" | grep -q "healthy"; then
      # Each completed poll cycle waited 2s, so elapsed ~= (i-1)*2 seconds.
      echo "  Server ready ($(( (i - 1) * 2 ))s)"
      return 0
    fi
    sleep 2
  done
  echo "  ERROR: Server failed to start within 240s"
  kill "$SERVER_PID" 2>/dev/null
  return 1
}

#######################################
# Shut down the tracked eval server, then sweep any stragglers still
# bound to $PORT, and pause briefly so the OS releases the port.
# Globals:   SERVER_PID (read/cleared), PORT (read)
#######################################
stop_server() {
  if [[ -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null
    wait "$SERVER_PID" 2>/dev/null
    SERVER_PID=""
  fi
  # Belt-and-braces: kill anything else listening on the port.
  lsof -ti:"$PORT" | xargs kill 2>/dev/null || true
  sleep 3
}

echo "========================================"
echo "vllm-mlx Full Model Evaluation"
echo "========================================"
echo "Models: ${#MODELS[@]}"
echo "Suites: $SUITES"
echo ""

TOTAL_START=$(date +%s)

for model_config in "${MODELS[@]}"; do
  # Split "name|path|parser|quantization" into its four fields.
  IFS='|' read -r name path parser quant <<< "$model_config"

  # Skip if model path doesn't exist
  if [ ! -d "$path" ]; then
    echo "SKIP: $name (path not found: $path)"
    echo ""
    continue
  fi

  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Model: $name ($quant, parser=$parser)"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Make sure no stale server from a previous (possibly crashed) run
  # is still holding the port before starting this model.
  stop_server

  if start_server "$path" "$parser"; then
    # SUITES is intentionally unquoted: it holds space-separated suite
    # names that must reach run_eval.py as separate arguments.
    # shellcheck disable=SC2086
    $EVAL_CMD \
        --model "$name" \
        --parser "$parser" \
        --quantization "$quant" \
        --suite $SUITES \
        --server-flags "--enable-auto-tool-choice --tool-call-parser $parser"
    echo ""
  else
    echo "  SKIPPED: $name (server failed to start)"
    echo ""
  fi

  stop_server
done

TOTAL_END=$(date +%s)
TOTAL_ELAPSED=$((TOTAL_END - TOTAL_START))
MINUTES=$((TOTAL_ELAPSED / 60))
# Remainder seconds — previously this printed the FULL elapsed seconds
# after the minutes (e.g. "5m 330s").
SECS=$((TOTAL_ELAPSED % 60))

echo "========================================"
echo "All evals complete in ${MINUTES}m ${SECS}s"
echo "========================================"
echo ""
echo "Results:"
ls -la evals/results/*.json
echo ""
echo "Regenerate scorecard with: python3.12 evals/generate_scorecard.py"
18 changes: 11 additions & 7 deletions tests/test_harmony_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,14 +1323,18 @@ def test_invalid_parser_not_registered(self):


class TestHarmonyNativeFormat:
"""Test that Harmony parser correctly declares no native format support."""
"""Test that Harmony parser declares native format support.

def test_supports_native_format_false(self):
"""HarmonyToolParser does not support native tool format."""
assert HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT is False
assert HarmonyToolParser.supports_native_format() is False
GPT-OSS chat templates natively handle tool_calls and role='tool'
messages using harmony channel tokens.
"""

def test_supports_native_format_true(self):
"""HarmonyToolParser supports native tool format."""
assert HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT is True
assert HarmonyToolParser.supports_native_format() is True

def test_instance_supports_native_format(self):
"""Instance-level check also returns False."""
"""Instance-level check also returns True."""
parser = HarmonyToolParser()
assert parser.supports_native_format() is False
assert parser.supports_native_format() is True
3 changes: 3 additions & 0 deletions tests/test_native_tool_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DeepSeekToolParser,
FunctionaryToolParser,
GraniteToolParser,
HarmonyToolParser,
HermesToolParser,
KimiToolParser,
LlamaToolParser,
Expand All @@ -37,6 +38,7 @@ def test_parsers_with_native_support(self):
FunctionaryToolParser,
KimiToolParser,
HermesToolParser,
HarmonyToolParser,
]
for parser_cls in native_parsers:
assert (
Expand Down Expand Up @@ -73,6 +75,7 @@ def test_via_manager(self):
"functionary",
"kimi",
"hermes",
"harmony",
]:
parser_cls = ToolParserManager.get_tool_parser(name)
assert (
Expand Down
Loading