Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,28 @@ Different models require different server flags for tool calling. Use the correc
| Model Family | Server Flags |
|-------------|-------------|
| **Qwen / Hermes** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser hermes` |
| **GPT-OSS** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser minimax` |
| **GPT-OSS (Harmony)** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser harmony` |
| **MiniMax** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser minimax` |
| **DeepSeek V3.1 / R1-0528** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser deepseek_v31` |
| **GLM-4** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser glm47` |
| **Qwen3-Coder (XML)** | `vllm-mlx serve <model> --port 8000 --enable-auto-tool-choice --tool-call-parser qwen3_coder_xml` |
| **Other / No tools** | `vllm-mlx serve <model> --port 8000` |

Then pass the matching `--parser` to the eval script:
```bash
python evals/run_eval.py --model "X" --parser hermes # for Qwen/Hermes models
python evals/run_eval.py --model "X" --parser minimax # for GPT-OSS models
python evals/run_eval.py --model "X" --parser harmony # for GPT-OSS (Harmony) models
python evals/run_eval.py --model "X" --parser minimax # for MiniMax models
python evals/run_eval.py --model "X" --parser deepseek_v31 # for DeepSeek V3.1 / R1-0528
python evals/run_eval.py --model "X" --parser glm47 # for GLM-4 models
python evals/run_eval.py --model "X" --parser qwen3_coder_xml # for Qwen3-Coder (XML)
```

## Eval Suites

| Suite | Items | What it tests | Scoring |
|-------|-------|---------------|---------|
| **Speed** | 4 metrics | TTFT cold/warm, decode tok/s short/long | Absolute numbers |
| **Speed** | 6 metrics | TTFT cold/warm, decode tok/s short/long, RAM active/peak | Absolute numbers |
| **Tool Calling** | 30 scenarios | Tool detection, parallel calls, irrelevance, error recovery | % fully correct |
| **Coding** | 10 tasks | HumanEval+ problems (medium-hard) | % tests pass |
| **Reasoning** | 10 problems | MATH-500 competition math (levels 2-5, fractions + integers) | % correct answer |
Expand Down
126 changes: 126 additions & 0 deletions evals/run_all_models.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash
# Batch eval runner — runs ALL suites for all text LLMs
# Usage: bash evals/run_all_models.sh [suite1 suite2 ...]
# Examples:
#   bash evals/run_all_models.sh                       # all suites
#   bash evals/run_all_models.sh speed tool_calling    # specific suites
# NOTE: Model paths below are machine-specific. Update them to match your
# local model directory before running.
# No set -e: server kill/wait returns non-zero which is expected

# Constants for the server/eval invocations. CLI_CMD re-dispatches argv
# through the vllm_mlx CLI entry point so we can run it as `python -c`.
readonly PYTHON=python3.12
readonly CLI_CMD="from vllm_mlx.cli import main; import sys; sys.argv = ['vllm-mlx'] + sys.argv[1:]; main()"
readonly PORT=8000
readonly EVAL_CMD="$PYTHON evals/run_eval.py"

# Suites to run (all by default, or from command line)
if [[ $# -gt 0 ]]; then
  SUITES="$*"
else
  SUITES="speed tool_calling coding reasoning general"
fi

# Model configs: name|path|parser|quantization
declare -a MODELS=(
  "Qwen3-0.6B-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3-0.6B-MLX-4bit|hermes|4bit"
  "GLM-4.7-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/GLM-4.7-4bit|glm47|4bit"
  "GPT-OSS-20B-mxfp4-q8|/Users/raullenstudio/.lmstudio/models/mlx-community/gpt-oss-20b-MXFP4-Q8|harmony|mxfp4-q8"
  "MiniMax-M2.5-4bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/MiniMax-M2.5-MLX-4bit|minimax|4bit"
  "Qwen3.5-35B-A3B-4bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-35B-A3B-4bit|hermes|4bit"
  "Qwen3.5-35B-A3B-8bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-35B-A3B-8bit|hermes|8bit"
  "Qwen3-Coder-Next-4bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/Qwen3-Coder-Next-MLX-4bit|hermes|4bit"
  "Qwen3-Coder-Next-6bit|/Users/raullenstudio/.lmstudio/models/lmstudio-community/Qwen3-Coder-Next-MLX-6bit|hermes|6bit"
  "Qwen3.5-122B-A10B-mxfp4|/Users/raullenstudio/.lmstudio/models/nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx|hermes|mxfp4"
  "Qwen3.5-122B-A10B-8bit|/Users/raullenstudio/.lmstudio/models/mlx-community/Qwen3.5-122B-A10B-8bit|hermes|8bit"
  # Requested by community — download and uncomment to eval:
  # "Mistral-Small-3.2-4bit|<path>|hermes|4bit"
  # "Devstral-Small-4bit|<path>|hermes|4bit"
  # "GLM-4.5-Air-4bit|<path>|glm47|4bit"
  # "Nemotron-Nano-30B-4bit|<path>|hermes|4bit"
  # "Qwen3.5-4B-4bit|<path>|hermes|4bit"
  # "Qwen3.5-9B-4bit|<path>|hermes|4bit"
)

#######################################
# Launch the vllm-mlx server for one model and block until its /health
# endpoint reports healthy, or give up after ~240s (120 polls x 2s).
# Globals:   SERVER_PID (written), PYTHON, CLI_CMD, PORT (read)
# Arguments: $1 - model path, $2 - tool-call parser name
# Returns:   0 when healthy, 1 on early exit or timeout
#######################################
start_server() {
  local model_path="$1"
  local parser="$2"
  echo "  Starting server: $(basename "$model_path") (parser=$parser)..."
  $PYTHON -c "$CLI_CMD" serve "$model_path" --port "$PORT" \
      --enable-auto-tool-choice --tool-call-parser "$parser" &
  SERVER_PID=$!

  local i
  for i in $(seq 1 120); do
    # Bail out immediately if the server process already died —
    # otherwise we would spin the full 240s for nothing.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      echo "  ERROR: Server process exited before becoming healthy"
      return 1
    fi
    if curl -s "http://localhost:$PORT/health" | grep -q "healthy"; then
      # Each completed poll cycle waited 2s, so elapsed ~= (i-1)*2 seconds.
      echo "  Server ready ($(( (i - 1) * 2 ))s)"
      return 0
    fi
    sleep 2
  done
  echo "  ERROR: Server failed to start within 240s"
  kill "$SERVER_PID" 2>/dev/null
  return 1
}

#######################################
# Shut down the tracked eval server, then sweep any stragglers still
# bound to $PORT, and pause briefly so the OS releases the port.
# Globals:   SERVER_PID (read/cleared), PORT (read)
#######################################
stop_server() {
  if [[ -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null
    wait "$SERVER_PID" 2>/dev/null
    SERVER_PID=""
  fi
  # Belt-and-braces: kill anything else listening on the port.
  lsof -ti:"$PORT" | xargs kill 2>/dev/null || true
  sleep 3
}

echo "========================================"
echo "vllm-mlx Full Model Evaluation"
echo "========================================"
echo "Models: ${#MODELS[@]}"
echo "Suites: $SUITES"
echo ""

TOTAL_START=$(date +%s)

for model_config in "${MODELS[@]}"; do
  # Split "name|path|parser|quantization" into its four fields.
  IFS='|' read -r name path parser quant <<< "$model_config"

  # Skip if model path doesn't exist
  if [ ! -d "$path" ]; then
    echo "SKIP: $name (path not found: $path)"
    echo ""
    continue
  fi

  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Model: $name ($quant, parser=$parser)"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Make sure no stale server from a previous (possibly crashed) run
  # is still holding the port before starting this model.
  stop_server

  if start_server "$path" "$parser"; then
    # SUITES is intentionally unquoted: it holds space-separated suite
    # names that must reach run_eval.py as separate arguments.
    # shellcheck disable=SC2086
    $EVAL_CMD \
        --model "$name" \
        --parser "$parser" \
        --quantization "$quant" \
        --suite $SUITES \
        --server-flags "--enable-auto-tool-choice --tool-call-parser $parser"
    echo ""
  else
    echo "  SKIPPED: $name (server failed to start)"
    echo ""
  fi

  stop_server
done

TOTAL_END=$(date +%s)
TOTAL_ELAPSED=$((TOTAL_END - TOTAL_START))
MINUTES=$((TOTAL_ELAPSED / 60))
# Remainder seconds — previously this printed the FULL elapsed seconds
# after the minutes (e.g. "5m 330s").
SECS=$((TOTAL_ELAPSED % 60))

echo "========================================"
echo "All evals complete in ${MINUTES}m ${SECS}s"
echo "========================================"
echo ""
echo "Results:"
ls -la evals/results/*.json
echo ""
echo "Regenerate scorecard with: python3.12 evals/generate_scorecard.py"
18 changes: 11 additions & 7 deletions tests/test_harmony_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,14 +1323,18 @@ def test_invalid_parser_not_registered(self):


class TestHarmonyNativeFormat:
"""Test that Harmony parser correctly declares no native format support."""
"""Test that Harmony parser declares native format support.

def test_supports_native_format_false(self):
"""HarmonyToolParser does not support native tool format."""
assert HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT is False
assert HarmonyToolParser.supports_native_format() is False
GPT-OSS chat templates natively handle tool_calls and role='tool'
messages using harmony channel tokens.
"""

def test_supports_native_format_true(self):
"""HarmonyToolParser supports native tool format."""
assert HarmonyToolParser.SUPPORTS_NATIVE_TOOL_FORMAT is True
assert HarmonyToolParser.supports_native_format() is True

def test_instance_supports_native_format(self):
"""Instance-level check also returns False."""
"""Instance-level check also returns True."""
parser = HarmonyToolParser()
assert parser.supports_native_format() is False
assert parser.supports_native_format() is True
3 changes: 3 additions & 0 deletions tests/test_native_tool_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DeepSeekToolParser,
FunctionaryToolParser,
GraniteToolParser,
HarmonyToolParser,
HermesToolParser,
KimiToolParser,
LlamaToolParser,
Expand All @@ -37,6 +38,7 @@ def test_parsers_with_native_support(self):
FunctionaryToolParser,
KimiToolParser,
HermesToolParser,
HarmonyToolParser,
]
for parser_cls in native_parsers:
assert (
Expand Down Expand Up @@ -73,6 +75,7 @@ def test_via_manager(self):
"functionary",
"kimi",
"hermes",
"harmony",
]:
parser_cls = ToolParserManager.get_tool_parser(name)
assert (
Expand Down
Loading