Merged
Commits (47)
0af403f fix: add evals (jirispilka, Oct 13, 2025)
a7306e0 fix: add evals (jirispilka, Oct 13, 2025)
43e8b5a fix: add evals (jirispilka, Oct 13, 2025)
fd232da fix: add docs (jirispilka, Oct 13, 2025)
21444a3 fix: update evaluations (jirispilka, Oct 13, 2025)
bc32f04 fix: lint (jirispilka, Oct 13, 2025)
1a0da38 fix: env variables (jirispilka, Oct 13, 2025)
14d4cc5 fix: fix uv (jirispilka, Oct 13, 2025)
20eda85 fix: fix uv (jirispilka, Oct 13, 2025)
c688ecb fix: fix uv (jirispilka, Oct 13, 2025)
64bce1e fix: Update results (jirispilka, Oct 13, 2025)
cf1054b feat: Add typescript code (jirispilka, Oct 14, 2025)
54244b9 feat: Add run-evaluation.ts (jirispilka, Oct 14, 2025)
c9d9aeb fix: lint (jirispilka, Oct 14, 2025)
5194a4a fix: update create-dataset.ts with logs (jirispilka, Oct 14, 2025)
f2619f2 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
1353e8b fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
e766cab fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
8764c63 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
fbbe4c8 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
45db30d fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
483ffd5 fix: update documentation (jirispilka, Oct 14, 2025)
120fc18 fix: update tsconfig.json (jirispilka, Oct 14, 2025)
dcb5e61 fix: update evaluations.yaml (jirispilka, Oct 14, 2025)
5a16f53 fix: add PHOENIX_BASE_URL (jirispilka, Oct 14, 2025)
40bdeb4 fix: run-again (jirispilka, Oct 14, 2025)
c608779 fix: add debug log (jirispilka, Oct 14, 2025)
a3f04aa fix: add function to sanitize headers (jirispilka, Oct 14, 2025)
abe73f5 fix: evaluation and lint (jirispilka, Oct 14, 2025)
e40feb1 fix: update tools_exact_match (jirispilka, Oct 16, 2025)
5ed9eb8 fix: update run-evaluation.ts with llm as judge (jirispilka, Oct 16, 2025)
86c45df fix: update logs and rename for clarity (jirispilka, Oct 16, 2025)
4b3db00 fix: update prompt (jirispilka, Oct 16, 2025)
e27f07b fix: improve evaluation results (jirispilka, Oct 16, 2025)
a763934 fix: Use openrouter as llm judge (jirispilka, Oct 16, 2025)
529c334 fix: Organize packages (jirispilka, Oct 16, 2025)
1fedb52 Clean up Python cache files and update .gitignore (jirispilka, Oct 16, 2025)
0684a2a Remove unnecessary __init__.py from evals directory (jirispilka, Oct 16, 2025)
0cd6642 fix: evals ci (jirispilka, Oct 16, 2025)
cb26e7e fix: create dataset (jirispilka, Oct 16, 2025)
00ca260 fix: decrease threshold to get green light (jirispilka, Oct 16, 2025)
f1e9256 fix: fix eslint config (jirispilka, Oct 16, 2025)
9db9c24 fix: update README.md (jirispilka, Oct 16, 2025)
d118273 fix: run on push to master or evals tag (jirispilka, Oct 16, 2025)
4794ca4 fix: run on push to master or validated tag (jirispilka, Oct 16, 2025)
82df1ef fix: value interpolation in the template! It was not working and fail… (jirispilka, Oct 16, 2025)
ab61d1b fix: minor changes and a couple of more test cases (jirispilka, Oct 16, 2025)
9 changes: 7 additions & 2 deletions .env.example
@@ -1,3 +1,8 @@
APIFY_TOKEN=
# ANTHROPIC_API_KEY is only required when you want to run examples/clientStdioChat.js
ANTHROPIC_API_KEY=

# EVALS
PHOENIX_API_KEY=
PHOENIX_BASE_URL=

OPENROUTER_API_KEY=
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
46 changes: 46 additions & 0 deletions .github/workflows/evaluations.yaml
@@ -0,0 +1,46 @@
# This workflow runs MCP tool calling evaluations on master branch merges
# It evaluates AI models' ability to correctly identify and call MCP tools.

name: MCP tool calling evaluations

on:
# Run evaluations on master branch merges
push:
branches:
- 'master'
# Also run on PRs with the 'validated' label for testing
pull_request:
types: [labeled, synchronize, reopened]

jobs:
evaluations:
name: MCP tool calling evaluations
runs-on: ubuntu-latest
# Run on master pushes or PRs with the 'validated' label
if: github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'validated')

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Use Node.js 22
uses: actions/setup-node@v4
with:
node-version: 22
cache: 'npm'
cache-dependency-path: 'package-lock.json'

- name: Install Node dependencies
run: npm ci --include=dev

- name: Build project
run: npm run build

- name: Run evaluations
run: npm run evals:run
env:
GITHUB_PR_NUMBER: ${{ github.event_name == 'pull_request' && github.event.number || 'master' }}
PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
7 changes: 7 additions & 0 deletions .gitignore
@@ -28,3 +28,10 @@ key.pem

# Ignore MCP config for Opencode client
opencode.json

# Python cache files
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
2 changes: 1 addition & 1 deletion eslint.config.mjs
@@ -2,7 +2,7 @@ import apifyTypeScriptConfig from '@apify/eslint-config/ts.js';

// eslint-disable-next-line import/no-default-export
export default [
-{ ignores: ['**/dist'] }, // Ignores need to happen first
+{ ignores: ['**/dist', '**/.venv', 'evals/**'] }, // Ignores need to happen first
...apifyTypeScriptConfig,
{
languageOptions: {
64 changes: 64 additions & 0 deletions evals/README.md
@@ -0,0 +1,64 @@
# MCP tool selection evaluation

Evaluates MCP server tool selection. Phoenix is used only for storing results and visualization.

## CI Workflow

The evaluation workflow runs automatically on:
- **Master branch pushes** - for production evaluations (saves CI cycles)
- **PRs with `validated` label** - for testing evaluation changes before merging

To trigger evaluations on a PR, add the `validated` label to your pull request.

## Two evaluation methods

1. **Exact match** (`tool-exact-match`) - binary tool-name validation (see the sketch below)
2. **LLM judge** (`tool-selection-llm`) - Phoenix classifier with a structured prompt
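
Conceptually, the exact-match check reduces to a set comparison. A minimal sketch (illustrative only; the real check lives in `run-evaluation.ts`):

```typescript
// Sketch of the binary exact-match evaluator (not the actual implementation):
// score 1.0 only when the called tools equal the expected tools as a set.
function toolsExactMatch(expectedTools: string[], calledTools: string[]): number {
    const expected = new Set(expectedTools);
    const called = new Set(calledTools);
    if (expected.size !== called.size) return 0;
    for (const tool of expected) {
        if (!called.has(tool)) return 0;
    }
    return 1;
}
```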

## Why OpenRouter?

Unified API for Gemini, Claude, and GPT; no separate per-provider integrations needed.
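
Because OpenRouter exposes an OpenAI-compatible API, a single client covers every evaluated model. A sketch, assuming the `openai` npm package:

```typescript
import OpenAI from 'openai';

// One OpenAI-compatible client; OpenRouter routes requests by the
// vendor-prefixed model id (e.g. 'anthropic/claude-3.5-haiku').
const client = new OpenAI({
    baseURL: process.env.OPENROUTER_BASE_URL, // https://openrouter.ai/api/v1
    apiKey: process.env.OPENROUTER_API_KEY,
});

const response = await client.chat.completions.create({
    model: 'anthropic/claude-3.5-haiku',
    messages: [{ role: 'user', content: 'Which Actor scrapes Instagram profiles?' }],
});
```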

## Judge model

- model: `openai/gpt-4o-mini`
- prompt: structured eval with context + tool definitions
- output: "correct"/"incorrect" → 1.0/0.0 score (and explanation)
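
Conceptually, the judge step fills the `{{...}}` placeholders in `TOOL_CALLING_BASE_TEMPLATE` (see `config.ts`) and maps the returned label to a score. A hedged sketch; both helper names are illustrative, not the real API:

```typescript
// Illustrative: substitute {{placeholder}} tokens in the judge template.
function fillTemplate(template: string, values: Record<string, string>): string {
    return template.replace(/\{\{(\w+)\}\}/g, (_match, key: string) => values[key] ?? '');
}

// Illustrative: convert the judge's one-word decision to the 1.0/0.0 score.
function judgeLabelToScore(label: string): number {
    return label.trim().toLowerCase() === 'correct' ? 1.0 : 0.0;
}
```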

## Config (`config.ts`)

```typescript
MODELS_TO_EVALUATE = ['openai/gpt-4o-mini', 'anthropic/claude-3.5-haiku', 'google/gemini-2.5-flash']
PASS_THRESHOLD = 0.6
TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini'
```

## Setup

```bash
export PHOENIX_BASE_URL="your_url"
export PHOENIX_API_KEY="your_key"
export OPENROUTER_API_KEY="your_key"
export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

npm ci
npm run evals:create-dataset # one-time
npm run evals:run
```

## Test cases

40+ cases across 7 tool categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs`
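
Each entry follows the `TestCase` shape consumed by `create-dataset.ts`; the values below are made up for illustration:

```typescript
// Hypothetical test case; field names mirror the TestCase interface
// in create-dataset.ts, all values are illustrative.
const exampleCase = {
    id: 'search-actors-001',
    category: 'search-actors',
    query: 'Find an Actor that can scrape Instagram profiles',
    expectedTools: ['search-actors'],
    reference: 'The agent should search the Apify Store rather than call an Actor.',
};
```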

## Output

- Phoenix dashboard with detailed results
- console: pass/fail per model + evaluator
- exit code: 0 = success, 1 = failure
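
The exit code reduces to a threshold gate over mean scores, roughly as below (`EvalResult` is an assumed shape, not the real type):

```typescript
import { PASS_THRESHOLD } from './config.js';

// Assumed result shape, for illustration only.
interface EvalResult { model: string; evaluator: string; meanScore: number }

// Exit 0 only when every model/evaluator pair clears PASS_THRESHOLD (0.6).
function exitCodeFor(results: EvalResult[]): number {
    return results.every((r) => r.meanScore >= PASS_THRESHOLD) ? 0 : 1;
}
```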

## Updating test cases

To add or modify test cases:
1. edit `test-cases.json`
2. run `npm run evals:create-dataset` to update Phoenix dataset
3. run `npm run evals:run` to test changes
114 changes: 114 additions & 0 deletions evals/config.ts
@@ -0,0 +1,114 @@
/**
* Configuration for Apify MCP Server evaluations.
*/

import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

// Read version from test-cases.json
function getTestCasesVersion(): string {
const currentFilename = fileURLToPath(import.meta.url);
const currentDirname = dirname(currentFilename);
const testCasesPath = join(currentDirname, 'test-cases.json');
const testCasesContent = readFileSync(testCasesPath, 'utf-8');
const testCases = JSON.parse(testCasesContent);
return testCases.version;
}

// Evaluator names
export const EVALUATOR_NAMES = {
TOOLS_EXACT_MATCH: 'tool-exact-match',
TOOL_SELECTION_LLM: 'tool-selection-llm',
} as const;

export type EvaluatorName = typeof EVALUATOR_NAMES[keyof typeof EVALUATOR_NAMES];

// Models to evaluate
export const MODELS_TO_EVALUATE = [
'openai/gpt-4o-mini',
'anthropic/claude-3.5-haiku',
'google/gemini-2.5-flash',
];

export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';

export const PASS_THRESHOLD = 0.6;

export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;

// System prompt
export const SYSTEM_PROMPT = 'You are a helpful assistant';

export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant evaluating user queries and tool calls to
determine whether a tool was chosen and whether it was the right tool.

The tool calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

[BEGIN DATA]
************
{{context}}
{{query}}
************
{{tool_calls}}
{{llm_response}}
************
[END DATA]

DECISION: [correct or incorrect]
EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]

The DECISION must be a single word, either "correct" or "incorrect",
and must not contain any text or characters aside from that word.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the query, the tool call generated is runnable and correct,
and that no outside information not present in the query was used
in the generated query.

"incorrect" means that the chosen tool was not correct
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

You must not use any outside information or make assumptions.
Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
the [Tool Definitions], and the [Reference instructions] (if provided).
Reference instructions are optional and are intended to help you understand the use case and make your decision.

{{reference}}

{{tool_definitions}}
`;

export function getRequiredEnvVars(): Record<string, string | undefined> {
return {
PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,
PHOENIX_API_KEY: process.env.PHOENIX_API_KEY,
OPENROUTER_API_KEY: process.env.OPENROUTER_API_KEY,
OPENROUTER_BASE_URL: process.env.OPENROUTER_BASE_URL,
};
}

// Removes newlines and trims whitespace. Useful for Authorization header values
// because CI secrets sometimes include trailing newlines or quotes.
export function sanitizeHeaderValue(value?: string): string | undefined {
if (value == null) return value;
return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '');
}

export function validateEnvVars(): boolean {
const envVars = getRequiredEnvVars();
const missing = Object.entries(envVars)
.filter(([, value]) => !value)
.map(([key]) => key);

if (missing.length > 0) {
// eslint-disable-next-line no-console
console.error(`Missing required environment variables: ${missing.join(', ')}`);
return false;
}

return true;
}
108 changes: 108 additions & 0 deletions evals/create-dataset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env tsx
/**
 * One-time script to create a Phoenix dataset from test cases.
 * Run it once to upload the test cases to the Phoenix platform and receive a dataset ID.
*/

import { readFileSync } from 'node:fs';
import { dirname as pathDirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { createDataset } from '@arizeai/phoenix-client/datasets';
import dotenv from 'dotenv';

import log from '@apify/log';

import { sanitizeHeaderValue, validateEnvVars } from './config.js';

// Set log level to info
log.setLevel(log.LEVELS.INFO);

// Load environment variables from .env file if present
dotenv.config({ path: '.env' });

interface TestCase {
id: string;
category: string;
query: string;
context?: string;
expectedTools?: string[];
reference?: string;
}

interface TestData {
version: string;
testCases: TestCase[];
}

// eslint-disable-next-line consistent-return
function loadTestCases(): TestData {
const filename = fileURLToPath(import.meta.url);
const dirname = pathDirname(filename);
const testCasesPath = join(dirname, 'test-cases.json');

try {
const fileContent = readFileSync(testCasesPath, 'utf-8');
return JSON.parse(fileContent) as TestData;
} catch (err) {
log.error(`Failed to load test cases from ${testCasesPath}: ${err}`);
process.exit(1);
}
}

async function createDatasetFromTestCases(): Promise<void> {
log.info('Creating Phoenix dataset from test cases...');

// Validate environment variables
if (!validateEnvVars()) {
process.exit(1);
}

// Load test cases
const testData = loadTestCases();
const { testCases } = testData;

log.info(`Loaded ${testCases.length} test cases`);

// Convert to format expected by Phoenix
const examples = testCases.map((testCase) => ({
input: { query: testCase.query },
output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
metadata: { category: testCase.category },
}));

// Initialize Phoenix client
const client = createClient({
options: {
baseUrl: process.env.PHOENIX_BASE_URL!,
headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
},
});

// Upload dataset
const datasetName = `mcp_server_dataset_v${testData.version}`;

log.info(`Uploading dataset '${datasetName}' to Phoenix...`);

try {
const { datasetId } = await createDataset({
client,
name: datasetName,
description: `MCP server dataset: version ${testData.version}`,
examples,
});

log.info(`Dataset '${datasetName}' created with ID: ${datasetId}`);
} catch (error) {
log.error(`Error creating dataset: ${error}`);
process.exit(1);
}
}

// Run the script
createDatasetFromTestCases().catch((error) => {
log.error(`Unexpected error: ${error}`);
process.exit(1);
});