# feat: MCP tool calling evaluations in CI/CD #313
Merged · +5,741 −126
## Commits (47)
All 47 commits are by jirispilka:

- `0af403f` fix: add evals
- `a7306e0` fix: add evals
- `43e8b5a` fix: add evals
- `fd232da` fix: add docs
- `21444a3` fix: update evaluations
- `bc32f04` fix: lint
- `1a0da38` fix: env variables
- `14d4cc5` fix: fix uv
- `20eda85` fix: fix uv
- `c688ecb` fix: fix uv
- `64bce1e` fix: Update results
- `cf1054b` feat: Add typescript code
- `54244b9` feat: Add run-evaluation.ts
- `c9d9aeb` fix: lint
- `5194a4a` fix: update create-dataset.ts with logs
- `f2619f2` fix: update run-evaluation.ts
- `1353e8b` fix: update run-evaluation.ts
- `e766cab` fix: update run-evaluation.ts
- `8764c63` fix: update run-evaluation.ts
- `fbbe4c8` fix: update run-evaluation.ts
- `45db30d` fix: update run-evaluation.ts
- `483ffd5` fix: update documentation
- `120fc18` fix: update tsconfig.json
- `dcb5e61` fix: update evaluations.yaml
- `5a16f53` fix: add PHOENIX_BASE_URL
- `40bdeb4` fix: run-again
- `c608779` fix: add debug log
- `a3f04aa` fix: add function to sanitize headers
- `abe73f5` fix: evaluation and lint
- `e40feb1` fix: update tools_exact_match
- `5ed9eb8` fix: update run-evaluation.ts with llm as judge
- `86c45df` fix: update logs and rename for clarity
- `4b3db00` fix: update prompt
- `e27f07b` fix: improve evaluation results
- `a763934` fix: Use openrouter as llm judge
- `529c334` fix: Organize packages
- `1fedb52` Clean up Python cache files and update .gitignore
- `0684a2a` Remove unnecessary __init__.py from evals directory
- `0cd6642` fix: evals ci
- `cb26e7e` fix: create dataset
- `00ca260` fix: decrease threshold to get green light
- `f1e9256` fix: fix eslint config
- `9db9c24` fix: update README.md
- `d118273` fix: run on push to master or evals tag
- `4794ca4` fix: run on push to master or validated tag
- `82df1ef` fix: value interpolation in the template! It was not working and fail…
- `ab61d1b` fix: minor changes and a couple of more test cases
### Environment file (`.env`)

```ini
APIFY_TOKEN=
# ANTHROPIC_API_KEY is only required when you want to run examples/clientStdioChat.js
ANTHROPIC_API_KEY=

# EVALS
OPENAI_API_KEY=
PHOENIX_API_KEY=
PHOENIX_HOST=
```
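Before running the evals, it is worth confirming these variables are actually set; `validateEnvVars()` in `config.ts` does this for the evals-specific subset. Note that this template names `PHOENIX_HOST`, while the workflow and evals config read `PHOENIX_BASE_URL`. A minimal standalone check (the helper below is hypothetical, not part of the PR):

```typescript
// Hypothetical helper (not in the PR): report which variables are unset or empty.
function findMissingVars(
    names: string[],
    env: Record<string, string | undefined>,
): string[] {
    return names.filter((name) => !env[name]);
}

const required = ['APIFY_TOKEN', 'OPENAI_API_KEY', 'PHOENIX_API_KEY', 'PHOENIX_HOST'];
const missing = findMissingVars(required, process.env);
if (missing.length > 0) {
    console.error(`Missing required environment variables: ${missing.join(', ')}`);
}
```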
### Workflow (`evaluations.yaml`)

```yaml
# This workflow runs MCP tool calling evaluations on master branch merges.
# It evaluates AI models' ability to correctly identify and call MCP tools.

name: MCP tool calling evaluations

on:
  # Run evaluations on master branch merges
  push:
    branches:
      - 'feat/evaluations'
    # paths-ignore:
    #   - '**.md'
    #   - 'docs/**'
    #   - '.gitignore'
    #   - '.dockerignore'
    #   - 'LICENSE'

jobs:
  evaluations:
    name: MCP tool calling evaluations
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Use Node.js 22
        uses: actions/setup-node@v4
        with:
          node-version: 22
          cache: 'npm'
          cache-dependency-path: 'package-lock.json'

      - name: Install Node dependencies
        run: npm ci

      - name: Build project
        run: npm run build

      - name: Run evaluations
        run: npm run evals:run
        env:
          PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
          PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
```
### `README.md`

# MCP Tool Calling Evaluations

TypeScript-based evaluations for the Apify MCP Server using the Arize Phoenix platform.

## Objectives

The MCP server tool-calling evaluation has several key objectives:

1. **Identify problems** in the tool descriptions
2. **Create a test suite** that can be run manually or automatically in CI
3. **Allow for quick iteration** on tool descriptions

## ✍️ Create test cases manually

- **Pros:**
  - Straightforward approach
  - Simple to create test cases for each tool
  - Direct control over test scenarios

- **Cons:**
  - Complicated to create flows (several tool calls in a row)
  - Requires maintenance when the MCP server changes
  - Manual effort for comprehensive coverage

## Test case examples

### Simple tool selection
```
"What are the best Instagram scrapers" → "search-actors"
```

### Multi-step flow
```
User: "Search for the weather MCP server and then add it to available tools"
Expected sequence:
1. search-actors (with input: {"search": "weather mcp", "limit": 5})
2. add-actor (to add the found weather MCP server)
```

## Workflow

The evaluation process has two steps:

1. **Create dataset** (if it does not exist) - Upload test cases to Phoenix
2. **Run evaluation** - Test models against the ground truth

## Quick start

```bash
# 1. Set environment variables
export PHOENIX_BASE_URL="phoenix_base_url"
export PHOENIX_API_KEY="your_key"
export OPENAI_API_KEY="your_key"
export ANTHROPIC_API_KEY="your_key"

# 2. Install dependencies
npm ci

# 3. Create dataset (one-time)
npm run evals:create-dataset

# 4. Run evaluation
npm run evals:run
```

## Files

- `config.ts` - Configuration (models, threshold, Phoenix settings)
- `test-cases.json` - Ground-truth test cases
- `run-evaluation.ts` - Main evaluation script
- `create-dataset.ts` - Upload test cases to Phoenix
- `evaluation_2025.ipynb` - Interactive analysis notebook (Python-based, requires `pip install -e .`)

## Configuration

Key settings in `config.ts`:
- `MODELS_TO_EVALUATE` - Models to test (default: `['gpt-4o-mini', 'claude-3-5-haiku-latest']`)
- `PASS_THRESHOLD` - Accuracy threshold (default: 0.8)
- `DATASET_NAME` - Phoenix dataset name

## Test cases

40+ test cases covering 7 tool categories:
- `fetch-actor-details` - Actor information queries
- `search-actors` - Actor discovery
- `apify-slash-rag-web-browser` - Web browsing
- `search-apify-docs` - Documentation search
- `call-actor` - Actor execution
- `get-actor-output` - Dataset retrieval
- `fetch-apify-docs` - Specific docs fetching

## Results

- **Phoenix Dashboard**: Detailed experiment results
- **Console Output**: Pass/fail status with threshold check
- **Exit Code**: 0 for success, 1 for failure (CI/CD ready)

## Troubleshooting

```bash
# Missing dataset
npm run evals:create-dataset

# Environment issues
# Make sure the .env file exists with the required API keys
```

## Adding test cases

1. Edit `test-cases.json`
2. Run `npm run evals:create-dataset`
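Each ground-truth entry pairs a question with the tool sequence an agent is expected to call. As a hedged sketch of what the `tools_exact_match` check referenced in the commit history could look like (the repo's actual implementation may differ), and of the test-case shape `create-dataset.ts` expects:

```typescript
interface TestCase {
    id: string;
    category: string;
    question: string;
    expectedTools: string[];
}

// Hypothetical example entry in the shape create-dataset.ts expects.
const example: TestCase = {
    id: 'search-001',
    category: 'search-actors',
    question: 'What are the best Instagram scrapers',
    expectedTools: ['search-actors'],
};

// Sketch of an exact-match metric: same tools, in the same order.
function toolsExactMatch(predicted: string[], expected: string[]): boolean {
    return predicted.length === expected.length
        && predicted.every((tool, i) => tool === expected[i]);
}

console.log(toolsExactMatch(['search-actors'], example.expectedTools)); // true
```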
### `config.ts`

```typescript
/**
 * Configuration for Apify MCP Server evaluations.
 */

import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

// Read version from test-cases.json
function getTestCasesVersion(): string {
    const currentFilename = fileURLToPath(import.meta.url);
    const currentDirname = dirname(currentFilename);
    const testCasesPath = join(currentDirname, 'test-cases.json');
    const testCasesContent = readFileSync(testCasesPath, 'utf-8');
    const testCases = JSON.parse(testCasesContent);
    return testCases.version;
}

// Models to evaluate
export const MODELS_TO_EVALUATE = [
    'gpt-4o-mini',
    'claude-3-5-haiku-latest',
];

export const PASS_THRESHOLD = 0.8;

export const DATASET_NAME = `mcp_tool_calling_ground_truth_v${getTestCasesVersion()}`;

// System prompt
export const SYSTEM_PROMPT = 'You are a helpful assistant';

// Tool calling evaluation template
export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant evaluating questions and tool calls to
determine whether the tool called would answer the question. The tool
calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

[BEGIN DATA]
************
[Question]: {question}
************
[Tool Called]: {tool_call}
[END DATA]

Your response must be single word, either "correct" or "incorrect",
and should not contain any text or characters aside from that word.
"incorrect" means that the chosen tool would not answer the question,
the tool includes information that is not presented in the question,
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the question, the tool call generated is runnable and correct,
and that no outside information not present in the question was used
in the generated question.

[Tool Definitions]: {tool_definitions}
`;

// Environment variables
export function getRequiredEnvVars(): Record<string, string | undefined> {
    return {
        PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,
        PHOENIX_API_KEY: process.env.PHOENIX_API_KEY,
        OPENAI_API_KEY: process.env.OPENAI_API_KEY,
        ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY,
    };
}

// Removes newlines and trims whitespace. Useful for Authorization header values
// because CI secrets sometimes include trailing newlines or quotes.
export function sanitizeHeaderValue(value?: string): string | undefined {
    if (value == null) return value;
    return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '');
}

export function validateEnvVars(): boolean {
    const envVars = getRequiredEnvVars();
    const missing = Object.entries(envVars)
        .filter(([, value]) => !value)
        .map(([key]) => key);

    if (missing.length > 0) {
        // eslint-disable-next-line no-console
        console.error(`Missing required environment variables: ${missing.join(', ')}`);
        return false;
    }

    return true;
}
```
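To illustrate what `sanitizeHeaderValue` fixes in practice, here is a standalone demo (the function is copied verbatim from `config.ts` above so the snippet is self-contained):

```typescript
// Copied from config.ts: strips newlines and whitespace, then removes
// wrapping quotes, which CI secret stores sometimes inject into values.
function sanitizeHeaderValue(value?: string): string | undefined {
    if (value == null) return value;
    return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '');
}

console.log(sanitizeHeaderValue('"my-api-key"\n')); // my-api-key
console.log(sanitizeHeaderValue('  token\r\n')); // token
```

Without this, a secret stored as `"my-api-key"\n` would produce a malformed `Authorization: Bearer` header and an opaque 401 from Phoenix.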
### `create-dataset.ts`

```typescript
#!/usr/bin/env tsx
/**
 * One-time script to create a Phoenix dataset from test cases.
 * Run this once to upload test cases to the Phoenix platform and receive a dataset ID.
 */

import { readFileSync } from 'node:fs';
import { dirname as pathDirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { createDataset } from '@arizeai/phoenix-client/datasets';
import dotenv from 'dotenv';

import log from '@apify/log';

import { sanitizeHeaderValue, validateEnvVars } from './config.js';

// Set log level to INFO
log.setLevel(log.LEVELS.INFO);

// Load environment variables from .env file if present
dotenv.config({ path: '.env' });

interface TestCase {
    id: string;
    category: string;
    question: string;
    expectedTools: string[];
}

interface TestData {
    version: string;
    testCases: TestCase[];
}

// eslint-disable-next-line consistent-return
function loadTestCases(): TestData {
    const filename = fileURLToPath(import.meta.url);
    const dirname = pathDirname(filename);
    const testCasesPath = join(dirname, 'test-cases.json');

    try {
        const fileContent = readFileSync(testCasesPath, 'utf-8');
        return JSON.parse(fileContent) as TestData;
    } catch {
        log.error(`Error: Test cases file not found at ${testCasesPath}`);
        process.exit(1);
    }
}

async function createDatasetFromTestCases(): Promise<void> {
    log.info('Creating Phoenix dataset from test cases...');

    // Validate environment variables
    if (!validateEnvVars()) {
        process.exit(1);
    }

    // Load test cases
    const testData = loadTestCases();
    const { testCases } = testData;

    log.info(`Loaded ${testCases.length} test cases`);

    // Convert to the format expected by Phoenix
    const examples = testCases.map((testCase) => ({
        input: { question: testCase.question },
        output: { tool_calls: testCase.expectedTools.join(', ') },
        metadata: { category: testCase.category },
    }));

    // Initialize Phoenix client
    const client = createClient({
        options: {
            baseUrl: process.env.PHOENIX_BASE_URL!,
            headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
        },
    });

    // Upload dataset
    const datasetName = `mcp_tool_calling_ground_truth_v${testData.version}`;

    log.info(`Uploading dataset '${datasetName}' to Phoenix...`);

    try {
        const { datasetId } = await createDataset({
            client,
            name: datasetName,
            description: `MCP tool calling ground truth dataset version ${testData.version}`,
            examples,
        });

        log.info(`Dataset '${datasetName}' created with ID: ${datasetId}`);
    } catch (error) {
        log.error(`Error creating dataset: ${error}`);
        process.exit(1);
    }
}

// Run the script
createDatasetFromTestCases().catch((error) => {
    log.error('Unexpected error:', error);
    process.exit(1);
});
```
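The test-case-to-Phoenix-example mapping in `create-dataset.ts` can be exercised in isolation. The helper below is a hypothetical extraction of that inline `.map()` for illustration; the transform itself (question as input, expected tools joined into one string, category kept as metadata) matches the script above:

```typescript
interface TestCase {
    id: string;
    category: string;
    question: string;
    expectedTools: string[];
}

// Hypothetical standalone version of the transform create-dataset.ts
// applies before upload.
function toPhoenixExamples(testCases: TestCase[]) {
    return testCases.map((testCase) => ({
        input: { question: testCase.question },
        output: { tool_calls: testCase.expectedTools.join(', ') },
        metadata: { category: testCase.category },
    }));
}

const [example] = toPhoenixExamples([{
    id: 'flow-001',
    category: 'call-actor',
    question: 'Search for the weather MCP server and then add it to available tools',
    expectedTools: ['search-actors', 'add-actor'],
}]);
console.log(example.output.tool_calls); // search-actors, add-actor
```

Joining the tool sequence into a single comma-separated string keeps the ground truth directly comparable to the judge's `{tool_call}` template slot.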