Merged
Commits (47)
0af403f fix: add evals (jirispilka, Oct 13, 2025)
a7306e0 fix: add evals (jirispilka, Oct 13, 2025)
43e8b5a fix: add evals (jirispilka, Oct 13, 2025)
fd232da fix: add docs (jirispilka, Oct 13, 2025)
21444a3 fix: update evaluations (jirispilka, Oct 13, 2025)
bc32f04 fix: lint (jirispilka, Oct 13, 2025)
1a0da38 fix: env variables (jirispilka, Oct 13, 2025)
14d4cc5 fix: fix uv (jirispilka, Oct 13, 2025)
20eda85 fix: fix uv (jirispilka, Oct 13, 2025)
c688ecb fix: fix uv (jirispilka, Oct 13, 2025)
64bce1e fix: Update results (jirispilka, Oct 13, 2025)
cf1054b feat: Add typescript code (jirispilka, Oct 14, 2025)
54244b9 feat: Add run-evaluation.ts (jirispilka, Oct 14, 2025)
c9d9aeb fix: lint (jirispilka, Oct 14, 2025)
5194a4a fix: update create-dataset.ts with logs (jirispilka, Oct 14, 2025)
f2619f2 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
1353e8b fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
e766cab fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
8764c63 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
fbbe4c8 fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
45db30d fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
483ffd5 fix: update documentation (jirispilka, Oct 14, 2025)
120fc18 fix: update tsconfig.json (jirispilka, Oct 14, 2025)
dcb5e61 fix: update evaluations.yaml (jirispilka, Oct 14, 2025)
5a16f53 fix: add PHOENIX_BASE_URL (jirispilka, Oct 14, 2025)
40bdeb4 fix: run-again (jirispilka, Oct 14, 2025)
c608779 fix: add debug log (jirispilka, Oct 14, 2025)
a3f04aa fix: add function to sanitize headers (jirispilka, Oct 14, 2025)
abe73f5 fix: evaluation and lint (jirispilka, Oct 14, 2025)
e40feb1 fix: update tools_exact_match (jirispilka, Oct 16, 2025)
5ed9eb8 fix: update run-evaluation.ts with llm as judge (jirispilka, Oct 16, 2025)
86c45df fix: update logs and rename for clarity (jirispilka, Oct 16, 2025)
4b3db00 fix: update prompt (jirispilka, Oct 16, 2025)
e27f07b fix: improve evaluation results (jirispilka, Oct 16, 2025)
a763934 fix: Use openrouter as llm judge (jirispilka, Oct 16, 2025)
529c334 fix: Organize packages (jirispilka, Oct 16, 2025)
1fedb52 Clean up Python cache files and update .gitignore (jirispilka, Oct 16, 2025)
0684a2a Remove unnecessary __init__.py from evals directory (jirispilka, Oct 16, 2025)
0cd6642 fix: evals ci (jirispilka, Oct 16, 2025)
cb26e7e fix: create dataset (jirispilka, Oct 16, 2025)
00ca260 fix: decrease threshold to get green light (jirispilka, Oct 16, 2025)
f1e9256 fix: fix eslint config (jirispilka, Oct 16, 2025)
9db9c24 fix: update README.md (jirispilka, Oct 16, 2025)
d118273 fix: run on push to master or evals tag (jirispilka, Oct 16, 2025)
4794ca4 fix: run on push to master or validated tag (jirispilka, Oct 16, 2025)
82df1ef fix: value interpolation in the template! It was not working and fail… (jirispilka, Oct 16, 2025)
ab61d1b fix: minor changes and a couple of more test cases (jirispilka, Oct 16, 2025)
9 changes: 7 additions & 2 deletions .env.example
@@ -1,3 +1,8 @@
APIFY_TOKEN=
# ANTHROPIC_API_KEY is only required when you want to run examples/clientStdioChat.js
ANTHROPIC_API_KEY=

# EVALS
PHOENIX_API_KEY=
PHOENIX_BASE_URL=

OPENROUTER_API_KEY=
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
46 changes: 46 additions & 0 deletions .github/workflows/evaluations.yaml
@@ -0,0 +1,46 @@
# This workflow runs MCP tool calling evaluations on master branch merges
# It evaluates AI models' ability to correctly identify and call MCP tools.

name: MCP tool calling evaluations

on:
# Run evaluations on master branch merges
push:
branches:
- 'master'
# Also run on PRs with the 'validated' label for testing
pull_request:
types: [labeled, synchronize, reopened]

jobs:
evaluations:
name: MCP tool calling evaluations
runs-on: ubuntu-latest
# Run on master pushes or PRs with the 'validated' label
if: github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'validated')

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Use Node.js 22
uses: actions/setup-node@v4
with:
node-version: 22
cache: 'npm'
cache-dependency-path: 'package-lock.json'

- name: Install Node dependencies
run: npm ci --include=dev

- name: Build project
run: npm run build

- name: Run evaluations
run: npm run evals:run
env:
GITHUB_PR_NUMBER: ${{ github.event_name == 'pull_request' && github.event.number || 'master' }}
PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
7 changes: 7 additions & 0 deletions .gitignore
@@ -28,3 +28,10 @@ key.pem

# Ignore MCP config for Opencode client
opencode.json

# Python cache files
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
2 changes: 1 addition & 1 deletion eslint.config.mjs
@@ -2,7 +2,7 @@ import apifyTypeScriptConfig from '@apify/eslint-config/ts.js';

// eslint-disable-next-line import/no-default-export
export default [
-{ ignores: ['**/dist'] }, // Ignores need to happen first
+{ ignores: ['**/dist', '**/.venv', 'evals/**'] }, // Ignores need to happen first
...apifyTypeScriptConfig,
{
languageOptions: {
64 changes: 64 additions & 0 deletions evals/README.md
@@ -0,0 +1,64 @@
# MCP tool selection evaluation

Evaluates MCP server tool selection. Phoenix is used only for storing results and visualization.

## CI Workflow

The evaluation workflow runs automatically on:
- **Master branch pushes** - for production evaluations (saves CI cycles)
- **PRs with `validated` label** - for testing evaluation changes before merging

To trigger evaluations on a PR, add the `validated` label to your pull request.

## Two evaluation methods

1. **Exact match** (`tool-exact-match`) - binary tool-name validation (see the sketch below)
2. **LLM judge** (`tool-selection-llm`) - Phoenix classifier with a structured prompt
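
Conceptually, the exact-match check reduces to a set comparison. A minimal sketch (illustrative only; the real check lives in `run-evaluation.ts`):

```typescript
// Sketch of the binary exact-match evaluator (not the actual implementation):
// score 1.0 only when the called tools equal the expected tools as a set.
function toolsExactMatch(expectedTools: string[], calledTools: string[]): number {
    const expected = new Set(expectedTools);
    const called = new Set(calledTools);
    if (expected.size !== called.size) return 0;
    for (const tool of expected) {
        if (!called.has(tool)) return 0;
    }
    return 1;
}
```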

## Why OpenRouter?

Unified API for Gemini, Claude, and GPT; no separate per-provider integrations needed.
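
Because OpenRouter exposes an OpenAI-compatible API, a single client covers every evaluated model. A sketch, assuming the `openai` npm package:

```typescript
import OpenAI from 'openai';

// One OpenAI-compatible client; OpenRouter routes requests by the
// vendor-prefixed model id (e.g. 'anthropic/claude-3.5-haiku').
const client = new OpenAI({
    baseURL: process.env.OPENROUTER_BASE_URL, // https://openrouter.ai/api/v1
    apiKey: process.env.OPENROUTER_API_KEY,
});

const response = await client.chat.completions.create({
    model: 'anthropic/claude-3.5-haiku',
    messages: [{ role: 'user', content: 'Which Actor scrapes Instagram profiles?' }],
});
```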

## Judge model

- model: `openai/gpt-4o-mini`
- prompt: structured eval with context + tool definitions
- output: "correct"/"incorrect" → 1.0/0.0 score (and explanation)
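
Conceptually, the judge step fills the `{{...}}` placeholders in `TOOL_CALLING_BASE_TEMPLATE` (see `config.ts`) and maps the returned label to a score. A hedged sketch; both helper names are illustrative, not the real API:

```typescript
// Illustrative: substitute {{placeholder}} tokens in the judge template.
function fillTemplate(template: string, values: Record<string, string>): string {
    return template.replace(/\{\{(\w+)\}\}/g, (_match, key: string) => values[key] ?? '');
}

// Illustrative: convert the judge's one-word decision to the 1.0/0.0 score.
function judgeLabelToScore(label: string): number {
    return label.trim().toLowerCase() === 'correct' ? 1.0 : 0.0;
}
```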

## Config (`config.ts`)

```typescript
MODELS_TO_EVALUATE = ['openai/gpt-4o-mini', 'anthropic/claude-3.5-haiku', 'google/gemini-2.5-flash']
PASS_THRESHOLD = 0.6
TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini'
```

## Setup

```bash
export PHOENIX_BASE_URL="your_url"
export PHOENIX_API_KEY="your_key"
export OPENROUTER_API_KEY="your_key"
export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

npm ci
npm run evals:create-dataset # one-time
npm run evals:run
```

## Test cases

40+ cases across 7 tool categories: `fetch-actor-details`, `search-actors`, `apify-slash-rag-web-browser`, `search-apify-docs`, `call-actor`, `get-actor-output`, `fetch-apify-docs`
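
Each entry follows the `TestCase` shape consumed by `create-dataset.ts`; the values below are made up for illustration:

```typescript
// Hypothetical test case; field names mirror the TestCase interface
// in create-dataset.ts, all values are illustrative.
const exampleCase = {
    id: 'search-actors-001',
    category: 'search-actors',
    query: 'Find an Actor that can scrape Instagram profiles',
    expectedTools: ['search-actors'],
    reference: 'The agent should search the Apify Store rather than call an Actor.',
};
```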

## Output

- Phoenix dashboard with detailed results
- console: pass/fail per model + evaluator
- exit code: 0 = success, 1 = failure
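
The exit code reduces to a threshold gate over mean scores, roughly as below (`EvalResult` is an assumed shape, not the real type):

```typescript
import { PASS_THRESHOLD } from './config.js';

// Assumed result shape, for illustration only.
interface EvalResult { model: string; evaluator: string; meanScore: number }

// Exit 0 only when every model/evaluator pair clears PASS_THRESHOLD (0.6).
function exitCodeFor(results: EvalResult[]): number {
    return results.every((r) => r.meanScore >= PASS_THRESHOLD) ? 0 : 1;
}
```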

## Updating test cases

To add or modify test cases:
1. edit `test-cases.json`
2. run `npm run evals:create-dataset` to update Phoenix dataset
3. run `npm run evals:run` to test changes
114 changes: 114 additions & 0 deletions evals/config.ts
@@ -0,0 +1,114 @@
/**
* Configuration for Apify MCP Server evaluations.
*/

import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

// Read version from test-cases.json
function getTestCasesVersion(): string {
const currentFilename = fileURLToPath(import.meta.url);
const currentDirname = dirname(currentFilename);
const testCasesPath = join(currentDirname, 'test-cases.json');
const testCasesContent = readFileSync(testCasesPath, 'utf-8');
const testCases = JSON.parse(testCasesContent);
return testCases.version;
}

// Evaluator names
export const EVALUATOR_NAMES = {
TOOLS_EXACT_MATCH: 'tool-exact-match',
TOOL_SELECTION_LLM: 'tool-selection-llm',
} as const;

export type EvaluatorName = typeof EVALUATOR_NAMES[keyof typeof EVALUATOR_NAMES];

// Models to evaluate
export const MODELS_TO_EVALUATE = [
'openai/gpt-4o-mini',
'anthropic/claude-3.5-haiku',
'google/gemini-2.5-flash',
];

export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4o-mini';

export const PASS_THRESHOLD = 0.6;

export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;

// System prompt
export const SYSTEM_PROMPT = 'You are a helpful assistant';

export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant evaluating user queries and tool calls to
determine whether a tool was chosen and whether it was the right tool.

The tool calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

[BEGIN DATA]
************
{{context}}
{{query}}
************
{{tool_calls}}
{{llm_response}}
************
[END DATA]

DECISION: [correct or incorrect]
EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]

The DECISION must be a single word, either "correct" or "incorrect",
and must not contain any text or characters aside from that word.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the query, the tool call generated is runnable and correct,
and that no outside information not present in the query was used
in the generated query.

"incorrect" means that the chosen tool was not correct
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

You must not use any outside information or make assumptions.
Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
the [Tool Definitions], and the [Reference instructions] (if provided).
Reference instructions are optional and are intended to help you understand the use case and make your decision.

{{reference}}

{{tool_definitions}}
`;

export function getRequiredEnvVars(): Record<string, string | undefined> {
return {
PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,
PHOENIX_API_KEY: process.env.PHOENIX_API_KEY,
OPENROUTER_API_KEY: process.env.OPENROUTER_API_KEY,
OPENROUTER_BASE_URL: process.env.OPENROUTER_BASE_URL,
};
}

// Removes newlines and trims whitespace. Useful for Authorization header values
// because CI secrets sometimes include trailing newlines or quotes.
export function sanitizeHeaderValue(value?: string): string | undefined {
if (value == null) return value;
return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '');
}

export function validateEnvVars(): boolean {
const envVars = getRequiredEnvVars();
const missing = Object.entries(envVars)
.filter(([, value]) => !value)
.map(([key]) => key);

if (missing.length > 0) {
// eslint-disable-next-line no-console
console.error(`Missing required environment variables: ${missing.join(', ')}`);
return false;
}

return true;
}
108 changes: 108 additions & 0 deletions evals/create-dataset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env tsx
/**
 * One-time script to create a Phoenix dataset from test cases.
 * Run it once to upload the test cases to the Phoenix platform and receive a dataset ID.
*/

import { readFileSync } from 'node:fs';
import { dirname as pathDirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { createDataset } from '@arizeai/phoenix-client/datasets';
import dotenv from 'dotenv';

import log from '@apify/log';

import { sanitizeHeaderValue, validateEnvVars } from './config.js';

// Set log level to info
log.setLevel(log.LEVELS.INFO);

// Load environment variables from .env file if present
dotenv.config({ path: '.env' });

interface TestCase {
id: string;
category: string;
query: string;
context?: string;
expectedTools?: string[];
reference?: string;
}

interface TestData {
version: string;
testCases: TestCase[];
}

// eslint-disable-next-line consistent-return
function loadTestCases(): TestData {
const filename = fileURLToPath(import.meta.url);
const dirname = pathDirname(filename);
const testCasesPath = join(dirname, 'test-cases.json');

try {
const fileContent = readFileSync(testCasesPath, 'utf-8');
return JSON.parse(fileContent) as TestData;
} catch (err) {
log.error(`Failed to load test cases from ${testCasesPath}: ${err}`);
process.exit(1);
}
}

async function createDatasetFromTestCases(): Promise<void> {
log.info('Creating Phoenix dataset from test cases...');

// Validate environment variables
if (!validateEnvVars()) {
process.exit(1);
}

// Load test cases
const testData = loadTestCases();
const { testCases } = testData;

log.info(`Loaded ${testCases.length} test cases`);

// Convert to format expected by Phoenix
const examples = testCases.map((testCase) => ({
input: { query: testCase.query },
output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
metadata: { category: testCase.category },
}));

// Initialize Phoenix client
const client = createClient({
options: {
baseUrl: process.env.PHOENIX_BASE_URL!,
headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
},
});

// Upload dataset
const datasetName = `mcp_server_dataset_v${testData.version}`;

log.info(`Uploading dataset '${datasetName}' to Phoenix...`);

try {
const { datasetId } = await createDataset({
client,
name: datasetName,
description: `MCP server dataset: version ${testData.version}`,
examples,
});

log.info(`Dataset '${datasetName}' created with ID: ${datasetId}`);
} catch (error) {
log.error(`Error creating dataset: ${error}`);
process.exit(1);
}
}

// Run the script
createDatasetFromTestCases().catch((error) => {
log.error(`Unexpected error: ${error}`);
process.exit(1);
});