Merged

47 commits
0af403f  fix: add evals (jirispilka, Oct 13, 2025)
a7306e0  fix: add evals (jirispilka, Oct 13, 2025)
43e8b5a  fix: add evals (jirispilka, Oct 13, 2025)
fd232da  fix: add docs (jirispilka, Oct 13, 2025)
21444a3  fix: update evaluations (jirispilka, Oct 13, 2025)
bc32f04  fix: lint (jirispilka, Oct 13, 2025)
1a0da38  fix: env variables (jirispilka, Oct 13, 2025)
14d4cc5  fix: fix uv (jirispilka, Oct 13, 2025)
20eda85  fix: fix uv (jirispilka, Oct 13, 2025)
c688ecb  fix: fix uv (jirispilka, Oct 13, 2025)
64bce1e  fix: Update results (jirispilka, Oct 13, 2025)
cf1054b  feat: Add typescript code (jirispilka, Oct 14, 2025)
54244b9  feat: Add run-evaluation.ts (jirispilka, Oct 14, 2025)
c9d9aeb  fix: lint (jirispilka, Oct 14, 2025)
5194a4a  fix: update create-dataset.ts with logs (jirispilka, Oct 14, 2025)
f2619f2  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
1353e8b  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
e766cab  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
8764c63  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
fbbe4c8  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
45db30d  fix: update run-evaluation.ts (jirispilka, Oct 14, 2025)
483ffd5  fix: update documentation (jirispilka, Oct 14, 2025)
120fc18  fix: update tsconfig.json (jirispilka, Oct 14, 2025)
dcb5e61  fix: update evaluations.yaml (jirispilka, Oct 14, 2025)
5a16f53  fix: add PHOENIX_BASE_URL (jirispilka, Oct 14, 2025)
40bdeb4  fix: run-again (jirispilka, Oct 14, 2025)
c608779  fix: add debug log (jirispilka, Oct 14, 2025)
a3f04aa  fix: add function to sanitize headers (jirispilka, Oct 14, 2025)
abe73f5  fix: evaluation and lint (jirispilka, Oct 14, 2025)
e40feb1  fix: update tools_exact_match (jirispilka, Oct 16, 2025)
5ed9eb8  fix: update run-evaluation.ts with llm as judge (jirispilka, Oct 16, 2025)
86c45df  fix: update logs and rename for clarity (jirispilka, Oct 16, 2025)
4b3db00  fix: update prompt (jirispilka, Oct 16, 2025)
e27f07b  fix: improve evaluation results (jirispilka, Oct 16, 2025)
a763934  fix: Use openrouter as llm judge (jirispilka, Oct 16, 2025)
529c334  fix: Organize packages (jirispilka, Oct 16, 2025)
1fedb52  Clean up Python cache files and update .gitignore (jirispilka, Oct 16, 2025)
0684a2a  Remove unnecessary __init__.py from evals directory (jirispilka, Oct 16, 2025)
0cd6642  fix: evals ci (jirispilka, Oct 16, 2025)
cb26e7e  fix: create dataset (jirispilka, Oct 16, 2025)
00ca260  fix: decrease threshold to get green light (jirispilka, Oct 16, 2025)
f1e9256  fix: fix eslint config (jirispilka, Oct 16, 2025)
9db9c24  fix: update README.md (jirispilka, Oct 16, 2025)
d118273  fix: run on push to master or evals tag (jirispilka, Oct 16, 2025)
4794ca4  fix: run on push to master or validated tag (jirispilka, Oct 16, 2025)
82df1ef  fix: value interpolation in the template! It was not working and fail… (jirispilka, Oct 16, 2025)
ab61d1b  fix: minor changes and a couple of more test cases (jirispilka, Oct 16, 2025)

6 changes: 6 additions & 0 deletions .env.example
@@ -1,3 +1,9 @@
APIFY_TOKEN=
# ANTHROPIC_API_KEY is only required when you want to run examples/clientStdioChat.js
ANTHROPIC_API_KEY=

# EVALS
OPENAI_API_KEY=
PHOENIX_API_KEY=
PHOENIX_HOST=

46 changes: 46 additions & 0 deletions .github/workflows/evaluations.yaml
@@ -0,0 +1,46 @@
# This workflow runs MCP tool calling evaluations on master branch merges
# It evaluates AI models' ability to correctly identify and call MCP tools.

name: MCP tool calling evaluations

on:
  # Run evaluations on master branch merges
  push:
    branches:
      - 'feat/evaluations'
    # paths-ignore:
    #   - '**.md'
    #   - 'docs/**'
    #   - '.gitignore'
    #   - '.dockerignore'
    #   - 'LICENSE'

jobs:
  evaluations:
    name: MCP tool calling evaluations
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Use Node.js 22
        uses: actions/setup-node@v4
        with:
          node-version: 22
          cache: 'npm'
          cache-dependency-path: 'package-lock.json'

      - name: Install Node dependencies
        run: npm ci

      - name: Build project
        run: npm run build

      - name: Run evaluations
        run: npm run evals:run
        env:
          PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }}
          PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
2 changes: 1 addition & 1 deletion eslint.config.mjs
@@ -2,7 +2,7 @@ import apifyTypeScriptConfig from '@apify/eslint-config/ts.js';

// eslint-disable-next-line import/no-default-export
export default [
{ ignores: ['**/dist'] }, // Ignores need to happen first
{ ignores: ['**/dist', '**/.venv', 'evals/**'] }, // Ignores need to happen first
...apifyTypeScriptConfig,
{
languageOptions: {
111 changes: 111 additions & 0 deletions evals/README.md
@@ -0,0 +1,111 @@
# MCP Tool Calling Evaluations

TypeScript-based evaluations for the Apify MCP Server, built on the Arize Phoenix platform.

## Objectives

The MCP server tool-calling evaluation has several key objectives:

1. **Identify problems** in the description of the tools
2. **Create a test suite** that can be run manually or automatically in CI
3. **Allow for quick iteration** on tool descriptions

## ✍️ Create test cases manually

- **Pros:**
    - Straightforward approach
    - Simple to create test cases for each tool
    - Direct control over test scenarios

- **Cons:**
    - Complicated to create flows (several tool calls in a row)
    - Requires maintenance when MCP server changes
    - Manual effort for comprehensive coverage

## Test case examples

### Simple tool selection
```
"What are the best Instagram scrapers" → "search-actors"
```

### Multi-step flow
```
User: "Search for the weather MCP server and then add it to available tools"
Expected sequence:
1. search-actors (with input: {"search": "weather mcp", "limit": 5})
2. add-actor (to add the found weather MCP server)
```
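
In `test-cases.json`, a case like the multi-step flow above could be encoded roughly as shown below. This is a minimal sketch: the `question` and `expectedTools` fields mirror the `TestCase` interface used by `create-dataset.ts`, while the `version`, `id`, and `category` values here are purely illustrative.

```json
{
  "version": "1",
  "testCases": [
    {
      "id": "weather-mcp-search-and-add",
      "category": "search-actors",
      "question": "Search for the weather MCP server and then add it to available tools",
      "expectedTools": ["search-actors", "add-actor"]
    }
  ]
}
```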

## Workflow

The evaluation process has two steps:

1. **Create dataset** (if it does not already exist) - Upload test cases to Phoenix
2. **Run evaluation** - Test models against ground truth

## Quick start

```bash
# 1. Set environment variables
export PHOENIX_BASE_URL="phoenix_base_url"
export PHOENIX_API_KEY="your_key"
export OPENAI_API_KEY="your_key"
export ANTHROPIC_API_KEY="your_key"

# 2. Install dependencies
npm ci

# 3. Create dataset (one-time)
npm run evals:create-dataset

# 4. Run evaluation
npm run evals:run
```

## Files

- `config.ts` - Configuration (models, threshold, Phoenix settings)
- `test-cases.json` - Ground truth test cases
- `run-evaluation.ts` - Main evaluation script
- `create-dataset.ts` - Upload test cases to Phoenix
- `evaluation_2025.ipynb` - Interactive analysis notebook (Python-based, requires `pip install -e .`)

## Configuration

Key settings in `config.ts`:
- `MODELS_TO_EVALUATE` - Models to test (default: `['gpt-4o-mini', 'claude-3-5-haiku-latest']`)
- `PASS_THRESHOLD` - Accuracy threshold (default: 0.8)
- `DATASET_NAME` - Phoenix dataset name (derived from the `version` field in `test-cases.json`; see the snippet below)
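
For reference, the corresponding defaults in `config.ts` are:

```ts
export const MODELS_TO_EVALUATE = [
    'gpt-4o-mini',
    'claude-3-5-haiku-latest',
];

export const PASS_THRESHOLD = 0.8;

// Derived from the version field in test-cases.json
export const DATASET_NAME = `mcp_tool_calling_ground_truth_v${getTestCasesVersion()}`;
```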

## Test cases

40+ test cases covering 7 tool categories:
- `fetch-actor-details` - Actor information queries
- `search-actors` - Actor discovery
- `apify-slash-rag-web-browser` - Web browsing
- `search-apify-docs` - Documentation search
- `call-actor` - Actor execution
- `get-actor-output` - Dataset retrieval
- `fetch-apify-docs` - Specific docs fetching

## Results

- **Phoenix Dashboard**: Detailed experiment results
- **Console Output**: Pass/fail status with threshold check
- **Exit Code**: 0 for success, 1 for failure (CI/CD ready)

## Troubleshooting

```bash
# Missing dataset
npm run evals:create-dataset

# Environment issues
# Make sure .env file exists with required API keys
```

## Adding test cases

1. Edit `test-cases.json`, following the same structure as the existing entries (see the example sketch above)
2. Run `npm run evals:create-dataset` to upload the updated dataset to Phoenix
Empty file added evals/__init__.py
Binary file added evals/__pycache__/config.cpython-312.pyc
Binary file added evals/__pycache__/run_evaluation.cpython-311.pyc
92 changes: 92 additions & 0 deletions evals/config.ts
@@ -0,0 +1,92 @@
/**
 * Configuration for Apify MCP Server evaluations.
 */

import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

// Read version from test-cases.json
function getTestCasesVersion(): string {
    const currentFilename = fileURLToPath(import.meta.url);
    const currentDirname = dirname(currentFilename);
    const testCasesPath = join(currentDirname, 'test-cases.json');
    const testCasesContent = readFileSync(testCasesPath, 'utf-8');
    const testCases = JSON.parse(testCasesContent);
    return testCases.version;
}

// Models to evaluate
export const MODELS_TO_EVALUATE = [
    'gpt-4o-mini',
    'claude-3-5-haiku-latest',
];

export const PASS_THRESHOLD = 0.8;

export const DATASET_NAME = `mcp_tool_calling_ground_truth_v${getTestCasesVersion()}`;

// System prompt
export const SYSTEM_PROMPT = 'You are a helpful assistant';

// Tool calling evaluation template
export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant evaluating questions and tool calls to
determine whether the tool called would answer the question. The tool
calls have been generated by a separate agent, and chosen from the list of
tools provided below. It is your job to decide whether that agent chose
the right tool to call.

[BEGIN DATA]
************
[Question]: {question}
************
[Tool Called]: {tool_call}
[END DATA]

Your response must be single word, either "correct" or "incorrect",
and should not contain any text or characters aside from that word.
"incorrect" means that the chosen tool would not answer the question,
the tool includes information that is not presented in the question,
or that the tool signature includes parameter values that don't match
the formats specified in the tool signatures below.

"correct" means the correct tool call was chosen, the correct parameters
were extracted from the question, the tool call generated is runnable and correct,
and that no outside information not present in the question was used
in the generated question.

[Tool Definitions]: {tool_definitions}
`;

// Environment variables
export function getRequiredEnvVars(): Record<string, string | undefined> {
    return {
        PHOENIX_BASE_URL: process.env.PHOENIX_BASE_URL,
        PHOENIX_API_KEY: process.env.PHOENIX_API_KEY,
        OPENAI_API_KEY: process.env.OPENAI_API_KEY,
        ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY,
    };
}

// Removes newlines and trims whitespace. Useful for Authorization header values
// because CI secrets sometimes include trailing newlines or quotes.
export function sanitizeHeaderValue(value?: string): string | undefined {
    if (value == null) return value;
    return value.replace(/[\r\n]/g, '').trim().replace(/^"|"$/g, '');
}

export function validateEnvVars(): boolean {
    const envVars = getRequiredEnvVars();
    const missing = Object.entries(envVars)
        .filter(([, value]) => !value)
        .map(([key]) => key);

    if (missing.length > 0) {
        // eslint-disable-next-line no-console
        console.error(`Missing required environment variables: ${missing.join(', ')}`);
        return false;
    }

    return true;
}
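
Note on the template above: the `{question}`, `{tool_call}`, and `{tool_definitions}` placeholders in `TOOL_CALLING_BASE_TEMPLATE` are intended to be filled in at evaluation time (in `run-evaluation.ts`, which is not shown in this section). A minimal interpolation sketch follows; the `fillTemplate` helper is an assumption for illustration only, not code from this PR.

```ts
// Hypothetical helper (illustrative only): fills {placeholder} slots in a template string.
function fillTemplate(template: string, values: Record<string, string>): string {
    return template.replace(/\{(\w+)\}/g, (match, key) => values[key] ?? match);
}

// Example usage with the template and an evaluated tool call:
// const prompt = fillTemplate(TOOL_CALLING_BASE_TEMPLATE, {
//     question: 'What are the best Instagram scrapers',
//     tool_call: 'search-actors',
//     tool_definitions: JSON.stringify(toolDefinitions),
// });
```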
106 changes: 106 additions & 0 deletions evals/create-dataset.ts
@@ -0,0 +1,106 @@
#!/usr/bin/env tsx
/**
 * One-time script to create Phoenix dataset from test cases.
 * Run this once to upload test cases to the Phoenix platform and receive a dataset ID.
 */

import { readFileSync } from 'node:fs';
import { dirname as pathDirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { createDataset } from '@arizeai/phoenix-client/datasets';
import dotenv from 'dotenv';

import log from '@apify/log';

import { sanitizeHeaderValue, validateEnvVars } from './config.js';

// Set log level to info
log.setLevel(log.LEVELS.INFO);

// Load environment variables from .env file if present
dotenv.config({ path: '.env' });

interface TestCase {
    id: string;
    category: string;
    question: string;
    expectedTools: string[];
}

interface TestData {
    version: string;
    testCases: TestCase[];
}

// eslint-disable-next-line consistent-return
function loadTestCases(): TestData {
    const filename = fileURLToPath(import.meta.url);
    const dirname = pathDirname(filename);
    const testCasesPath = join(dirname, 'test-cases.json');

    try {
        const fileContent = readFileSync(testCasesPath, 'utf-8');
        return JSON.parse(fileContent) as TestData;
    } catch {
        log.error(`Error: Test cases file not found at ${testCasesPath}`);
        process.exit(1);
    }
}

async function createDatasetFromTestCases(): Promise<void> {
    log.info('Creating Phoenix dataset from test cases...');

    // Validate environment variables
    if (!validateEnvVars()) {
        process.exit(1);
    }

    // Load test cases
    const testData = loadTestCases();
    const { testCases } = testData;

    log.info(`Loaded ${testCases.length} test cases`);

    // Convert to format expected by Phoenix
    const examples = testCases.map((testCase) => ({
        input: { question: testCase.question },
        output: { tool_calls: testCase.expectedTools.join(', ') },
        metadata: { category: testCase.category },
    }));

    // Initialize Phoenix client
    const client = createClient({
        options: {
            baseUrl: process.env.PHOENIX_BASE_URL!,
            headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
        },
    });

    // Upload dataset
    const datasetName = `mcp_tool_calling_ground_truth_v${testData.version}`;

    log.info(`Uploading dataset '${datasetName}' to Phoenix...`);

    try {
        const { datasetId } = await createDataset({
            client,
            name: datasetName,
            description: `MCP tool calling ground truth dataset version ${testData.version}`,
            examples,
        });

        log.info(`Dataset '${datasetName}' created with ID: ${datasetId}`);
    } catch (error) {
        log.error(`Error creating dataset: ${error}`);
        process.exit(1);
    }
}

// Run the script
createDatasetFromTestCases().catch((error) => {
    log.error('Unexpected error:', error);
    process.exit(1);
});