Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion scripts/accuracy/generateTestSummary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
return toolCalls
.map((call) => {
const params = JSON.stringify(call.parameters, null, 2);
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${call.toolName}</span>`;
const isOptional = "optional" in call && call.optional;
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}</span>`;
})
.join(", ");
}
Expand Down
5 changes: 5 additions & 0 deletions tests/accuracy/createCollection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ describeAccuracyTests([
{
prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
expectedToolCalls: [
{
toolName: "list-databases",
parameters: {},
optional: true,
},
{
toolName: "list-collections",
parameters: {
Expand Down
12 changes: 12 additions & 0 deletions tests/accuracy/dropCollection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ describeAccuracyTests([
{
prompt: "Remove mflix.movies namespace from my cluster.",
expectedToolCalls: [
{
toolName: "list-databases",
parameters: {},
optional: true,
},
{
toolName: "list-collections",
parameters: {
database: "mflix",
},
optional: true,
},
{
toolName: "drop-collection",
parameters: {
Expand Down
8 changes: 7 additions & 1 deletion tests/accuracy/find.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ describeAccuracyTests([
limit: Matcher.anyValue,
sort: Matcher.anyValue,
},
optional: true,
},
{
toolName: "export",
Expand All @@ -137,14 +138,19 @@ describeAccuracyTests([
arguments: Matcher.anyOf(
Matcher.emptyObjectOrUndefined,
Matcher.value({
filter: Matcher.anyValue,
filter: Matcher.emptyObjectOrUndefined,
projection: Matcher.anyValue,
limit: Matcher.anyValue,
sort: Matcher.anyValue,
})
),
},
],
jsonExportFormat: Matcher.anyOf(
Matcher.undefined,
Matcher.value("relaxed"),
Matcher.value("canonical")
),
},
},
],
Expand Down
72 changes: 21 additions & 51 deletions tests/accuracy/getPerformanceAdvisor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,27 @@ const mockedTools = {
},
};

// Shared prefix of expected tool calls for the performance advisor accuracy
// tests: listing projects, then listing the clusters of project "mflix".
// Both entries carry `optional: true`.
const listProjectsAndClustersToolCalls = [
    { toolName: "atlas-list-projects", parameters: {}, optional: true },
    { toolName: "atlas-list-clusters", parameters: { projectId: "mflix" }, optional: true },
];

describeAccuracyTests([
// Test for Suggested Indexes operation
{
prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
expectedToolCalls: [
{
toolName: "atlas-list-projects",
parameters: {},
},
{
toolName: "atlas-list-clusters",
parameters: {
projectId: "mflix",
},
},
...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
Expand All @@ -65,16 +71,7 @@ describeAccuracyTests([
{
prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
{
toolName: "atlas-list-projects",
parameters: {},
},
{
toolName: "atlas-list-clusters",
parameters: {
projectId: "mflix",
},
},
...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
Expand All @@ -88,18 +85,9 @@ describeAccuracyTests([
},
// Test for Slow Query Logs operation
{
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025 (a date that is certainly in the past!).",
expectedToolCalls: [
{
toolName: "atlas-list-projects",
parameters: {},
},
{
toolName: "atlas-list-clusters",
parameters: {
projectId: "mflix",
},
},
...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
Expand All @@ -117,16 +105,7 @@ describeAccuracyTests([
{
prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
{
toolName: "atlas-list-projects",
parameters: {},
},
{
toolName: "atlas-list-clusters",
parameters: {
projectId: "mflix",
},
},
...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
Expand All @@ -142,16 +121,7 @@ describeAccuracyTests([
{
prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
{
toolName: "atlas-list-projects",
parameters: {},
},
{
toolName: "atlas-list-clusters",
parameters: {
projectId: "mflix",
},
},
...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
Expand Down
4 changes: 3 additions & 1 deletion tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ export interface LLMToolCall {
parameters: Record<string, unknown>;
}

export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
/**
 * A tool call that an accuracy test expects the model to make: an
 * `LLMToolCall` without its `toolCallId`, plus an optional flag.
 */
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
    /** Set to `true` to mark the expected call as optional; when omitted the call is not flagged as optional. */
    optional?: boolean;
};

export const AccuracyRunStatus = {
Done: "done",
Expand Down
13 changes: 8 additions & 5 deletions tests/accuracy/sdk/accuracyScorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
.sort((a, b) => b.score - a.score || a.index - b.index);

const bestMatch = candidates[0];
if (!bestMatch || bestMatch.score === 0) {
return 0; // No matching tool call found, return 0
if (bestMatch) {
checkedActualToolCallIndexes.add(bestMatch.index);
currentScore = Math.min(currentScore, bestMatch.score);
} else if (expectedCall.optional) {
// Optional expected tool call not found, but it's okay, continue
continue;
} else {
return 0; // Required expected tool call not found, return 0
}

checkedActualToolCallIndexes.add(bestMatch.index);
currentScore = Math.min(currentScore, bestMatch.score);
}

return currentScore;
Expand Down
Loading