Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Initialize CodeQL
uses: github/codeql-action/init@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/init@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3
with:
languages: ${{ matrix.language }}

- name: Setup .NET
if: matrix.language == 'csharp'
uses: actions/setup-dotnet@67a3573c9a986a3f9c594539f4ab511d57bb3ce9 # v4
uses: actions/setup-dotnet@baa11fbfe1d6520db94683bd5c7a3818018e4309 # v4
with:
dotnet-version: "10.0.x"

Expand All @@ -49,9 +49,9 @@ jobs:

- name: Autobuild
if: matrix.language != 'csharp'
uses: github/codeql-action/autobuild@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/autobuild@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/analyze@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3
with:
category: "/language:${{ matrix.language }}"
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up QEMU
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0
Expand All @@ -40,7 +40,7 @@ jobs:

- name: Log in to the Container registry
if: github.event_name != 'pull_request'
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/scorecard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

Expand Down Expand Up @@ -73,6 +73,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard (optional).
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@cdefb33c0f6224e58673d9004f47f7cb3e328b89 # v4
uses: github/codeql-action/upload-sarif@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v4
with:
sarif_file: results.sarif
6 changes: 3 additions & 3 deletions catalog.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Build UI first in a Node.js container
FROM --platform=$BUILDPLATFORM node:25-bookworm@sha256:839caad0185604c2e602024686408cdbcc37f1d2825e54ea3900f4dad3310a07 AS ui-build
FROM --platform=$BUILDPLATFORM node:25-bookworm@sha256:e6b32434aba48dcb8730d56de2df3d137de213f1f527a922a6bf7b2853a24e86 AS ui-build
WORKDIR /ui
COPY ui .
RUN npm install
RUN npm run build

# create the build container
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:10.0@sha256:90f913c96383b4146ce45985fd97e723fa1b1b6359441c4b683240236052eb59 AS build
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:10.0@sha256:25d14b400b75fa4e89d5bd4487a92a604a4e409ab65becb91821e7dc4ac7f81f AS build
ARG TARGETARCH
LABEL stage=build
WORKDIR /api
Expand All @@ -16,7 +16,7 @@ COPY --from=ui-build /ui/dist/ ./wwwroot/
RUN dotnet publish -c Release -o out -a $TARGETARCH

# create the runtime container
FROM mcr.microsoft.com/dotnet/aspnet:10.0@sha256:cc9c8da871c6e367a63122b858b10cfc464f5687bcfcf9d3761bcff1188cf257
FROM mcr.microsoft.com/dotnet/aspnet:10.0@sha256:1aacc8154bc3071349907dae26849df301188be1a2e1f4560b903fb6275e481a
ARG INSTALL_AZURE_CLI=false
WORKDIR /app
COPY --from=build /api/out .
Expand Down
14 changes: 14 additions & 0 deletions catalog/.github/agents/ask-catalog.agent.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
description: "Ask questions about experiments using the catalog MCP tools."
tools: ["read", "experiment-catalog/*"]
---

This agent uses the experiment catalog MCP server to analyze experiments.

ALWAYS use this skill: [experiment-catalog](../skills/experiment-catalog/SKILL.md).

## Tool Selection

- When comparing a permutation (set) to the baseline, use `CompareExperiment` directly. Do not call `ListSetsForExperiment` first to validate the set name.
- Use `CompareByRef` only when the user asks about individual ground truth (ref) performance, such as which refs improved or regressed.
- Call each tool only when its output is needed. Avoid discovery or pre-check calls before comparison tools.
85 changes: 85 additions & 0 deletions catalog/.github/skills/experiment-catalog/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Experiment Catalog

A comprehensive tool for cataloging, comparing, and analyzing experiment results. The Experiment Catalog enables teams to track evaluation runs across projects, compare metrics against baselines, and identify performance regressions or improvements in AI/ML experimentation workflows.

## Overview

The experiment catalog organizes experimental data in a hierarchical structure:

| Level | Also Known As | Description |
| ---------- | ----------------- | ------------------------------------------------------------------------------ |
| Project | Sprint, Milestone | Fixed evaluation environment (baseline, ground truth, metrics) for experiments |
| Experiment | - | A hypothesis-driven test varying inference within a project |
| Set | Permutation | A configuration variant within an experiment |
| Result | - | All metric values for a single ground truth iteration |
| Ref | Ground Truth | Reference to the entity being evaluated, used for aggregation and comparison |

## Key Concepts

### Projects

A project represents a fixed evaluation environment in which experiments are conducted. The project establishes:

- Baseline measurements for comparison
- Ground truth data (often split into validation and test sets)
- Metric definitions and evaluation scripts
- Stable infrastructure configuration

Projects align with milestones or sprints. During a project, the evaluation tooling and data remain constant while developers vary inference approaches through experiments. Each project iteration produces a new version of the solution that can be measured against the previous version.

### Experiments

An experiment tests a specific hypothesis by varying inference parameters, code, or configuration. Experiments contain multiple evaluation runs (sets) to compare different approaches. The goal is to prove or disprove the hypothesis by comparing results against baselines.

### Baselines

Baselines provide measurement points for comparison:

| Baseline Type | Purpose |
| ------------------------- | ----------------------------------------------------------- |
| Project Baseline | Initial measurement before experimentation begins |
| Experiment Baseline | First run of an experiment before making changes |
| Final Experiment Baseline | Best configuration run on both validation and test sets |
| Final Project Baseline | End-of-project measurement to compare against project start |

When working with non-deterministic inference or evaluation systems, run baselines multiple times (commonly 5 iterations) to establish reliable averages.

### Sets and Refs

- **Set**: A collection of results from a single evaluation run. Running 5 iterations of 12 ground truths constitutes one set. Additional iterations can be added to an existing set.
- **Ref**: The catalog term for a ground truth. Every ground truth is stored and queried as a "ref" throughout the catalog API, MCP tools, and data model. When a user asks about ground truth performance, improvements, or regressions, translate "ground truth" to "ref" in all catalog operations. Refs enable aggregation across iterations and comparison of individual ground truth performance across evaluation runs.

### Iterations

An iteration is a single execution of inference and evaluation for a ground truth. Because AI agents and LLM-based systems are non-deterministic, running multiple iterations is essential:

- **Minimum recommendation**: At least 5 iterations per ground truth
- **Averaging**: Multiple iterations allow averaging results to account for variance in non-deterministic systems
- **Statistical analysis**: P-values and confidence intervals are calculated per ground truth, requiring multiple iterations to determine a reasonable range versus baseline

A result captures all metric values for one ground truth iteration. When a set contains 5 iterations of 12 ground truths, it stores 60 individual results (5 × 12).

## Experimentation Workflow

The recommended workflow follows these phases:

1. **Create a Project**: Establish the fixed evaluation environment
2. **Run a Project Baseline**: Measure initial state before experimentation
3. **Run Experiments**:
- Create an experiment with a hypothesis
- Run an experiment baseline (or accept the project baseline)
- Run permutations varying inference parameters
- Determine the best permutation
- Run a final experiment baseline on validation and test sets
- Write a summary documenting the experiment
- Review with your team
- Approve (merge) or reject
4. **Run a Final Project Baseline**: Measure end state after all experiments

## Determining Best Permutation

With many ground truths, differences between permutations are often minimal. Techniques for identifying the best approach:

- **Look at Subsets**: Subsets like "multi-turn" examples may show 20-30% differences where overall metrics show only 1% variance
- **Prioritize Metrics**: Rank metrics by importance and evaluate based on highest-priority metrics first
- **Statistical Significance**: Use p-value calculations to determine when metric changes are meaningful
1 change: 1 addition & 0 deletions catalog/.gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
wwwroot/
cache/
*.env
.copilot-tracking/

.vscode/*
!.vscode/settings.json
Expand Down
3 changes: 3 additions & 0 deletions catalog/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"microsoft-authentication.implementation": "msal-no-broker"
}
31 changes: 27 additions & 4 deletions catalog/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
using Microsoft.Extensions.Options;
using Microsoft.IdentityModel.Tokens;
using Microsoft.OpenApi;
using ModelContextProtocol;
using ModelContextProtocol.AspNetCore;
using ModelContextProtocol.AspNetCore.Authentication;
using ModelContextProtocol.Protocol;
using NetBricks;

// load environment variables from .env file
Expand Down Expand Up @@ -50,10 +54,19 @@
builder.Services.AddSingleton<IStorageService, AzureBlobStorageService>();
builder.Services.AddSingleton<ISupportDocsService, AzureBlobSupportDocsService>();
builder.Services.AddSingleton<CalculateStatisticsService>();
builder.Services.AddSingleton<AnalysisService>();
builder.Services.AddSingleton<ExperimentService>();
builder.Services.AddSingleton<ConcurrencyService>();
builder.Services.AddHostedService<AzureBlobStorageMaintenanceService>();
builder.Services.AddHostedService(sp => sp.GetRequiredService<CalculateStatisticsService>());

// add MCP server with analysis tools
builder.Services
.AddMcpServer()
.WithHttpTransport()
.WithToolsFromAssembly()
.AddCallToolFilter(McpToolExceptionFilter.Create());

// add controllers with swagger
builder.Services.AddControllers().AddNewtonsoftJson();
builder.Services.AddEndpointsApiExplorer();
Expand All @@ -79,8 +92,14 @@

// add authentication with deferred configuration
builder.Services.AddSingleton<IConfigureOptions<JwtBearerOptions>, JwtBearerConfigurator>();
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
.AddJwtBearer();
builder.Services.AddSingleton<IConfigureOptions<McpAuthenticationOptions>, McpAuthenticationConfigurator>();
builder.Services.AddAuthentication(options =>
{
options.DefaultAuthenticateScheme = JwtBearerDefaults.AuthenticationScheme;
options.DefaultChallengeScheme = McpAuthenticationDefaults.AuthenticationScheme;
})
.AddJwtBearer()
.AddMcp();
builder.Services.AddAuthorization();
builder.Services.AddSingleton<IConfigureOptions<AuthorizationOptions>, AuthorizationConfigurator>();

Expand All @@ -90,7 +109,10 @@
options.AddPolicy("default-policy",
corsBuilder =>
{
corsBuilder.WithOrigins("http://localhost:6020")
corsBuilder.WithOrigins(
"http://localhost:6020",
"http://localhost:6274" // MCP Inspector
)
.AllowAnyHeader()
.AllowAnyMethod()
.AllowCredentials();
Expand All @@ -115,8 +137,9 @@
app.UseAuthentication();
app.UseAuthorization();

// map controllers
// map controllers and MCP
app.MapControllers();
app.MapMcp("/mcp");

// run
app.Run();
52 changes: 52 additions & 0 deletions catalog/config/McpAuthenticationConfigurator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using System;
using System.Threading;
using Microsoft.Extensions.Options;
using ModelContextProtocol.AspNetCore.Authentication;
using ModelContextProtocol.Authentication;
using NetBricks;

namespace Catalog;

/// <summary>
/// Configures MCP authentication options using the application's OIDC settings.
/// </summary>
/// <remarks>
/// When authentication is enabled, this sets up the
/// <see cref="ProtectedResourceMetadata"/> so that MCP clients can discover
/// the OAuth authorization server and complete the OAuth 2.0 flow.
/// Registered as <see cref="IConfigureNamedOptions{McpAuthenticationOptions}"/>
/// so it only applies to the MCP authentication scheme.
/// </remarks>
public class McpAuthenticationConfigurator(IConfigFactory<IConfig> configFactory)
    : IConfigureNamedOptions<McpAuthenticationOptions>
{
    /// <inheritdoc/>
    public void Configure(string? name, McpAuthenticationOptions options)
    {
        // only configure the MCP scheme; leave other named instances untouched
        if (name != McpAuthenticationDefaults.AuthenticationScheme)
        {
            return;
        }
        Configure(options);
    }

    /// <inheritdoc/>
    public void Configure(McpAuthenticationOptions options)
    {
        // IConfigureOptions has a synchronous contract, so the async config load
        // must be blocked on here; this runs once during host startup, before
        // requests are served, so thread-pool starvation is not a concern.
        var config = configFactory.GetAsync(CancellationToken.None).ConfigureAwait(false).GetAwaiter().GetResult();
        if (!config.IsAuthenticationEnabled)
        {
            return;
        }

        // fail fast with an actionable message rather than letting the Uri
        // constructor throw an opaque ArgumentNullException when the authority
        // is missing while authentication is enabled.
        if (string.IsNullOrEmpty(config.OIDC_AUTHORITY))
        {
            throw new InvalidOperationException(
                "OIDC_AUTHORITY must be set when authentication is enabled so MCP clients can discover the OAuth authorization server.");
        }

        options.ResourceMetadata = new ProtectedResourceMetadata
        {
            AuthorizationServers = { new Uri(config.OIDC_AUTHORITY) }
        };

        // the API scope ensures Azure AD issues a JWT access token whose audience
        // matches this API rather than an opaque Microsoft Graph token.
        // OIDC_CLIENT_ID is required for MCP authentication to work with Azure AD.
        if (!string.IsNullOrEmpty(config.OIDC_CLIENT_ID))
        {
            options.ResourceMetadata.ScopesSupported = [$"api://{config.OIDC_CLIENT_ID}/.default"];
        }
    }
}
Loading
Loading