Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Initialize CodeQL
uses: github/codeql-action/init@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/init@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3
with:
languages: ${{ matrix.language }}

- name: Setup .NET
if: matrix.language == 'csharp'
uses: actions/setup-dotnet@67a3573c9a986a3f9c594539f4ab511d57bb3ce9 # v4
uses: actions/setup-dotnet@baa11fbfe1d6520db94683bd5c7a3818018e4309 # v4
with:
dotnet-version: "10.0.x"

Expand All @@ -49,9 +49,9 @@ jobs:

- name: Autobuild
if: matrix.language != 'csharp'
uses: github/codeql-action/autobuild@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/autobuild@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@4bdb89f48054571735e3792627da6195c57459e2 # v3
uses: github/codeql-action/analyze@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v3
with:
category: "/language:${{ matrix.language }}"
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up QEMU
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0
Expand All @@ -40,7 +40,7 @@ jobs:

- name: Log in to the Container registry
if: github.event_name != 'pull_request'
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/scorecard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

Expand Down Expand Up @@ -73,6 +73,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard (optional).
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@cdefb33c0f6224e58673d9004f47f7cb3e328b89 # v4
uses: github/codeql-action/upload-sarif@b20883b0cd1f46c72ae0ba6d1090936928f9fa30 # v4
with:
sarif_file: results.sarif
6 changes: 3 additions & 3 deletions catalog.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Build UI first in a Node.js container
FROM --platform=$BUILDPLATFORM node:25-bookworm@sha256:839caad0185604c2e602024686408cdbcc37f1d2825e54ea3900f4dad3310a07 AS ui-build
FROM --platform=$BUILDPLATFORM node:25-bookworm@sha256:e6b32434aba48dcb8730d56de2df3d137de213f1f527a922a6bf7b2853a24e86 AS ui-build
WORKDIR /ui
COPY ui .
RUN npm install
RUN npm run build

# create the build container
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:10.0@sha256:90f913c96383b4146ce45985fd97e723fa1b1b6359441c4b683240236052eb59 AS build
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:10.0@sha256:25d14b400b75fa4e89d5bd4487a92a604a4e409ab65becb91821e7dc4ac7f81f AS build
ARG TARGETARCH
LABEL stage=build
WORKDIR /api
Expand All @@ -16,7 +16,7 @@ COPY --from=ui-build /ui/dist/ ./wwwroot/
RUN dotnet publish -c Release -o out -a $TARGETARCH

# create the runtime container
FROM mcr.microsoft.com/dotnet/aspnet:10.0@sha256:cc9c8da871c6e367a63122b858b10cfc464f5687bcfcf9d3761bcff1188cf257
FROM mcr.microsoft.com/dotnet/aspnet:10.0@sha256:1aacc8154bc3071349907dae26849df301188be1a2e1f4560b903fb6275e481a
ARG INSTALL_AZURE_CLI=false
WORKDIR /app
COPY --from=build /api/out .
Expand Down
14 changes: 14 additions & 0 deletions catalog/.github/agents/ask-catalog.agent.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
description: "Ask questions about experiments using the catalog MCP tools."
tools: ["read", "experiment-catalog/*"]
---

This agent uses the experiment catalog MCP server to analyze experiments.

ALWAYS use this skill: [experiment-catalog](../skills/experiment-catalog/SKILL.md).

## Tool Selection

- When comparing a permutation (set) to the baseline, use `CompareExperiment` directly. Do not call `ListSetsForExperiment` first to validate the set name.
- Use `CompareByRef` only when the user asks about individual ground truth (ref) performance, such as which refs improved or regressed.
- Call each tool only when its output is needed. Avoid discovery or pre-check calls before comparison tools.
85 changes: 85 additions & 0 deletions catalog/.github/skills/experiment-catalog/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Experiment Catalog

A comprehensive tool for cataloging, comparing, and analyzing experiment results. The Experiment Catalog enables teams to track evaluation runs across projects, compare metrics against baselines, and identify performance regressions or improvements in AI/ML experimentation workflows.

## Overview

The experiment catalog organizes experimental data in a hierarchical structure:

| Level | Also Known As | Description |
| ---------- | ----------------- | ------------------------------------------------------------------------------ |
| Project | Sprint, Milestone | Fixed evaluation environment (baseline, ground truth, metrics) for experiments |
| Experiment | - | A hypothesis-driven test varying inference within a project |
| Set | Permutation | A configuration variant within an experiment |
| Result | - | All metric values for a single ground truth iteration |
| Ref | Ground Truth | Reference to the entity being evaluated, used for aggregation and comparison |

## Key Concepts

### Projects

A project represents a fixed evaluation environment in which experiments are conducted. The project establishes:

- Baseline measurements for comparison
- Ground truth data (often split into validation and test sets)
- Metric definitions and evaluation scripts
- Stable infrastructure configuration

Projects align with milestones or sprints. During a project, the evaluation tooling and data remain constant while developers vary inference approaches through experiments. Each project iteration produces a new version of the solution that can be measured against the previous version.

### Experiments

An experiment tests a specific hypothesis by varying inference parameters, code, or configuration. Experiments contain multiple evaluation runs (sets) to compare different approaches. The goal is to prove or disprove the hypothesis by comparing results against baselines.

### Baselines

Baselines provide measurement points for comparison:

| Baseline Type | Purpose |
| ------------------------- | ----------------------------------------------------------- |
| Project Baseline | Initial measurement before experimentation begins |
| Experiment Baseline | First run of an experiment before making changes |
| Final Experiment Baseline | Best configuration run on both validation and test sets |
| Final Project Baseline | End-of-project measurement to compare against project start |

When working with non-deterministic inference or evaluation systems, run baselines multiple times (commonly 5 iterations) to establish reliable averages.

### Sets and Refs

- **Set**: A collection of results from a single evaluation run. Running 5 iterations of 12 ground truths constitutes one set. Additional iterations can be added to an existing set.
- **Ref**: The catalog term for a ground truth. Every ground truth is stored and queried as a "ref" throughout the catalog API, MCP tools, and data model. When a user asks about ground truth performance, improvements, or regressions, translate "ground truth" to "ref" in all catalog operations. Refs enable aggregation across iterations and comparison of individual ground truth performance across evaluation runs.

### Iterations

An iteration is a single execution of inference and evaluation for a ground truth. Because AI agents and LLM-based systems are non-deterministic, running multiple iterations is essential:

- **Minimum recommendation**: At least 5 iterations per ground truth
- **Averaging**: Multiple iterations allow averaging results to account for variance in non-deterministic systems
- **Statistical analysis**: P-values and confidence intervals are calculated per ground truth, requiring multiple iterations to determine a reasonable range versus baseline

A result captures all metric values for one ground truth iteration. When a set contains 5 iterations of 12 ground truths, it stores 60 individual results (5 × 12).

## Experimentation Workflow

The recommended workflow follows these phases:

1. **Create a Project**: Establish the fixed evaluation environment
2. **Run a Project Baseline**: Measure initial state before experimentation
3. **Run Experiments**:
- Create an experiment with a hypothesis
- Run an experiment baseline (or accept the project baseline)
- Run permutations varying inference parameters
- Determine the best permutation
- Run a final experiment baseline on validation and test sets
- Write a summary documenting the experiment
- Review with your team
- Approve (merge) or reject
4. **Run a Final Project Baseline**: Measure end state after all experiments

## Determining Best Permutation

With many ground truths, differences between permutations are often minimal. Techniques for identifying the best approach:

- **Look at Subsets**: Subsets like "multi-turn" examples may show 20-30% differences where overall metrics show only 1% variance
- **Prioritize Metrics**: Rank metrics by importance and evaluate based on highest-priority metrics first
- **Statistical Significance**: Use p-value calculations to determine when metric changes are meaningful
1 change: 1 addition & 0 deletions catalog/.gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
wwwroot/
cache/
*.env
.copilot-tracking/

.vscode/*
!.vscode/settings.json
Expand Down
3 changes: 3 additions & 0 deletions catalog/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"microsoft-authentication.implementation": "msal-no-broker"
}
31 changes: 27 additions & 4 deletions catalog/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
using Microsoft.Extensions.Options;
using Microsoft.IdentityModel.Tokens;
using Microsoft.OpenApi;
using ModelContextProtocol;
using ModelContextProtocol.AspNetCore;
using ModelContextProtocol.AspNetCore.Authentication;
using ModelContextProtocol.Protocol;
using NetBricks;

// load environment variables from .env file
Expand Down Expand Up @@ -50,10 +54,19 @@
builder.Services.AddSingleton<IStorageService, AzureBlobStorageService>();
builder.Services.AddSingleton<ISupportDocsService, AzureBlobSupportDocsService>();
builder.Services.AddSingleton<CalculateStatisticsService>();
builder.Services.AddSingleton<AnalysisService>();
builder.Services.AddSingleton<ExperimentService>();
builder.Services.AddSingleton<ConcurrencyService>();
builder.Services.AddHostedService<AzureBlobStorageMaintenanceService>();
builder.Services.AddHostedService(sp => sp.GetRequiredService<CalculateStatisticsService>());

// add MCP server with analysis tools
builder.Services
.AddMcpServer()
.WithHttpTransport()
.WithToolsFromAssembly()
.AddCallToolFilter(McpToolExceptionFilter.Create());

// add controllers with swagger
builder.Services.AddControllers().AddNewtonsoftJson();
builder.Services.AddEndpointsApiExplorer();
Expand All @@ -79,8 +92,14 @@

// add authentication with deferred configuration
builder.Services.AddSingleton<IConfigureOptions<JwtBearerOptions>, JwtBearerConfigurator>();
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
.AddJwtBearer();
builder.Services.AddSingleton<IConfigureOptions<McpAuthenticationOptions>, McpAuthenticationConfigurator>();
builder.Services.AddAuthentication(options =>
{
options.DefaultAuthenticateScheme = JwtBearerDefaults.AuthenticationScheme;
options.DefaultChallengeScheme = McpAuthenticationDefaults.AuthenticationScheme;
})
.AddJwtBearer()
.AddMcp();
builder.Services.AddAuthorization();
builder.Services.AddSingleton<IConfigureOptions<AuthorizationOptions>, AuthorizationConfigurator>();

Expand All @@ -90,7 +109,10 @@
options.AddPolicy("default-policy",
corsBuilder =>
{
corsBuilder.WithOrigins("http://localhost:6020")
corsBuilder.WithOrigins(
"http://localhost:6020",
"http://localhost:6274" // MCP Inspector
)
.AllowAnyHeader()
.AllowAnyMethod()
.AllowCredentials();
Expand All @@ -115,8 +137,9 @@
app.UseAuthentication();
app.UseAuthorization();

// map controllers
// map controllers and MCP
app.MapControllers();
app.MapMcp("/mcp");

// run
app.Run();
52 changes: 52 additions & 0 deletions catalog/config/McpAuthenticationConfigurator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using System;
using System.Threading;
using Microsoft.Extensions.Options;
using ModelContextProtocol.AspNetCore.Authentication;
using ModelContextProtocol.Authentication;
using NetBricks;

namespace Catalog;

/// <summary>
/// Configures MCP authentication options using the application's OIDC settings.
/// </summary>
/// <remarks>
/// When authentication is enabled, this sets up the
/// <see cref="ProtectedResourceMetadata"/> so that MCP clients can discover
/// the OAuth authorization server and complete the OAuth 2.0 flow.
/// Registered as <see cref="IConfigureNamedOptions{McpAuthenticationOptions}"/>
/// so it only applies to the MCP authentication scheme.
/// </remarks>
public class McpAuthenticationConfigurator(IConfigFactory<IConfig> configFactory)
    : IConfigureNamedOptions<McpAuthenticationOptions>
{
    /// <inheritdoc/>
    public void Configure(string? name, McpAuthenticationOptions options)
    {
        // only configure the MCP scheme; leave other named instances untouched
        if (name != McpAuthenticationDefaults.AuthenticationScheme)
        {
            return;
        }
        Configure(options);
    }

    /// <inheritdoc/>
    public void Configure(McpAuthenticationOptions options)
    {
        // IConfigureOptions has a synchronous contract, so the async config load
        // must be blocked on here; this runs once during host startup, before
        // requests are served, so thread-pool starvation is not a concern.
        var config = configFactory.GetAsync(CancellationToken.None).ConfigureAwait(false).GetAwaiter().GetResult();
        if (!config.IsAuthenticationEnabled)
        {
            return;
        }

        // fail fast with an actionable message rather than letting the Uri
        // constructor throw an opaque ArgumentNullException when the authority
        // is missing while authentication is enabled.
        if (string.IsNullOrEmpty(config.OIDC_AUTHORITY))
        {
            throw new InvalidOperationException(
                "OIDC_AUTHORITY must be set when authentication is enabled so MCP clients can discover the OAuth authorization server.");
        }

        options.ResourceMetadata = new ProtectedResourceMetadata
        {
            AuthorizationServers = { new Uri(config.OIDC_AUTHORITY) }
        };

        // the API scope ensures Azure AD issues a JWT access token whose audience
        // matches this API rather than an opaque Microsoft Graph token.
        // OIDC_CLIENT_ID is required for MCP authentication to work with Azure AD.
        if (!string.IsNullOrEmpty(config.OIDC_CLIENT_ID))
        {
            options.ResourceMetadata.ScopesSupported = [$"api://{config.OIDC_CLIENT_ID}/.default"];
        }
    }
}
Loading
Loading