Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
d063d39
feat: add azd CLI evaluation and testing framework
spboyer Mar 19, 2026
a9bd3d3
docs: add authentication and secrets section to eval README
spboyer Mar 19, 2026
1f6265e
docs: add comprehensive how-to guides for creating evals, graders, an…
spboyer Mar 19, 2026
1e07891
fix: resolve CI failures in eval unit tests and cspell
spboyer Mar 19, 2026
a951d54
fix: stop command-sequencing tests from overriding AZD_CONFIG_DIR
spboyer Mar 19, 2026
4c59d38
docs: expand auth section with subscription config and no-popup guara…
spboyer Mar 19, 2026
606beb8
refactor: address review feedback from @jongio and Copilot
spboyer Mar 19, 2026
f680753
fix: address round 2 review feedback from @jongio
spboyer Mar 20, 2026
3961833
Fix review: mock targets, env remove, CI grader tests, HTTPError hand…
spboyer Mar 26, 2026
9ccc527
fix: skip waza validation when CLI not installed
spboyer Mar 26, 2026
54bb3d6
Update cli/azd/test/eval/tasks/environment/switch-env.yaml
spboyer Mar 30, 2026
820d538
Update cli/azd/test/eval/README.md
spboyer Mar 30, 2026
a56c645
fix: address remaining PR review feedback
spboyer Mar 30, 2026
c1b4978
chore: add 40-minute timeout to eval-e2e lifecycle step
spboyer Mar 30, 2026
c9670b5
Address review feedback on eval framework
spboyer Mar 31, 2026
317c117
Remove continue-on-error masking in eval-waza.yml
spboyer Mar 31, 2026
2165abd
Fix error handling in graders and cleanup step
spboyer Mar 31, 2026
5c42dd1
Address review feedback on eval framework
spboyer Mar 31, 2026
70e2a82
Address review feedback on eval framework (round 2)
spboyer Apr 1, 2026
dd8b709
Re-trigger CI (flaky TestInitializer_PromptIfNonEmpty)
spboyer Apr 1, 2026
47e1491
Use tag-for-deletion pattern in eval cleanup step
spboyer Apr 2, 2026
281907a
Address review feedback: timeout, imports, pipeline command
spboyer Apr 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .github/workflows/eval-e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Weekly end-to-end evaluation. Builds azd from source, signs in to Azure with
# OIDC, runs the Waza evaluations under "tasks/lifecycle/", uploads the reports
# directory, and finally deletes resource groups named "rg-eval-*".
name: "Eval: E2E Lifecycle"

on:
  schedule:
    # 6am UTC Monday
    - cron: "0 6 * * 1"
  workflow_dispatch:

permissions:
  # Required so azure/login can request an OIDC token for federated sign-in.
  id-token: write
  contents: read

# The cleanup step removes every resource group whose name starts with
# "rg-eval-", not only the ones this run created. Serialize runs so a manual
# dispatch cannot delete resources that a scheduled run is still using.
concurrency:
  group: eval-e2e-lifecycle
  cancel-in-progress: false

jobs:
  e2e-lifecycle:
    runs-on: ubuntu-latest
    env:
      # github.run_id is unique per workflow run, so each run gets its own name.
      AZURE_ENV_NAME: eval-e2e-${{ github.run_id }}
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: "cli/azd/go.mod"

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Build azd
        working-directory: cli/azd
        run: go build -o ./azd .

      # Expose the binary built above as `azd` for all following steps.
      - name: Add azd to PATH
        run: echo "${{ github.workspace }}/cli/azd" >> "$GITHUB_PATH"

      - name: Azure Login (OIDC)
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Install Waza CLI
        run: npm install -g waza

      - name: Install eval dependencies
        working-directory: cli/azd/test/eval
        run: npm ci

      # No continue-on-error here: a failing or timed-out evaluation must fail
      # the job so the run is not recorded as successful. The upload and
      # cleanup steps below use `if: always()` and therefore still execute.
      - name: Run lifecycle evaluations
        working-directory: cli/azd/test/eval
        timeout-minutes: 40
        env:
          COPILOT_CLI_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }}
          AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        run: waza run --executor copilot-sdk --filter "tasks/lifecycle/"

      - name: Upload E2E results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-results-${{ github.run_id }}
          path: cli/azd/test/eval/reports/
          retention-days: 30

      # Best-effort teardown: runs even after a failure, and its own errors
      # (suppressed with 2>$null) never change the job result.
      - name: Cleanup Azure resources
        if: always()
        shell: pwsh
        run: |
          # Tag matching resource groups for deletion so the cleanup script
          # can detect and remove resources that resist deletion.
          $deleteAfter = (Get-Date).ToUniversalTime().AddHours(1).ToString('o')
          $groupsToDelete = az group list --query "[?starts_with(name, 'rg-eval-')].name" -o tsv

          foreach ($group in $groupsToDelete) {
            az group update --name $group --set "tags.DeleteAfter=$deleteAfter" 2>$null
          }

          foreach ($group in $groupsToDelete) {
            az group delete --name $group --yes --no-wait 2>$null
          }
        continue-on-error: true
62 changes: 62 additions & 0 deletions .github/workflows/eval-report.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Weekly aggregation. Downloads the artifacts of the most recent successful
# "eval-waza.yml" and "eval-e2e.yml" runs on main and re-uploads them together
# as a single artifact. Report generation is not implemented yet (see TODO).
name: "Eval: Weekly Report"

on:
  schedule:
    # 8am UTC Monday, after E2E completes
    - cron: "0 8 * * 1"
  workflow_dispatch:

permissions:
  contents: read
  # Needed to list workflow runs and download their artifacts with gh.
  actions: read

jobs:
  generate-report:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Install eval dependencies
        working-directory: cli/azd/test/eval
        run: npm ci

      - name: Download recent Waza artifacts
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p cli/azd/test/eval/reports/waza
          # Let the API do the filtering (status=success, newest first) so the
          # lookup is not limited to the first page of mixed-result runs.
          # stderr is left visible so an API or auth failure is diagnosable.
          RUN_ID=$(gh api "repos/${{ github.repository }}/actions/workflows/eval-waza.yml/runs?branch=main&status=success&per_page=1" \
            --jq '.workflow_runs[0].id // empty')
          if [ -n "$RUN_ID" ]; then
            # Best effort: a run without artifacts must not fail the report.
            gh run download "$RUN_ID" -D cli/azd/test/eval/reports/waza || echo "No waza artifacts found"
          else
            echo "No successful waza runs found, skipping"
          fi

      - name: Download recent E2E artifacts
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p cli/azd/test/eval/reports/e2e
          # Same lookup as for the Waza runs above, against eval-e2e.yml.
          RUN_ID=$(gh api "repos/${{ github.repository }}/actions/workflows/eval-e2e.yml/runs?branch=main&status=success&per_page=1" \
            --jq '.workflow_runs[0].id // empty')
          if [ -n "$RUN_ID" ]; then
            # Best effort: a run without artifacts must not fail the report.
            gh run download "$RUN_ID" -D cli/azd/test/eval/reports/e2e || echo "No e2e artifacts found"
          else
            echo "No successful e2e runs found, skipping"
          fi

      # TODO: Implement report generation script (scripts/generate-report.ts)
      # that diffs Waza result JSON files and produces regression-issues.json.
      # Once implemented, add a step to create GitHub issues from regressions.

      - name: Upload aggregated artifacts
        uses: actions/upload-artifact@v4
        with:
          name: eval-weekly-report-${{ github.run_id }}
          path: cli/azd/test/eval/reports/
          retention-days: 90
67 changes: 67 additions & 0 deletions .github/workflows/eval-unit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Pull-request checks for the eval framework: builds azd, runs the npm unit
# tests, validates Waza task YAML when the waza CLI is available, and runs the
# Python grader tests with pytest.
name: "Eval: Unit Tests"

on:
  pull_request:
    paths:
      - "cli/azd/test/eval/**"
      - "cli/azd/internal/mcp/**"
      - "cli/azd/cmd/mcp.go"
      - "cli/azd/cmd/root.go"
      # Include this workflow so a PR that only edits it still exercises it.
      - ".github/workflows/eval-unit.yml"

permissions:
  contents: read

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: "cli/azd/go.mod"

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Build azd
        working-directory: cli/azd
        run: go build -o ./azd .

      - name: Install eval dependencies
        working-directory: cli/azd/test/eval
        run: npm ci

      - name: Run unit tests
        working-directory: cli/azd/test/eval
        run: npm run test:unit -- --ci

      # NOTE(review): no step in this workflow installs the waza CLI, so unless
      # the runner image provides it this step takes the skip branch and the
      # task YAML is not validated here. Confirm that this is intended.
      - name: Validate Waza task YAML
        working-directory: cli/azd/test/eval
        run: |
          if command -v waza &>/dev/null; then
            npm run waza:validate
          else
            echo "waza CLI not installed, skipping YAML validation"
          fi

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install grader test dependencies
        run: pip install pytest

      - name: Run grader tests
        working-directory: cli/azd/test/eval/graders
        run: python -m pytest test_graders.py -v

      # Runs even when an earlier step failed so partial reports are kept.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-unit-results
          path: cli/azd/test/eval/reports/
          retention-days: 30
53 changes: 53 additions & 0 deletions .github/workflows/eval-waza.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Scheduled Waza evaluation runs: build azd from source, put it on PATH, run
# the full Waza suite with the copilot-sdk executor, and publish the reports.
name: "Eval: Waza Runs"

on:
  workflow_dispatch:
  schedule:
    # 05:00, 12:00 and 20:00 UTC on Tuesday through Saturday
    - cron: "0 5,12,20 * * 2-6"

permissions:
  contents: read

jobs:
  waza-run:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version-file: "cli/azd/go.mod"

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Build azd
        run: go build -o ./azd .
        working-directory: cli/azd

      # Later steps resolve `azd` to the binary produced by the build step.
      - name: Add azd to PATH
        run: echo "${{ github.workspace }}/cli/azd" >> "$GITHUB_PATH"

      - name: Install Waza CLI
        run: npm install -g waza

      - name: Install eval dependencies
        run: npm ci
        working-directory: cli/azd/test/eval

      # A failed or timed-out evaluation fails the job; nothing masks it.
      - name: Run Waza evaluations
        working-directory: cli/azd/test/eval
        timeout-minutes: 30
        env:
          COPILOT_CLI_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }}
        run: waza run --executor copilot-sdk

      # always(): publish whatever reports exist, including after a failure.
      - name: Upload Waza results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: waza-results-${{ github.run_id }}
          path: cli/azd/test/eval/reports/
          retention-days: 30
22 changes: 22 additions & 0 deletions cli/azd/.vscode/cspell.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,28 @@ overrides:
words:
- covdata
- GOWORK
- filename: test/eval/README.md
words:
- Waza
- waza
- urlopen
- filename: "test/eval/graders/*.py"
words:
- Waza
- waza
- hdrs
- mysite
- mydb
- filename: "test/eval/tasks/**/*.yaml"
words:
- authenticat
- idempoten
- filename: "test/eval/tests/human/*.test.ts"
words:
- compdef
- badcfg
- provison
- notacommand
ignorePaths:
- "**/*_test.go"
- "**/mock*.go"
Expand Down
6 changes: 6 additions & 0 deletions cli/azd/test/eval/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore rules for the eval test project (cli/azd/test/eval).

# Packages installed by npm.
node_modules/
# Presumably compiled/build output - confirm against the project's build config.
dist/
# Report files written into reports/; they are not committed.
reports/*.json
reports/*.md
reports/junit.xml
# Explicitly keep the placeholder file so the reports/ directory stays tracked.
!reports/.gitkeep
Loading
Loading