microsoft · WilliamBerryiii · Apr 21, 2026 · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.cspell.json b/.cspell.json
@@ -6,6 +6,15 @@
 	"flagWords": [
 		"TODO"
 	],
+	"words": [
+		"envaccount",
+		"envcontainer",
+		"myacct",
+		"mycontainer",
+		"noseparator",
+		"preds",
+		"xticklabels"
+	],
 	"ignorePaths": [
 		"**/.github/chatmodes/**",
 		"**/node_modules/**",

diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt
@@ -593,6 +593,7 @@ interoperates
 interpretability
 interpretml
 interzone
+intoto
 intracloud
 intrazone
 intune
@@ -2025,3 +2026,6 @@ WEBADM
 WEBACCESS
 vsmagent
 vsmserver
+envaccount
+envcontainer
+mycontainer
@@ -8,7 +8,6 @@ on:
 
 permissions:
   contents: read
-  security-events: write
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

@@ -0,0 +1,62 @@
+# Reusable workflow (invoked through `workflow_call`) that runs the pytest suite
+# from the `evaluation/` directory and, when requested, publishes coverage.
+name: Evaluation Pytest Tests
+
+on:
+  workflow_call:
+    inputs:
+      # Gates both coverage upload steps at the end of the job.
+      code-coverage:
+        description: 'Enable Codecov coverage upload'
+        required: false
+        default: false
+        type: boolean
+
+# Read-only default for the workflow; the job below declares its own scopes.
+permissions:
+  contents: read
+
+jobs:
+  pytest:
+    name: Evaluation Pytest
+    runs-on: ubuntu-latest
+    # `defaults.run` applies to `run:` steps only; the `uses:` steps below spell
+    # out the `evaluation/` prefix in their paths instead.
+    defaults:
+      run:
+        working-directory: evaluation
+    permissions:
+      contents: read
+      # Presumably required by the Codecov step's `use_oidc: true` — confirm.
+      id-token: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: '3.12'
+
+      - name: Setup uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+
+      # Installs only the `dev` dependency group.
+      - name: Install dependencies
+        run: uv sync --only-group dev
+
+      # NOTE(review): no coverage flags are passed here; `logs/coverage.xml` is
+      # presumably produced by pytest configuration in the project — confirm.
+      - name: Run pytest
+        run: uv run --only-group dev pytest -v
+
+      # `!cancelled()` lets the uploads run even after a failed test step,
+      # but not when the run was cancelled.
+      - name: Upload coverage.xml artifact
+        if: ${{ inputs.code-coverage && !cancelled() }}
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: evaluation-pytest-coverage-xml
+          path: evaluation/logs/coverage.xml
+          retention-days: 30
+
+      # Upload errors do not fail the job (`fail_ci_if_error: false`).
+      - name: Upload coverage to Codecov
+        if: ${{ inputs.code-coverage && !cancelled() }}
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0
+        with:
+          files: evaluation/logs/coverage.xml
+          use_oidc: true
+          fail_ci_if_error: false
+          verbose: true
+          flags: pytest-evaluation
+          name: evaluation-pytest-coverage
@@ -156,6 +156,16 @@ jobs:
       contents: read
       id-token: write
 
+  # Evaluation domain pytest execution
+  evaluation-pytests:
+    name: Evaluation Pytest
+    uses: ./.github/workflows/evaluation-pytests.yml
+    with:
+      code-coverage: true
+    permissions:
+      contents: read
+      id-token: write
+
   # Fuzz regression via deterministic corpus-based tests
   fuzz-regression-tests:
     name: Fuzz Regression Tests
@@ -254,3 +264,54 @@ jobs:
       contents: read
       security-events: write
       actions: read
+
+  # Aggregator status check required by branch protection.
+  # Reports success only when every upstream PR validation job succeeded or was skipped.
+  pr-validation-summary:
+    name: pr-validation-summary
+    # always() keeps this job from being skipped when an upstream job fails,
+    # so the verification step below can report that failure.
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    # Only the jobs listed here appear in the `needs` context inspected below;
+    # a validation job missing from this list is not covered by the summary.
+    needs:
+      - spell-check
+      - markdown-lint
+      - table-format
+      - frontmatter-validation
+      - msdate-freshness
+      - psscriptanalyzer
+      - yaml-lint
+      - link-lang-check
+      - markdown-link-check
+      - dependency-review
+      - dependency-pinning
+      - pester-tests
+      - dataviewer-frontend-tests
+      - docusaurus-tests
+      - pytest-tests
+      - dataviewer-backend-pytests
+      - evaluation-pytests
+      - fuzz-regression-tests
+      - python-lint
+      - terraform-lint
+      - terraform-validation
+      - terraform-tests
+      - go-lint
+      - terraform-docs-check
+      - go-tests
+      - shellcheck
+      - codeql-analysis
+    permissions:
+      contents: read
+    steps:
+      # The `needs` context is serialized to JSON and handed to the script via
+      # an environment variable. The script lists every job whose result is
+      # neither "success" nor "skipped" and exits 1 if that list is non-empty.
+      - name: Verify all upstream jobs succeeded
+        env:
+          NEEDS_JSON: ${{ toJSON(needs) }}
+        run: |
+          echo "Upstream job results:"
+          echo "$NEEDS_JSON"
+          failed=$(echo "$NEEDS_JSON" | jq -r 'to_entries[] | select(.value.result != "success" and .value.result != "skipped") | .key')
+          if [ -n "$failed" ]; then
+            echo "::error::One or more upstream PR validation jobs did not succeed:"
+            echo "$failed"
+            exit 1
+          fi
+          echo "All upstream PR validation jobs succeeded or were skipped."
diff --git a/codecov.yml b/codecov.yml
@@ -5,6 +5,7 @@
 #   pester — PowerShell tests covering scripts/
 #   pytest — Python tests covering training/
 #   pytest-dataviewer — Dataviewer backend tests covering data-management/viewer/backend/src/
+#   pytest-evaluation — Evaluation tests covering evaluation/
 #   pytest-fuzz — Python fuzz regression tests covering tests/, backend, and training
 #   terraform — Terraform tests covering infrastructure/terraform/
 #   vitest — Vitest tests covering data-management/viewer/frontend/src/
@@ -57,6 +58,11 @@ coverage:
           - pytest-fuzz
         target: auto
         threshold: 1%
+      pytest-evaluation:
+        flags:
+          - pytest-evaluation
+        target: auto
+        threshold: 1%
     patch:
       default:
         target: auto
@@ -89,6 +95,10 @@ coverage:
         flags:
           - pytest-fuzz
         informational: true
+      pytest-evaluation:
+        flags:
+          - pytest-evaluation
+        informational: true
 
 flags:
   go:
@@ -121,6 +131,10 @@ flags:
       - data-management/viewer/backend/src/
       - training/
     carryforward: true
+  pytest-evaluation:
+    paths:
+      - evaluation/
+    carryforward: true
 
 parsers:
   jacoco:

@@ -8,8 +8,26 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
     && rm -rf /var/lib/apt/lists/*
 
-# Install uv
-RUN pip install --no-cache-dir uv==0.10.9
+# Install uv (pinned by hash for OSSF Scorecard Pinned-Dependencies; covers all uv 0.10.9 wheels).
+# pip accepts --hash only as a per-requirement option inside a requirements file, not as a
+# `pip install` command-line flag, so the pinned requirement is first written to a temporary
+# requirements file (one line: `uv==0.10.9 --hash=... --hash=...`) and installed from there.
+RUN printf 'uv==0.10.9' > /tmp/uv-requirements.txt \
+    && printf ' --hash=sha256:%s' \
+        0649f83fa0f44f18627c00b2a9a60e5c3486a34799b2c874f2b3945b76048a67 \
+        880dd4cffe4bd184e8871ddf4c7d3c3b042e1f16d2682310644aa8d61eaea3e6 \
+        a7a784254380552398a6baf4149faf5b31a4003275f685c28421cf8197178a08 \
+        5ea0e8598fa012cfa4480ecad4d112bc70f514157c3cc1555a7611c7b6b1ab0a \
+        2d6b5367e9bf87eca51c0f2ecda26a1ff931e41409977b4f0a420de2f3e617cf \
+        bd04e34db27f9a1d5a0871980edc9f910bb11afbc4abca8234d5a363cbe63c04 \
+        547deb57311fc64e4a6b8336228fca4cb4dcbeabdc6e85f14f7804dcd0bc8cd2 \
+        e0091b6d0b666640d7407a433860184f77667077b73564e86d49c2a851f073a8 \
+        81b2286e6fd869e3507971f39d14829c03e2e31caa8ecc6347b0ffacabb95a5b \
+        7c9d6deb30edbc22123be75479f99fb476613eaf38a8034c0e98bba24a344179 \
+        24b1ce6d626e06c4582946b6af07b08a032fcccd81fe54c3db3ed2d1c63a97dc \
+        fa3401780273d96a2960dbeab58452ce1b387ad8c5da25be6221c0188519e21d \
+        8f94a31832d2b4c565312ea17a71b8dd2f971e5aa570c5b796a27b2c9fcdb163 \
+        842c39c19d9072f1ad53c71bb4ecd1c9caa311d5de9d19e09a636274a6c95e2e \
+        ed44047c602449916ba18a8596715ef7edbbd00859f3db9eac010dc62a0edd30 \
+        af79552276d8bd622048ab2d67ec22120a6af64d83963c46b1482218c27b571f \
+        47e18a0521d76293d4f60d129f520b18bddf1976b4a47b50f0fcb04fb6a9d40f \
+        31e76ae92e70fec47c3efab0c8094035ad7a578454482415b496fa39fc4d685c \
+        >> /tmp/uv-requirements.txt \
+    && echo >> /tmp/uv-requirements.txt \
+    && pip install --no-cache-dir --require-hashes -r /tmp/uv-requirements.txt \
+    && rm -f /tmp/uv-requirements.txt
 
 # Copy dependency manifests
 COPY pyproject.toml uv.lock ./

@@ -24,6 +24,7 @@ Security documentation for the Physical AI Toolchain covering threat analysis, d
 | [Threat Model](threat-model.md)                                                         | STRIDE-based threat analysis and remediation roadmap             |
 | [Deployment Security Guide](../operations/security-guide.md)                            | Security configuration inventory and deployment responsibilities |
 | [Release Verification](release-verification.md)                                         | Verify release artifact provenance and SBOM attestations         |
+| [Workflow Permissions](workflow-permissions.md)                                         | GitHub Actions permission scopes and OSSF Scorecard exceptions   |
 | [SECURITY.md](https://github.com/microsoft/physical-ai-toolchain/blob/main/SECURITY.md) | Vulnerability disclosure and reporting process                   |
 
 ## 🔒 Security Posture

@@ -0,0 +1,69 @@
+---
+sidebar_position: 4
+title: Workflow Permissions
+description: GitHub Actions permission scopes and OSSF Scorecard Token-Permissions exception rationale
+author: Microsoft Robotics-AI Team
+ms.date: 2026-02-22
+ms.topic: reference
+keywords:
+  - security
+  - github-actions
+  - permissions
+  - ossf-scorecard
+  - token-permissions
+---
+
+## 📋 Overview
+
+All GitHub Actions workflows in this repository follow the [OpenSSF Scorecard Token-Permissions](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions) principle:
+
+- Top-level `permissions:` is `contents: read` (read-only by default).
+- Write-scoped permissions are declared at the **job level** only when a specific step requires them.
+- No workflow grants `permissions: write-all` or omits an explicit top-level `permissions:` block.
+
+This document enumerates every job-scoped write permission across `.github/workflows/`, except `id-token: write` (OIDC token requests, covered under Defense in Depth below), and records the justification so security auditors and Scorecard reviewers can verify each exception.
+
+## 🔒 Job-Scoped Write Permissions
+
+The 15 write permissions below are required by the action or CLI invoked in the corresponding job. Each grant is the minimum scope needed.
+
+| Workflow                          | Job                            | Permission                | Rationale                                                                                                                              |
+|-----------------------------------|--------------------------------|---------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
+| `check-binary-integrity.yml`      | `check-binary-integrity`       | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish binary integrity findings to the Security tab.                              |
+| `codeql-analysis.yml`             | `analyze`                      | `security-events: write`  | Required by `github/codeql-action/analyze` to upload CodeQL SARIF results to the Security tab.                                         |
+| `dast-zap-scan.yml`               | `dast-zap-scan`                | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish ZAP DAST findings to the Security tab.                                      |
+| `dependency-pinning-scan.yml`     | `dependency-pinning-scan`      | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish SHA-pinning findings to the Security tab.                                   |
+| `gitleaks-scan.yml`               | `scan`                         | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish secret-scanning findings to the Security tab.                               |
+| `main.yml`                        | `dependency-pinning`           | `security-events: write`  | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload.                                                        |
+| `main.yml`                        | `codeql-analysis`              | `security-events: write`  | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload.                                                                |
+| `main.yml`                        | `generate-dependency-sbom`     | `contents: write`         | Required by `gh release upload "${TAG}" dependencies.spdx.json --clobber` to attach the dependency SBOM to the release.                |
+| `main.yml`                        | `attest-release`               | `attestations: write`     | Required by `actions/attest-build-provenance` and `actions/attest` to create Sigstore provenance attestations.                         |
+| `main.yml`                        | `attest-release`               | `contents: write`         | Required by `gh release upload` to attach `*.sigstore.json` and `*.intoto.jsonl` attestation artifacts to the release.                 |
+| `main.yml`                        | `sbom-diff`                    | `contents: write`         | Required by `gh release upload "${TAG}" dependency-diff.md --clobber` to attach the dependency-change report to the release.           |
+| `main.yml`                        | `append-verification-notes`    | `contents: write`         | Required by `gh release edit` to append artifact-verification instructions to the release body.                                        |
+| `pr-validation.yml`               | `dependency-pinning`           | `security-events: write`  | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload.                                                        |
+| `pr-validation.yml`               | `codeql-analysis`              | `security-events: write`  | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload.                                                                |
+| `scorecard.yml`                   | `analysis`                     | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish OpenSSF Scorecard findings to the Security tab.                             |
+
+## 🛡️ Defense in Depth
+
+The release-publishing path uses additional hardening beyond minimum permissions:
+
+- All actions are SHA-pinned (no floating tags).
+- `persist-credentials: false` on every `actions/checkout` invocation.
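+- `id-token: write` is granted only to jobs that request an OIDC token: release jobs that mint Sigstore OIDC tokens and test jobs that upload coverage to Codecov with `use_oidc: true`. In the release path, the token is never exposed to user-controlled steps.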
+- Release-gated jobs (`generate-dependency-sbom`, `attest-release`, `sbom-diff`, `append-verification-notes`) run only when `release-please` produces a release (`needs.release-please.outputs.release_created == 'true'`).
+
+## 🔗 Related Resources
+
+- [OpenSSF Scorecard Token-Permissions check](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions)
+- [GitHub Actions: Assigning permissions to jobs](https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs)
+- [Release Verification](release-verification.md)
+- [Threat Model](threat-model.md)
+
+<!-- markdownlint-configure-file { "MD024": false } -->
+
+<!-- markdownlint-disable MD036 -->
+*🤖 Crafted with precision by ✨Copilot following brilliant human instruction,
+then carefully refined by our team of discerning human reviewers.*
+<!-- markdownlint-enable MD036 -->
diff --git a/evaluation/metrics/bootstrap_mlflow.py b/evaluation/metrics/bootstrap_mlflow.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+from pathlib import Path
 
 import mlflow
 from azure.ai.ml import MLClient
@@ -29,7 +30,8 @@
     experiment_name = f"lerobot-{os.environ.get('POLICY_TYPE', 'act')}-inference"
 mlflow.set_experiment(experiment_name)
 
-with open("/tmp/mlflow_config.env", "w") as f:
+config_path = Path(os.environ.get("MLFLOW_CONFIG_PATH", "/tmp/mlflow_config.env"))
+with config_path.open("w") as f:
     f.write(f"MLFLOW_TRACKING_URI={tracking_uri}\n")
     f.write(f"MLFLOW_EXPERIMENT_NAME={experiment_name}\n")
 

diff --git a/evaluation/metrics/upload_artifacts.py b/evaluation/metrics/upload_artifacts.py
@@ -7,7 +7,7 @@
 import json
 import os
 import traceback
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from urllib.parse import urlparse
 
@@ -286,7 +286,7 @@ def main() -> None:
     blob_container = os.environ.get("BLOB_CONTAINER", "")
     onnx_success = os.environ["ONNX_SUCCESS"] == "1"
     jit_success = os.environ["JIT_SUCCESS"] == "1"
-    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
 
     onnx_metrics, jit_metrics = load_metrics(metrics_dir)