diff --git a/.cspell.json b/.cspell.json
index e74e589b..128c516c 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -6,6 +6,15 @@
 	"flagWords": [
 		"TODO"
 	],
+	"words": [
+		"envaccount",
+		"envcontainer",
+		"myacct",
+		"mycontainer",
+		"noseparator",
+		"preds",
+		"xticklabels"
+	],
 	"ignorePaths": [
 		"**/.github/chatmodes/**",
 		"**/node_modules/**",
diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt
index 5b92a679..c3c80224 100644
--- a/.cspell/general-technical.txt
+++ b/.cspell/general-technical.txt
@@ -593,6 +593,7 @@ interoperates
 interpretability
 interpretml
 interzone
+intoto
 intracloud
 intrazone
 intune
@@ -2025,3 +2026,6 @@ WEBADM
 WEBACCESS
 vsmagent
 vsmserver
+envaccount
+envcontainer
+mycontainer
diff --git a/.github/workflows/check-binary-integrity.yml b/.github/workflows/check-binary-integrity.yml
index 26f70da2..e2a2aff3 100644
--- a/.github/workflows/check-binary-integrity.yml
+++ b/.github/workflows/check-binary-integrity.yml
@@ -8,7 +8,6 @@ on:
 
 permissions:
   contents: read
-  security-events: write
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
diff --git a/.github/workflows/evaluation-pytests.yml b/.github/workflows/evaluation-pytests.yml
new file mode 100644
index 00000000..297af9a3
--- /dev/null
+++ b/.github/workflows/evaluation-pytests.yml
@@ -0,0 +1,62 @@
+name: Evaluation Pytest Tests
+
+on:
+  workflow_call:
+    inputs:
+      code-coverage:
+        description: 'Enable Codecov coverage upload'
+        required: false
+        default: false
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  pytest:
+    name: Evaluation Pytest
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: evaluation  # applies to run steps only; uses steps resolve paths from the workspace root
+    permissions:
+      contents: read
+      id-token: write  # needed for the Codecov OIDC upload (use_oidc: true)
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: '3.12'
+
+      - name: Setup uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+
+      - name: Install dependencies
+        run: uv sync --only-group dev  # install just the dev dependency group
+
+      - name: Run pytest
+        run: uv run --only-group dev pytest -v
+
+      - name: Upload coverage.xml artifact
+        if: ${{ inputs.code-coverage && !cancelled() }}  # still runs after a pytest failure, but not after cancellation
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: evaluation-pytest-coverage-xml
+          path: evaluation/logs/coverage.xml  # workspace-relative, hence the evaluation/ prefix
+          retention-days: 30
+
+      - name: Upload coverage to Codecov
+        if: ${{ inputs.code-coverage && !cancelled() }}  # same gating as the artifact upload above
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0
+        with:
+          files: evaluation/logs/coverage.xml
+          use_oidc: true
+          fail_ci_if_error: false  # Codecov upload problems do not fail the job
+          verbose: true
+          flags: pytest-evaluation
+          name: evaluation-pytest-coverage
diff --git a/.github/workflows/pr-validation.yml b/.github/workflows/pr-validation.yml
index a3ccc2cb..c8cce712 100644
--- a/.github/workflows/pr-validation.yml
+++ b/.github/workflows/pr-validation.yml
@@ -156,6 +156,16 @@ jobs:
       contents: read
       id-token: write
 
+  # Evaluation domain pytest execution (reusable workflow; coverage is uploaded to Codecov via OIDC)
+  evaluation-pytests:
+    name: Evaluation Pytest
+    uses: ./.github/workflows/evaluation-pytests.yml
+    with:
+      code-coverage: true
+    permissions:
+      contents: read
+      id-token: write  # the called workflow's job requests id-token: write, so the caller must grant it
+
+
   # Fuzz regression via deterministic corpus-based tests
   fuzz-regression-tests:
     name: Fuzz Regression Tests
@@ -254,3 +264,54 @@ jobs:
       contents: read
       security-events: write
       actions: read
+
+  # Aggregator status check required by branch protection.
+  # Reports success only when every upstream PR validation job succeeded or was skipped.
+  pr-validation-summary:
+    name: pr-validation-summary
+    if: ${{ always() }}  # evaluate even when upstream jobs failed or were skipped
+    runs-on: ubuntu-latest
+    needs:
+      - spell-check
+      - markdown-lint
+      - table-format
+      - frontmatter-validation
+      - msdate-freshness
+      - psscriptanalyzer
+      - yaml-lint
+      - link-lang-check
+      - markdown-link-check
+      - dependency-review
+      - dependency-pinning
+      - pester-tests
+      - dataviewer-frontend-tests
+      - docusaurus-tests
+      - pytest-tests
+      - dataviewer-backend-pytests
+      - evaluation-pytests
+      - fuzz-regression-tests
+      - python-lint
+      - terraform-lint
+      - terraform-validation
+      - terraform-tests
+      - go-lint
+      - terraform-docs-check
+      - go-tests
+      - shellcheck
+      - codeql-analysis
+    permissions:
+      contents: read
+    steps:
+      - name: Verify all upstream jobs succeeded
+        env:
+          NEEDS_JSON: ${{ toJSON(needs) }}  # handed over via env instead of being interpolated into the script
+        run: |
+          echo "Upstream job results:"
+          echo "$NEEDS_JSON"
+          failed=$(echo "$NEEDS_JSON" | jq -r 'to_entries[] | select(.value.result != "success" and .value.result != "skipped") | .key')
+          if [ -n "$failed" ]; then
+            echo "::error::One or more upstream PR validation jobs did not succeed:"
+            echo "$failed"
+            exit 1
+          fi
+          echo "All upstream PR validation jobs succeeded or were skipped."
diff --git a/codecov.yml b/codecov.yml
index c11f6f3d..e784a035 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -5,6 +5,7 @@
 #   pester — PowerShell tests covering scripts/
 #   pytest — Python tests covering training/
 #   pytest-dataviewer — Dataviewer backend tests covering data-management/viewer/backend/src/
+#   pytest-evaluation — Evaluation tests covering evaluation/
 #   pytest-fuzz — Python fuzz regression tests covering tests/, backend, and training
 #   terraform — Terraform tests covering infrastructure/terraform/
 #   vitest — Vitest tests covering data-management/viewer/frontend/src/
@@ -57,6 +58,11 @@ coverage:
           - pytest-fuzz
         target: auto
         threshold: 1%
+      pytest-evaluation:
+        flags:
+          - pytest-evaluation
+        target: auto
+        threshold: 1%
     patch:
       default:
         target: auto
@@ -89,6 +95,10 @@ coverage:
         flags:
           - pytest-fuzz
         informational: true
+      pytest-evaluation:
+        flags:
+          - pytest-evaluation
+        informational: true
 
 flags:
   go:
@@ -121,6 +131,10 @@ flags:
       - data-management/viewer/backend/src/
       - training/
     carryforward: true
+  pytest-evaluation:
+    paths:
+      - evaluation/
+    carryforward: true
 
 parsers:
   jacoco:
diff --git a/data-management/viewer/backend/Dockerfile b/data-management/viewer/backend/Dockerfile
index 1b4c1021..47846b47 100644
--- a/data-management/viewer/backend/Dockerfile
+++ b/data-management/viewer/backend/Dockerfile
@@ -8,8 +8,26 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
     && rm -rf /var/lib/apt/lists/*
 
-# Install uv
-RUN pip install --no-cache-dir uv==0.10.9
+# Install uv, hash-pinned for OSSF Scorecard Pinned-Dependencies (covers all uv 0.10.9 wheels); pip accepts --hash only inside a requirements file, so one is piped in via -r /dev/stdin
+RUN printf '%s\n' "uv==0.10.9 \
+    --hash=sha256:0649f83fa0f44f18627c00b2a9a60e5c3486a34799b2c874f2b3945b76048a67 \
+    --hash=sha256:880dd4cffe4bd184e8871ddf4c7d3c3b042e1f16d2682310644aa8d61eaea3e6 \
+    --hash=sha256:a7a784254380552398a6baf4149faf5b31a4003275f685c28421cf8197178a08 \
+    --hash=sha256:5ea0e8598fa012cfa4480ecad4d112bc70f514157c3cc1555a7611c7b6b1ab0a \
+    --hash=sha256:2d6b5367e9bf87eca51c0f2ecda26a1ff931e41409977b4f0a420de2f3e617cf \
+    --hash=sha256:bd04e34db27f9a1d5a0871980edc9f910bb11afbc4abca8234d5a363cbe63c04 \
+    --hash=sha256:547deb57311fc64e4a6b8336228fca4cb4dcbeabdc6e85f14f7804dcd0bc8cd2 \
+    --hash=sha256:e0091b6d0b666640d7407a433860184f77667077b73564e86d49c2a851f073a8 \
+    --hash=sha256:81b2286e6fd869e3507971f39d14829c03e2e31caa8ecc6347b0ffacabb95a5b \
+    --hash=sha256:7c9d6deb30edbc22123be75479f99fb476613eaf38a8034c0e98bba24a344179 \
+    --hash=sha256:24b1ce6d626e06c4582946b6af07b08a032fcccd81fe54c3db3ed2d1c63a97dc \
+    --hash=sha256:fa3401780273d96a2960dbeab58452ce1b387ad8c5da25be6221c0188519e21d \
+    --hash=sha256:8f94a31832d2b4c565312ea17a71b8dd2f971e5aa570c5b796a27b2c9fcdb163 \
+    --hash=sha256:842c39c19d9072f1ad53c71bb4ecd1c9caa311d5de9d19e09a636274a6c95e2e \
+    --hash=sha256:ed44047c602449916ba18a8596715ef7edbbd00859f3db9eac010dc62a0edd30 \
+    --hash=sha256:af79552276d8bd622048ab2d67ec22120a6af64d83963c46b1482218c27b571f \
+    --hash=sha256:47e18a0521d76293d4f60d129f520b18bddf1976b4a47b50f0fcb04fb6a9d40f \
+    --hash=sha256:31e76ae92e70fec47c3efab0c8094035ad7a578454482415b496fa39fc4d685c" | pip install --no-cache-dir --require-hashes -r /dev/stdin
 
 # Copy dependency manifests
 COPY pyproject.toml uv.lock ./
diff --git a/docs/security/README.md b/docs/security/README.md
index da7485c3..7951a55e 100644
--- a/docs/security/README.md
+++ b/docs/security/README.md
@@ -24,6 +24,7 @@ Security documentation for the Physical AI Toolchain covering threat analysis, d
 | [Threat Model](threat-model.md)                                                         | STRIDE-based threat analysis and remediation roadmap             |
 | [Deployment Security Guide](../operations/security-guide.md)                            | Security configuration inventory and deployment responsibilities |
 | [Release Verification](release-verification.md)                                         | Verify release artifact provenance and SBOM attestations         |
+| [Workflow Permissions](workflow-permissions.md)                                         | GitHub Actions permission scopes and OSSF Scorecard exceptions   |
 | [SECURITY.md](https://github.com/microsoft/physical-ai-toolchain/blob/main/SECURITY.md) | Vulnerability disclosure and reporting process                   |
 
 ## 🔒 Security Posture
diff --git a/docs/security/workflow-permissions.md b/docs/security/workflow-permissions.md
new file mode 100644
index 00000000..0a7b9c2d
--- /dev/null
+++ b/docs/security/workflow-permissions.md
@@ -0,0 +1,69 @@
+---
+sidebar_position: 4
+title: Workflow Permissions
+description: GitHub Actions permission scopes and OSSF Scorecard Token-Permissions exception rationale
+author: Microsoft Robotics-AI Team
+ms.date: 2026-02-22
+ms.topic: reference
+keywords:
+  - security
+  - github-actions
+  - permissions
+  - ossf-scorecard
+  - token-permissions
+---
+
+## 📋 Overview
+
+All GitHub Actions workflows in this repository follow the [OpenSSF Scorecard Token-Permissions](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions) principle:
+
+- Top-level `permissions:` is `contents: read` (read-only by default).
+- Write-scoped permissions are declared at the **job level** only when a specific step requires them.
+- No workflow grants `permissions: write-all` or omits an explicit top-level `permissions:` block.
+
+This document enumerates every job-scoped write permission across `.github/workflows/`, other than `id-token: write` (OIDC token requests, covered under Defense in Depth), and records the justification so security auditors and Scorecard reviewers can verify each exception.
+
+## 🔒 Job-Scoped Write Permissions
+
+The 15 write permissions below are required by the action or CLI invoked in the corresponding job. Each grant is the minimum scope needed.
+
+| Workflow                          | Job                            | Permission                | Rationale                                                                                                                              |
+|-----------------------------------|--------------------------------|---------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
+| `check-binary-integrity.yml`      | `check-binary-integrity`       | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish binary integrity findings to the Security tab.                              |
+| `codeql-analysis.yml`             | `analyze`                      | `security-events: write`  | Required by `github/codeql-action/analyze` to upload CodeQL SARIF results to the Security tab.                                         |
+| `dast-zap-scan.yml`               | `dast-zap-scan`                | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish ZAP DAST findings to the Security tab.                                      |
+| `dependency-pinning-scan.yml`     | `dependency-pinning-scan`      | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish SHA-pinning findings to the Security tab.                                   |
+| `gitleaks-scan.yml`               | `scan`                         | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish secret-scanning findings to the Security tab.                               |
+| `main.yml`                        | `dependency-pinning`           | `security-events: write`  | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload.                                                        |
+| `main.yml`                        | `codeql-analysis`              | `security-events: write`  | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload.                                                                |
+| `main.yml`                        | `generate-dependency-sbom`     | `contents: write`         | Required by `gh release upload "${TAG}" dependencies.spdx.json --clobber` to attach the dependency SBOM to the release.                |
+| `main.yml`                        | `attest-release`               | `attestations: write`     | Required by `actions/attest-build-provenance` and `actions/attest` to create Sigstore provenance attestations.                         |
+| `main.yml`                        | `attest-release`               | `contents: write`         | Required by `gh release upload` to attach `*.sigstore.json` and `*.intoto.jsonl` attestation artifacts to the release.                 |
+| `main.yml`                        | `sbom-diff`                    | `contents: write`         | Required by `gh release upload "${TAG}" dependency-diff.md --clobber` to attach the dependency-change report to the release.           |
+| `main.yml`                        | `append-verification-notes`    | `contents: write`         | Required by `gh release edit` to append artifact-verification instructions to the release body.                                        |
+| `pr-validation.yml`               | `dependency-pinning`           | `security-events: write`  | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload.                                                        |
+| `pr-validation.yml`               | `codeql-analysis`              | `security-events: write`  | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload.                                                                |
+| `scorecard.yml`                   | `analysis`                     | `security-events: write`  | Required by `github/codeql-action/upload-sarif` to publish OpenSSF Scorecard findings to the Security tab.                             |
+
+## 🛡️ Defense in Depth
+
+The release-publishing path uses additional hardening beyond minimum permissions:
+
+- All actions are SHA-pinned (no floating tags).
+- `persist-credentials: false` on every `actions/checkout` invocation.
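+- Within the release path, `id-token: write` is granted only to jobs that mint Sigstore OIDC tokens; the token is never exposed to user-controlled steps. (Coverage-upload test jobs such as `evaluation-pytests` also hold `id-token: write`, for Codecov OIDC uploads.)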
+- Release-gated jobs (`generate-dependency-sbom`, `attest-release`, `sbom-diff`, `append-verification-notes`) run only when `release-please` produces a release (`needs.release-please.outputs.release_created == 'true'`).
+
+## 🔗 Related Resources
+
+- [OpenSSF Scorecard Token-Permissions check](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions)
+- [GitHub Actions: Assigning permissions to jobs](https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs)
+- [Release Verification](release-verification.md)
+- [Threat Model](threat-model.md)
+
+<!-- markdownlint-configure-file { "MD024": false } -->
+
+<!-- markdownlint-disable MD036 -->
+*🤖 Crafted with precision by ✨Copilot following brilliant human instruction,
+then carefully refined by our team of discerning human reviewers.*
+<!-- markdownlint-enable MD036 -->
diff --git a/evaluation/metrics/bootstrap_mlflow.py b/evaluation/metrics/bootstrap_mlflow.py
index 5e19bab8..2374cbe6 100644
--- a/evaluation/metrics/bootstrap_mlflow.py
+++ b/evaluation/metrics/bootstrap_mlflow.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+from pathlib import Path
 
 import mlflow
 from azure.ai.ml import MLClient
@@ -29,7 +30,8 @@
     experiment_name = f"lerobot-{os.environ.get('POLICY_TYPE', 'act')}-inference"
 mlflow.set_experiment(experiment_name)
 
-with open("/tmp/mlflow_config.env", "w") as f:
+config_path = Path(os.environ.get("MLFLOW_CONFIG_PATH", "/tmp/mlflow_config.env"))
+with config_path.open("w") as f:
     f.write(f"MLFLOW_TRACKING_URI={tracking_uri}\n")
     f.write(f"MLFLOW_EXPERIMENT_NAME={experiment_name}\n")
 
diff --git a/evaluation/metrics/upload_artifacts.py b/evaluation/metrics/upload_artifacts.py
index a2c9c00a..0f00d982 100644
--- a/evaluation/metrics/upload_artifacts.py
+++ b/evaluation/metrics/upload_artifacts.py
@@ -7,7 +7,7 @@
 import json
 import os
 import traceback
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from urllib.parse import urlparse
 
@@ -286,7 +286,7 @@ def main() -> None:
     blob_container = os.environ.get("BLOB_CONTAINER", "")
     onnx_success = os.environ["ONNX_SUCCESS"] == "1"
     jit_success = os.environ["JIT_SUCCESS"] == "1"
-    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
 
     onnx_metrics, jit_metrics = load_metrics(metrics_dir)
 
diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml
index 2f9ea0dc..b7e0b6d8 100644
--- a/evaluation/pyproject.toml
+++ b/evaluation/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "azure-ai-ml==1.32.0",
     "marshmallow==3.26.2",
     "mlflow==3.11.1",
-    "packaging==26.1",
+    "packaging==25.0",
     "psutil==7.2.2",
     "pynvml==13.0.1",
     "pyperclip==1.11.0",
@@ -31,3 +31,55 @@ build-backend = "hatchling.build"
 
 [tool.uv]
 package = false
+
+[dependency-groups]
+dev = [
+    "pytest==9.0.3",
+    "pytest-mock==3.15.1",
+    "pytest-cov==7.1.0",
+    "hypothesis==6.151.13",
+    "matplotlib==3.10.8",
+    "numpy==2.2.6",
+    "torch==2.10.0",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]
+addopts = [
+    "-ra",
+    "--strict-markers",
+    "--strict-config",
+    "--cov=sil",
+    "--cov=metrics",
+    "--cov-report=term-missing",
+    "--cov-report=xml:logs/coverage.xml",
+    "--junitxml=logs/pytest-results.xml",
+]
+markers = [
+    "e2e: marks tests requiring Isaac Sim or GPU hardware (deselect: -m 'not e2e')",
+]
+
+[tool.coverage.run]
+source = ["sil", "metrics"]
+branch = true
+omit = [
+    "**/conftest.py",
+    "**/__init__.py",
+    "sil/monitor_checkpoints.py",
+    "sil/play.py",
+    "sil/play_policy.py",
+]
+
+[tool.coverage.report]
+show_missing = true
+precision = 2
+exclude_lines = [
+    "pragma: no cover",
+    "if __name__ == .__main__.",
+    "if TYPE_CHECKING:",
+    "raise NotImplementedError",
+]
+
+[tool.coverage.xml]
+output = "logs/coverage.xml"
diff --git a/evaluation/sil/docker/Dockerfile.lerobot-eval b/evaluation/sil/docker/Dockerfile.lerobot-eval
index fb468076..48daf584 100644
--- a/evaluation/sil/docker/Dockerfile.lerobot-eval
+++ b/evaluation/sil/docker/Dockerfile.lerobot-eval
@@ -5,7 +5,26 @@ RUN apt-get update -qq && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 COPY requirements-lerobot-eval.txt .
-RUN pip install --no-cache-dir "uv==0.10.9" && \
+# Install uv, hash-pinned for OSSF Scorecard Pinned-Dependencies (covers all uv 0.10.9 wheels); pip accepts --hash only inside a requirements file, so one is piped in via -r /dev/stdin
+RUN printf '%s\n' "uv==0.10.9 \
+    --hash=sha256:0649f83fa0f44f18627c00b2a9a60e5c3486a34799b2c874f2b3945b76048a67 \
+    --hash=sha256:880dd4cffe4bd184e8871ddf4c7d3c3b042e1f16d2682310644aa8d61eaea3e6 \
+    --hash=sha256:a7a784254380552398a6baf4149faf5b31a4003275f685c28421cf8197178a08 \
+    --hash=sha256:5ea0e8598fa012cfa4480ecad4d112bc70f514157c3cc1555a7611c7b6b1ab0a \
+    --hash=sha256:2d6b5367e9bf87eca51c0f2ecda26a1ff931e41409977b4f0a420de2f3e617cf \
+    --hash=sha256:bd04e34db27f9a1d5a0871980edc9f910bb11afbc4abca8234d5a363cbe63c04 \
+    --hash=sha256:547deb57311fc64e4a6b8336228fca4cb4dcbeabdc6e85f14f7804dcd0bc8cd2 \
+    --hash=sha256:e0091b6d0b666640d7407a433860184f77667077b73564e86d49c2a851f073a8 \
+    --hash=sha256:81b2286e6fd869e3507971f39d14829c03e2e31caa8ecc6347b0ffacabb95a5b \
+    --hash=sha256:7c9d6deb30edbc22123be75479f99fb476613eaf38a8034c0e98bba24a344179 \
+    --hash=sha256:24b1ce6d626e06c4582946b6af07b08a032fcccd81fe54c3db3ed2d1c63a97dc \
+    --hash=sha256:fa3401780273d96a2960dbeab58452ce1b387ad8c5da25be6221c0188519e21d \
+    --hash=sha256:8f94a31832d2b4c565312ea17a71b8dd2f971e5aa570c5b796a27b2c9fcdb163 \
+    --hash=sha256:842c39c19d9072f1ad53c71bb4ecd1c9caa311d5de9d19e09a636274a6c95e2e \
+    --hash=sha256:ed44047c602449916ba18a8596715ef7edbbd00859f3db9eac010dc62a0edd30 \
+    --hash=sha256:af79552276d8bd622048ab2d67ec22120a6af64d83963c46b1482218c27b571f \
+    --hash=sha256:47e18a0521d76293d4f60d129f520b18bddf1976b4a47b50f0fcb04fb6a9d40f \
+    --hash=sha256:31e76ae92e70fec47c3efab0c8094035ad7a578454482415b496fa39fc4d685c" | pip install --no-cache-dir --require-hashes -r /dev/stdin && \
     uv pip install -r requirements-lerobot-eval.txt --system
 
 # Pre-download ResNet18 backbone weights (required by ACT policy, no internet at runtime)
diff --git a/evaluation/sil/scripts/download_aml_model.py b/evaluation/sil/scripts/download_aml_model.py
index 334bc084..2834e14d 100644
--- a/evaluation/sil/scripts/download_aml_model.py
+++ b/evaluation/sil/scripts/download_aml_model.py
@@ -16,7 +16,7 @@
 
 model_name = os.environ["AML_MODEL_NAME"]
 model_version = os.environ["AML_MODEL_VERSION"]
-download_dir = Path("/tmp/aml-model")
+download_dir = Path(os.environ.get("AML_DOWNLOAD_DIR", "/tmp/aml-model"))
 download_dir.mkdir(parents=True, exist_ok=True)
 
 print(f"Downloading {model_name}:{model_version}...")
@@ -31,7 +31,8 @@
         model_path = candidate
         break
 
-with open("/tmp/aml_model_path.env", "w") as f:
+config_path = Path(os.environ.get("AML_CONFIG_PATH", "/tmp/aml_model_path.env"))
+with config_path.open("w") as f:
     f.write(f"AML_MODEL_PATH={model_path}\n")
 
 print(f"Model downloaded to: {model_path}")
diff --git a/evaluation/sil/scripts/download_blob_dataset.py b/evaluation/sil/scripts/download_blob_dataset.py
index 1eb4d372..63048517 100644
--- a/evaluation/sil/scripts/download_blob_dataset.py
+++ b/evaluation/sil/scripts/download_blob_dataset.py
@@ -9,7 +9,8 @@
 account = os.environ["BLOB_STORAGE_ACCOUNT"]
 container = os.environ.get("BLOB_STORAGE_CONTAINER", "datasets")
 prefix = os.environ["BLOB_PREFIX"]
-local_root = Path("/workspace/data") / prefix.replace("/", "_")
+data_root = Path(os.environ.get("DATA_ROOT", "/workspace/data"))
+local_root = data_root / prefix.replace("/", "_")
 local_root.mkdir(parents=True, exist_ok=True)
 
 credential = DefaultAzureCredential()
@@ -28,7 +29,8 @@
     with open(local_path, "wb") as f:
         f.write(client.download_blob(blob.name).readall())
 
-with open("/tmp/dataset_path.env", "w") as f:
+config_path = Path(os.environ.get("DATASET_CONFIG_PATH", "/tmp/dataset_path.env"))
+with config_path.open("w") as f:
     f.write(f"DATASET_DIR={local_root}\n")
 
 print(f"Dataset downloaded to: {local_root}")
diff --git a/evaluation/tests/conftest.py b/evaluation/tests/conftest.py
new file mode 100644
index 00000000..d67ebd6b
--- /dev/null
+++ b/evaluation/tests/conftest.py
@@ -0,0 +1,77 @@
+"""Shared fixtures for evaluation tests."""
+
+from __future__ import annotations
+
+import sys
+from unittest.mock import MagicMock
+
+# Stub the cross-package training.rl.simulation_shutdown import used by sil.policy_evaluation
+# so test collection does not require the training package on PYTHONPATH.
+if "training" not in sys.modules:
+    _training = MagicMock()
+    _training.rl = MagicMock()
+    _training.rl.simulation_shutdown = MagicMock()
+    _training.rl.simulation_shutdown.prepare_for_shutdown = MagicMock()
+    sys.modules["training"] = _training
+    sys.modules["training.rl"] = _training.rl
+    sys.modules["training.rl.simulation_shutdown"] = _training.rl.simulation_shutdown
+
+import numpy as np
+import pytest
+from sil.robot_types import IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_JOINTS
+
+
+@pytest.fixture
+def rng() -> np.random.Generator:
+    """Seeded random generator for reproducible tests."""
+    return np.random.default_rng(42)
+
+
+@pytest.fixture
+def joint_positions() -> np.ndarray:
+    """Valid joint position array of shape ``(NUM_JOINTS,)``."""
+    return np.zeros(NUM_JOINTS, dtype=np.float64)
+
+
+@pytest.fixture
+def random_joint_positions(rng: np.random.Generator) -> np.ndarray:
+    """Random joint positions in ``[-pi, pi]``."""
+    return rng.uniform(-np.pi, np.pi, size=(NUM_JOINTS,))
+
+
+@pytest.fixture
+def color_image() -> np.ndarray:
+    """Valid color image array of shape ``(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)``."""
+    return np.zeros((IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), dtype=np.uint8)
+
+
+@pytest.fixture
+def action_arrays(rng: np.random.Generator) -> tuple[np.ndarray, np.ndarray]:
+    """Predicted and ground truth action delta arrays of shape ``(100, NUM_JOINTS)``."""
+    predicted = rng.normal(0, 0.01, size=(100, NUM_JOINTS))
+    ground_truth = rng.normal(0, 0.01, size=(100, NUM_JOINTS))
+    return predicted, ground_truth
+
+
+@pytest.fixture
+def inference_times(rng: np.random.Generator) -> np.ndarray:
+    """Per-step inference times in seconds, shape ``(100,)``."""
+    return rng.uniform(0.001, 0.01, size=(100,))
+
+
+@pytest.fixture
+def mock_azure_ml(monkeypatch: pytest.MonkeyPatch) -> tuple[MagicMock, MagicMock]:
+    """Inject mock Azure ML and Identity modules into ``sys.modules`` and set env vars."""
+    mock_ml = MagicMock()
+    mock_identity = MagicMock()
+
+    for mod in ("azure", "azure.ai"):
+        monkeypatch.setitem(sys.modules, mod, MagicMock())
+    monkeypatch.setitem(sys.modules, "azure.ai.ml", mock_ml)
+    monkeypatch.setitem(sys.modules, "azure.identity", mock_identity)
+
+    monkeypatch.setenv("AZURE_SUBSCRIPTION_ID", "test-sub-id")
+    monkeypatch.setenv("AZURE_RESOURCE_GROUP", "test-rg")
+    monkeypatch.setenv("AZUREML_WORKSPACE_NAME", "test-ws")
+
+    return mock_ml, mock_identity
diff --git a/evaluation/tests/test_batch_lerobot_eval.py b/evaluation/tests/test_batch_lerobot_eval.py
new file mode 100644
index 00000000..ac45ef61
--- /dev/null
+++ b/evaluation/tests/test_batch_lerobot_eval.py
@@ -0,0 +1,314 @@
+"""Unit tests for ``sil/scripts/batch-lerobot-eval.py``."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+import types
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+# The script imports ``inference.plotting`` at module level.  Provide a
+# lightweight stub so the module can be loaded without the full
+# inference package.
+_inference = sys.modules.setdefault("inference", types.ModuleType("inference"))
+_inference_plotting = sys.modules.setdefault("inference.plotting", types.ModuleType("inference.plotting"))
+for _name in (
+    "plot_action_deltas",
+    "plot_aggregate_summary",
+    "plot_cumulative_positions",
+    "plot_error_heatmap",
+    "plot_summary_panel",
+):
+    # Another test module may have registered a partial stub first; fill only the gaps.
+    if not hasattr(_inference_plotting, _name):
+        setattr(_inference_plotting, _name, lambda *a, **kw: None)
+_inference.plotting = _inference_plotting  # type: ignore[attr-defined]
+
+_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "batch-lerobot-eval.py"
+_spec = importlib.util.spec_from_file_location("batch_lerobot_eval", _SCRIPT)
+assert _spec and _spec.loader
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)
+
+parse_episode_range = _mod.parse_episode_range
+run_inference = _mod.run_inference
+plot_episode = _mod.plot_episode
+
+
+class TestParseEpisodeRange:  # spec strings combine single numbers and ``a-b`` ranges
+    def test_single_number(self) -> None:
+        assert parse_episode_range("5") == [5]
+
+    def test_comma_separated(self) -> None:
+        assert parse_episode_range("0,2,5") == [0, 2, 5]
+
+    def test_range(self) -> None:
+        assert parse_episode_range("1-3") == [1, 2, 3]  # both endpoints are included
+
+    def test_mixed_range_and_numbers(self) -> None:
+        assert parse_episode_range("0,2,5-7") == [0, 2, 5, 6, 7]
+
+    def test_deduplication(self) -> None:
+        assert parse_episode_range("1,1,2") == [1, 2]  # repeated entries collapse to one
+
+    def test_overlapping_range_and_number(self) -> None:
+        assert parse_episode_range("3,1-5") == [1, 2, 3, 4, 5]  # 3 is already covered by 1-5
+
+    def test_single_element_range(self) -> None:
+        assert parse_episode_range("4-4") == [4]
+
+    def test_result_is_sorted(self) -> None:
+        assert parse_episode_range("9,1,5") == [1, 5, 9]  # output order is ascending, not input order
+
+    def test_whitespace_handling(self) -> None:
+        assert parse_episode_range(" 1 , 3 , 5 ") == [1, 3, 5]  # padding around tokens is ignored
+
+    def test_invalid_value_raises(self) -> None:
+        with pytest.raises(ValueError):
+            parse_episode_range("abc")
+
+
+class TestRunInference:
+    def test_cached_predictions_returned(self, tmp_path: Path) -> None:
+        cached = tmp_path / "ep001_predictions.npz"
+        cached.write_bytes(b"cached")  # output already on disk; ``subprocess.run`` is left unpatched here
+        returned = run_inference(1, "repo", "dataset", "cpu", tmp_path)
+        assert returned == cached
+
+    def test_successful_inference_returns_path(self, tmp_path: Path, monkeypatch) -> None:
+        completed = MagicMock(returncode=0, stdout="MSE: 0.01\n", stderr="")
+        monkeypatch.setattr(_mod.subprocess, "run", lambda *args, **kwargs: completed)
+        returned = run_inference(1, "repo", "dataset", "cpu", tmp_path)
+        assert returned == tmp_path / "ep001_predictions.npz"
+
+    def test_failed_inference_returns_none(self, tmp_path: Path, monkeypatch) -> None:
+        completed = MagicMock(returncode=1, stdout="", stderr="Error occurred")
+        monkeypatch.setattr(_mod.subprocess, "run", lambda *args, **kwargs: completed)
+        returned = run_inference(1, "repo", "dataset", "cpu", tmp_path)
+        assert returned is None
+
+    def test_metric_lines_printed(self, tmp_path: Path, monkeypatch, capsys) -> None:
+        completed = MagicMock(returncode=0, stdout="Loading model...\nMSE: 0.01\nMAE: 0.05\nDone.", stderr="")
+        monkeypatch.setattr(_mod.subprocess, "run", lambda *args, **kwargs: completed)
+        run_inference(1, "repo", "dataset", "cpu", tmp_path)
+        printed = capsys.readouterr().out
+        assert "MSE: 0.01" in printed
+        assert "MAE: 0.05" in printed
+
+
+class TestPlotEpisode:
+    @staticmethod
+    def _stub_plotting(monkeypatch) -> MagicMock:
+        fake_figure = MagicMock()
+        for plot_fn in (
+            "plot_action_deltas",
+            "plot_cumulative_positions",
+            "plot_error_heatmap",
+            "plot_summary_panel",
+        ):
+            monkeypatch.setattr(_mod, plot_fn, lambda *args, **kwargs: fake_figure)
+        monkeypatch.setattr(_mod.plt, "close", lambda *args: None)
+        return fake_figure
+
+    def test_returns_metrics_dict(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        predicted = np.array([[1.0, 2.0], [3.0, 4.0]])
+        ground_truth = np.array([[1.1, 2.1], [3.1, 4.1]])
+        timings = np.array([0.01, 0.02])
+        archive = tmp_path / "predictions.npz"
+        np.savez(archive, predicted=predicted, ground_truth=ground_truth, inference_times=timings)
+
+        result = plot_episode(1, archive, tmp_path, fps=30.0, dpi=150)
+
+        assert result is not None
+        assert result["episode"] == 1
+        assert result["steps"] == 2
+        assert "mse" in result
+        assert "mae" in result
+        assert "per_joint_mae" in result
+        assert "avg_inference_ms" in result
+        assert "throughput_hz" in result
+
+    def test_creates_episode_directory(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        predicted = np.array([[1.0]])
+        ground_truth = np.array([[1.1]])
+        timings = np.array([0.01])
+        archive = tmp_path / "predictions.npz"
+        np.savez(archive, predicted=predicted, ground_truth=ground_truth, inference_times=timings)
+
+        plot_episode(5, archive, tmp_path, fps=30.0, dpi=150)
+        assert (tmp_path / "episode_005").is_dir()
+
+    def test_metric_values(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        predicted = np.array([[0.0, 0.0]])
+        ground_truth = np.array([[1.0, 0.0]])
+        timings = np.array([0.01])
+        archive = tmp_path / "predictions.npz"
+        np.savez(archive, predicted=predicted, ground_truth=ground_truth, inference_times=timings)
+
+        result = plot_episode(1, archive, tmp_path, fps=30.0, dpi=150)
+
+        assert result["mse"] == pytest.approx(0.5)
+        assert result["mae"] == pytest.approx(0.5)
+        assert result["avg_inference_ms"] == pytest.approx(10.0)
+        assert result["throughput_hz"] == pytest.approx(100.0)
+
+    def test_zero_inference_time_throughput(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        predicted = np.array([[1.0]])
+        ground_truth = np.array([[1.0]])
+        timings = np.array([0.0])
+        archive = tmp_path / "predictions.npz"
+        np.savez(archive, predicted=predicted, ground_truth=ground_truth, inference_times=timings)
+
+        result = plot_episode(1, archive, tmp_path, fps=30.0, dpi=150)
+        assert result["throughput_hz"] == 0.0
+
+
+class TestMain:  # drives ``_mod.main()`` through a patched ``sys.argv``
+    @staticmethod
+    def _stub_plotting(monkeypatch) -> MagicMock:
+        mock_fig = MagicMock()
+        for name in (
+            "plot_action_deltas",
+            "plot_cumulative_positions",
+            "plot_error_heatmap",
+            "plot_summary_panel",
+            "plot_aggregate_summary",
+        ):
+            monkeypatch.setattr(_mod, name, lambda *a, _f=mock_fig, **kw: _f)
+        monkeypatch.setattr(_mod.plt, "close", lambda *a: None)
+        return mock_fig
+
+    def test_plot_only_generates_metrics_json(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        pred = np.array([[1.0, 2.0]])
+        gt = np.array([[1.1, 2.1]])
+        inf_times = np.array([0.01])
+        for ep in (1, 2):  # one prediction archive per requested episode
+            np.savez(
+                tmp_path / f"ep{ep:03d}_predictions.npz",
+                predicted=pred,
+                ground_truth=gt,
+                inference_times=inf_times,
+            )
+
+        out_dir = tmp_path / "output"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "batch-lerobot-eval",
+                "--plot-only",
+                "--npz-dir",
+                str(tmp_path),
+                "--episodes",
+                "1-2",
+                "--output-dir",
+                str(out_dir),
+            ],
+        )
+        _mod.main()
+
+        metrics_path = out_dir / "eval_metrics.json"
+        assert metrics_path.exists()
+        metrics = json.loads(metrics_path.read_text())
+        assert len(metrics) == 2  # one entry per episode
+
+    def test_missing_npz_skipped_in_plot_only(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        out_dir = tmp_path / "output"  # no npz files are written for this test
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "batch-lerobot-eval",
+                "--plot-only",
+                "--npz-dir",
+                str(tmp_path),
+                "--episodes",
+                "1",
+                "--output-dir",
+                str(out_dir),
+            ],
+        )
+        _mod.main()
+        assert not (out_dir / "eval_metrics.json").exists()
+
+    def test_requires_policy_and_dataset_without_plot_only(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "batch-lerobot-eval",
+                "--episodes",
+                "1",
+                "--output-dir",
+                str(tmp_path),
+            ],
+        )
+        with pytest.raises(SystemExit):  # neither --policy-repo nor --dataset-dir is supplied
+            _mod.main()
+
+    def test_plot_episode_returning_none_continues_loop(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        monkeypatch.setattr(_mod, "plot_episode", lambda *a, **kw: None)  # every episode yields no metrics
+        pred = np.array([[1.0, 2.0]])
+        gt = np.array([[1.1, 2.1]])
+        inf_times = np.array([0.01])
+        for ep in (1, 2):
+            np.savez(
+                tmp_path / f"ep{ep:03d}_predictions.npz",
+                predicted=pred,
+                ground_truth=gt,
+                inference_times=inf_times,
+            )
+
+        out_dir = tmp_path / "output"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "batch-lerobot-eval",
+                "--plot-only",
+                "--npz-dir",
+                str(tmp_path),
+                "--episodes",
+                "1-2",
+                "--output-dir",
+                str(out_dir),
+            ],
+        )
+        _mod.main()
+        assert not (out_dir / "eval_metrics.json").exists()
+
+    def test_inference_failure_skips_plotting(self, tmp_path: Path, monkeypatch) -> None:
+        self._stub_plotting(monkeypatch)
+        mock_result = MagicMock(returncode=1, stdout="", stderr="Error")  # non-zero exit for every episode
+        monkeypatch.setattr(_mod.subprocess, "run", lambda *a, **kw: mock_result)
+
+        out_dir = tmp_path / "output"
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "batch-lerobot-eval",
+                "--policy-repo",
+                "repo",
+                "--dataset-dir",
+                str(tmp_path),
+                "--episodes",
+                "1-2",
+                "--output-dir",
+                str(out_dir),
+            ],
+        )
+        _mod.main()
+        assert not (out_dir / "eval_metrics.json").exists()
diff --git a/evaluation/tests/test_bootstrap_mlflow.py b/evaluation/tests/test_bootstrap_mlflow.py
new file mode 100644
index 00000000..c0969087
--- /dev/null
+++ b/evaluation/tests/test_bootstrap_mlflow.py
@@ -0,0 +1,75 @@
+"""Unit tests for ``metrics.bootstrap_mlflow`` module-level script."""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+_EVAL_ROOT = Path(__file__).resolve().parent.parent
+_SCRIPT_PATH = _EVAL_ROOT / "metrics" / "bootstrap_mlflow.py"
+
+
+class TestBootstrapMlflow:
+    """Execute the bootstrap script via importlib with mocked Azure and MLflow."""
+
+    @pytest.fixture(autouse=True)
+    def _setup(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+        mock_azure_ml: tuple[MagicMock, MagicMock],
+        tmp_path: Path,
+    ) -> None:
+        """Install the MLflow/workspace mocks and isolate the test from ambient experiment settings."""
+        mock_ml, _ = mock_azure_ml
+        self.mock_mlflow = MagicMock()
+        monkeypatch.setitem(sys.modules, "mlflow", self.mock_mlflow)
+
+        self.mock_workspace = MagicMock()
+        self.mock_workspace.mlflow_tracking_uri = "azureml://test-tracking"
+        mock_ml.MLClient.return_value.workspaces.get.return_value = self.mock_workspace
+
+        self.config_path = tmp_path / "mlflow_config.env"
+        monkeypatch.setenv("MLFLOW_CONFIG_PATH", str(self.config_path))
+        # Values inherited from the developer/CI shell must not decide the experiment name.
+        monkeypatch.delenv("POLICY_TYPE", raising=False)
+        monkeypatch.delenv("EXPERIMENT_NAME", raising=False)
+
+    def _load_script(self) -> None:
+        """Run the script top to bottom by executing it as a freshly loaded module."""
+        spec = importlib.util.spec_from_file_location("bootstrap_mlflow", _SCRIPT_PATH)
+        assert spec is not None and spec.loader is not None
+        spec.loader.exec_module(importlib.util.module_from_spec(spec))
+
+    def test_writes_config_with_tracking_uri(self) -> None:
+        self._load_script()
+        content = self.config_path.read_text()
+        assert "MLFLOW_TRACKING_URI=azureml://test-tracking\n" in content
+
+    def test_default_experiment_name_uses_policy_type(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setenv("POLICY_TYPE", "diffusion")
+        self._load_script()
+        content = self.config_path.read_text()
+        assert "MLFLOW_EXPERIMENT_NAME=lerobot-diffusion-inference\n" in content
+
+    def test_custom_experiment_name(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setenv("EXPERIMENT_NAME", "my-experiment")
+        self._load_script()
+        content = self.config_path.read_text()
+        assert "MLFLOW_EXPERIMENT_NAME=my-experiment\n" in content
+
+    def test_none_experiment_falls_back_to_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        # The literal string "none" must yield the default name, which embeds the ``act`` policy type.
+        monkeypatch.setenv("EXPERIMENT_NAME", "none")
+        self._load_script()
+        content = self.config_path.read_text()
+        assert "MLFLOW_EXPERIMENT_NAME=lerobot-act-inference\n" in content
+
+    def test_missing_tracking_uri_exits(self) -> None:
+        self.mock_workspace.mlflow_tracking_uri = None
+        with pytest.raises(SystemExit) as exc_info:
+            self._load_script()
+        assert exc_info.value.code == 1
diff --git a/evaluation/tests/test_download_aml_model.py b/evaluation/tests/test_download_aml_model.py
new file mode 100644
index 00000000..c8c6b358
--- /dev/null
+++ b/evaluation/tests/test_download_aml_model.py
@@ -0,0 +1,95 @@
+"""Unit tests for ``sil.scripts.download_aml_model`` module-level script."""
+
+from __future__ import annotations
+
+import importlib.util
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+_EVAL_ROOT = Path(__file__).resolve().parent.parent
+_SCRIPT_PATH = _EVAL_ROOT / "sil" / "scripts" / "download_aml_model.py"
+
+
+class TestDownloadAmlModel:
+    """Execute the download script via importlib with mocked Azure SDK."""
+
+    @pytest.fixture(autouse=True)
+    def _setup(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+        mock_azure_ml: tuple[MagicMock, MagicMock],
+        tmp_path: Path,
+    ) -> None:
+        mock_ml, _ = mock_azure_ml
+        self.mock_client = mock_ml.MLClient.return_value  # the client instance the script will construct
+
+        monkeypatch.setenv("AML_MODEL_NAME", "test-model")
+        monkeypatch.setenv("AML_MODEL_VERSION", "3")
+
+        self.download_dir = tmp_path / "aml-model"
+        self.config_path = tmp_path / "aml_model_path.env"
+        monkeypatch.setenv("AML_DOWNLOAD_DIR", str(self.download_dir))
+        monkeypatch.setenv("AML_CONFIG_PATH", str(self.config_path))
+
+    def _load_script(self) -> None:  # executes the script body as a freshly loaded module
+        spec = importlib.util.spec_from_file_location("download_aml_model", _SCRIPT_PATH)
+        assert spec is not None and spec.loader is not None
+        spec.loader.exec_module(importlib.util.module_from_spec(spec))
+
+    def test_calls_download_with_model_info(self) -> None:
+        model_dir = self.download_dir / "test-model"
+        model_dir.mkdir(parents=True, exist_ok=True)
+        (model_dir / "weights.safetensors").write_bytes(b"\x00" * 64)
+
+        self._load_script()
+
+        self.mock_client.models.download.assert_called_once_with(
+            name="test-model",
+            version="3",
+            download_path=str(self.download_dir),
+        )
+
+    def test_writes_config_env(self) -> None:
+        model_dir = self.download_dir / "test-model"
+        model_dir.mkdir(parents=True, exist_ok=True)
+        (model_dir / "weights.safetensors").write_bytes(b"\x00" * 64)
+
+        self._load_script()
+
+        content = self.config_path.read_text()
+        assert content.startswith("AML_MODEL_PATH=")
+
+    def test_finds_safetensors_directory(self) -> None:
+        model_dir = self.download_dir / "test-model"
+        model_dir.mkdir(parents=True, exist_ok=True)
+        sub = model_dir / "checkpoint"
+        sub.mkdir()
+        (sub / "model.safetensors").write_bytes(b"\x00" * 64)
+
+        self._load_script()
+
+        content = self.config_path.read_text()
+        assert "checkpoint" in content  # the nested directory holding the weights is recorded
+
+    def test_finds_bin_directory(self) -> None:
+        model_dir = self.download_dir / "test-model"
+        model_dir.mkdir(parents=True, exist_ok=True)
+        sub = model_dir / "ckpt"
+        sub.mkdir()
+        (sub / "pytorch_model.bin").write_bytes(b"\x00" * 64)
+
+        self._load_script()
+
+        content = self.config_path.read_text()
+        assert "ckpt" in content
+
+    def test_falls_back_to_download_dir_when_no_model_name_dir(self) -> None:
+        self.download_dir.mkdir(parents=True, exist_ok=True)
+        (self.download_dir / "random_file.txt").write_bytes(b"\x00" * 64)
+
+        self._load_script()
+
+        content = self.config_path.read_text()
+        assert f"AML_MODEL_PATH={self.download_dir}" in content
diff --git a/evaluation/tests/test_download_blob_dataset.py b/evaluation/tests/test_download_blob_dataset.py
new file mode 100644
index 00000000..39425df3
--- /dev/null
+++ b/evaluation/tests/test_download_blob_dataset.py
@@ -0,0 +1,88 @@
+"""Unit tests for ``sil.scripts.download_blob_dataset`` module-level script."""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+_EVAL_ROOT = Path(__file__).resolve().parent.parent
+_SCRIPT_PATH = _EVAL_ROOT / "sil" / "scripts" / "download_blob_dataset.py"
+
+
+class TestDownloadBlobDataset:
+    """Execute the download script with mocked Azure SDK and redirected output paths."""
+
+    @pytest.fixture(autouse=True)
+    def _setup(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Mock azure.identity and azure.storage.blob via sys.modules.
+        mock_identity = MagicMock()
+        mock_blob = MagicMock()
+
+        blob_a = MagicMock()
+        blob_a.name = "myprefix/sub/file_a.bin"
+        blob_b = MagicMock()
+        blob_b.name = "myprefix/file_b.txt"
+        # Empty rel-path entry should be skipped.
+        blob_skip = MagicMock()
+        blob_skip.name = "myprefix/"
+
+        self.client = MagicMock()
+        self.client.list_blobs.return_value = [blob_a, blob_b, blob_skip]
+        download_stream = MagicMock()
+        download_stream.readall.return_value = b"data-bytes"  # same payload for every blob
+        self.client.download_blob.return_value = download_stream
+
+        mock_blob.ContainerClient.from_container_url.return_value = self.client
+        self.mock_blob = mock_blob
+        self.mock_identity = mock_identity
+
+        monkeypatch.setitem(sys.modules, "azure", MagicMock())
+        monkeypatch.setitem(sys.modules, "azure.identity", mock_identity)
+        monkeypatch.setitem(sys.modules, "azure.storage", MagicMock())
+        monkeypatch.setitem(sys.modules, "azure.storage.blob", mock_blob)
+
+        monkeypatch.setenv("BLOB_STORAGE_ACCOUNT", "myacct")
+        monkeypatch.setenv("BLOB_PREFIX", "myprefix")
+        monkeypatch.delenv("BLOB_STORAGE_CONTAINER", raising=False)
+
+        self.data_root = tmp_path / "workspace_data"
+        self.config_path = tmp_path / "dataset_path.env"
+        self.local_root = self.data_root / "myprefix"
+        monkeypatch.setenv("DATA_ROOT", str(self.data_root))
+        monkeypatch.setenv("DATASET_CONFIG_PATH", str(self.config_path))
+
+    def _run(self) -> None:  # executes the script body as a freshly loaded module
+        spec = importlib.util.spec_from_file_location("download_blob_dataset", _SCRIPT_PATH)
+        assert spec is not None and spec.loader is not None
+        spec.loader.exec_module(importlib.util.module_from_spec(spec))
+
+    def test_default_container_used(self) -> None:
+        self._run()
+        url = self.mock_blob.ContainerClient.from_container_url.call_args[0][0]
+        assert url == "https://myacct.blob.core.windows.net/datasets"
+
+    def test_custom_container_env(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setenv("BLOB_STORAGE_CONTAINER", "custom-ctr")
+        self._run()
+        url = self.mock_blob.ContainerClient.from_container_url.call_args[0][0]
+        assert url.endswith("/custom-ctr")
+
+    def test_writes_files_and_skips_empty_rel(self) -> None:
+        self._run()
+        assert (self.local_root / "sub" / "file_a.bin").read_bytes() == b"data-bytes"
+        assert (self.local_root / "file_b.txt").read_bytes() == b"data-bytes"
+        downloaded = [c.args[0] for c in self.client.download_blob.call_args_list]
+        assert "myprefix/" not in downloaded
+
+    def test_writes_config_env(self) -> None:
+        self._run()
+        content = self.config_path.read_text()
+        assert content == f"DATASET_DIR={self.local_root}\n"
+
+    def test_uses_default_credential(self) -> None:
+        self._run()
+        self.mock_identity.DefaultAzureCredential.assert_called_once()
diff --git a/evaluation/tests/test_plot_lerobot_trajectories.py b/evaluation/tests/test_plot_lerobot_trajectories.py
new file mode 100644
index 00000000..61a9839e
--- /dev/null
+++ b/evaluation/tests/test_plot_lerobot_trajectories.py
@@ -0,0 +1,112 @@
+"""Tests for metrics/plot-lerobot-trajectories.py."""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+_SCRIPT_PATH = _REPO_ROOT / "metrics" / "plot-lerobot-trajectories.py"
+
+_inference_pkg = types.ModuleType("inference")
+_inference_pkg.__path__ = []  # type: ignore[attr-defined]
+_plotting_mod = types.ModuleType("inference.plotting")
+for _name in (
+    "plot_action_deltas",
+    "plot_cumulative_positions",
+    "plot_error_heatmap",
+    "plot_summary_panel",
+):
+    setattr(_plotting_mod, _name, lambda *a, **kw: None)  # no-op placeholder; tests patch these per case
+sys.modules.setdefault("inference", _inference_pkg)  # keeps an ``inference`` entry that is already registered
+sys.modules["inference.plotting"] = _plotting_mod  # unconditional: replaces any module registered under this name
+
+_spec = importlib.util.spec_from_file_location("plot_lerobot_trajectories", _SCRIPT_PATH)
+assert _spec is not None and _spec.loader is not None
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)  # runs the script's top level, including its imports
+
+
+def _stub_plotting(monkeypatch) -> MagicMock:  # returns the figure mock every patched plot function hands back
+    fake_figure = MagicMock()
+    for plot_fn in (
+        "plot_action_deltas",
+        "plot_cumulative_positions",
+        "plot_error_heatmap",
+        "plot_summary_panel",
+    ):
+        monkeypatch.setattr(_mod, plot_fn, lambda *args, **kwargs: fake_figure)
+    monkeypatch.setattr(_mod.plt, "close", lambda *args: None)
+    return fake_figure
+
+
+def _write_npz(path: Path) -> None:
+    arrays = {
+        "predicted": np.array([[1.0, 2.0], [3.0, 4.0]]),
+        "ground_truth": np.array([[1.1, 2.1], [3.1, 4.1]]),
+        "inference_times": np.array([0.01, 0.02]),
+    }
+    np.savez(path, **arrays)
+
+
+class TestMain:  # drives ``_mod.main()`` through a patched ``sys.argv``
+    def test_missing_file_exits(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "plot-lerobot-trajectories",
+                str(tmp_path / "nope.npz"),  # never created on disk
+            ],
+        )
+        with pytest.raises(SystemExit) as exc_info:
+            _mod.main()
+        assert exc_info.value.code == 1
+
+    def test_default_output_dir_creates_sibling(self, tmp_path: Path, monkeypatch) -> None:
+        _stub_plotting(monkeypatch)
+        npz_path = tmp_path / "predictions.npz"
+        _write_npz(npz_path)
+
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "plot-lerobot-trajectories",
+                str(npz_path),
+            ],
+        )
+        _mod.main()
+        assert (tmp_path / "trajectory_plots").is_dir()  # created next to the npz file
+
+    def test_custom_output_dir_used(self, tmp_path: Path, monkeypatch) -> None:
+        mock_fig = _stub_plotting(monkeypatch)
+        npz_path = tmp_path / "predictions.npz"
+        _write_npz(npz_path)
+        out_dir = tmp_path / "custom_out"
+
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "plot-lerobot-trajectories",
+                str(npz_path),
+                "--output-dir",
+                str(out_dir),
+                "--episode",
+                "5",
+                "--fps",
+                "60.0",
+                "--dpi",
+                "200",
+            ],
+        )
+        _mod.main()
+        assert out_dir.is_dir()
+        assert mock_fig.savefig.call_count == 4  # the four stubbed plot functions share this one figure mock
diff --git a/evaluation/tests/test_plotting.py b/evaluation/tests/test_plotting.py
new file mode 100644
index 00000000..8ca04480
--- /dev/null
+++ b/evaluation/tests/test_plotting.py
@@ -0,0 +1,147 @@
+"""Unit tests for ``metrics.plotting``."""
+
+from __future__ import annotations
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pytest
+from metrics.plotting import (
+    JOINT_NAMES,
+    plot_action_deltas,
+    plot_aggregate_summary,
+    plot_cumulative_positions,
+    plot_error_heatmap,
+    plot_summary_panel,
+)
+from sil.robot_types import NUM_JOINTS
+
+
+@pytest.fixture(autouse=True)
+def _close_figures():
+    """Close every open matplotlib figure once each test has finished, so figures do not accumulate."""
+    yield
+    plt.close("all")
+
+
+class TestPlotActionDeltas:
+    def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # ``action_arrays`` unpacks positionally as (predicted, ground_truth).
+        fig = plot_action_deltas(*action_arrays, episode=1, fps=30.0)
+        assert isinstance(fig, plt.Figure)
+
+    def test_subplot_count(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # Expect one axes per joint.
+        fig = plot_action_deltas(*action_arrays, episode=1, fps=30.0)
+        assert len(fig.axes) == NUM_JOINTS
+
+    def test_custom_joint_names(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        labels = [f"j{idx}" for idx in range(NUM_JOINTS)]
+        # Only checks that custom labels are accepted; the rendered text is not inspected.
+        fig = plot_action_deltas(*action_arrays, episode=1, fps=30.0, joint_names=labels)
+        assert isinstance(fig, plt.Figure)
+
+
+class TestPlotCumulativePositions:
+    def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # ``action_arrays`` unpacks positionally as (predicted, ground_truth).
+        fig = plot_cumulative_positions(*action_arrays, episode=2, fps=30.0)
+        assert isinstance(fig, plt.Figure)
+
+    def test_subplot_count(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # Expect one axes per joint.
+        fig = plot_cumulative_positions(*action_arrays, episode=2, fps=30.0)
+        assert len(fig.axes) == NUM_JOINTS
+
+
+class TestPlotErrorHeatmap:
+    def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # ``action_arrays`` unpacks positionally as (predicted, ground_truth).
+        fig = plot_error_heatmap(*action_arrays, episode=3, fps=30.0)
+        assert isinstance(fig, plt.Figure)
+
+    def test_single_axis(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # ``action_arrays`` unpacks positionally as (predicted, ground_truth).
+        fig = plot_error_heatmap(*action_arrays, episode=3, fps=30.0)
+        # One heatmap axes + one colorbar axes.
+        assert len(fig.axes) == 2
+
+
+class TestPlotSummaryPanel:
+    def test_returns_figure(
+        self,
+        action_arrays: tuple[np.ndarray, np.ndarray],
+        inference_times: np.ndarray,
+    ) -> None:
+        # Positional order: predicted, ground_truth, inference_times.
+        fig = plot_summary_panel(*action_arrays, inference_times, episode=4, fps=30.0)
+        assert isinstance(fig, plt.Figure)
+
+    def test_2x2_layout(
+        self,
+        action_arrays: tuple[np.ndarray, np.ndarray],
+        inference_times: np.ndarray,
+    ) -> None:
+        # Four axes are expected in total.
+        fig = plot_summary_panel(*action_arrays, inference_times, episode=4, fps=30.0)
+        assert len(fig.axes) == 4
+
+    def test_requires_inference_times(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None:
+        # Leaving out the third positional argument must be rejected by the call itself.
+        with pytest.raises(TypeError):
+            plot_summary_panel(*action_arrays, episode=4, fps=30.0)  # type: ignore[call-arg]
+
+
+def test_joint_names_default_length() -> None:
+    assert len(JOINT_NAMES) == NUM_JOINTS  # one default label per joint
+
+
+class TestPlotAggregateSummary:
+    @pytest.fixture()
+    def episode_metrics(self) -> list[dict]:
+        return [
+            {
+                "episode": 1,
+                "mse": 0.01,
+                "mae": 0.05,
+                "throughput_hz": 120.0,
+                "avg_inference_ms": 8.3,
+                "per_joint_mae": [0.04, 0.05, 0.06, 0.03, 0.07, 0.02],
+            },
+            {
+                "episode": 2,
+                "mse": 0.02,
+                "mae": 0.08,
+                "throughput_hz": 110.0,
+                "avg_inference_ms": 9.1,
+                "per_joint_mae": [0.05, 0.06, 0.07, 0.04, 0.08, 0.03],
+            },
+            {
+                "episode": 3,
+                "mse": 0.005,
+                "mae": 0.03,
+                "throughput_hz": 130.0,
+                "avg_inference_ms": 7.7,
+                "per_joint_mae": [0.03, 0.04, 0.05, 0.02, 0.06, 0.01],
+            },
+        ]
+
+    def test_returns_figure(self, episode_metrics) -> None:
+        summary = plot_aggregate_summary(episode_metrics)
+        assert isinstance(summary, plt.Figure)
+
+    def test_2x2_layout(self, episode_metrics) -> None:
+        summary = plot_aggregate_summary(episode_metrics)
+        assert len(summary.axes) == 4
+
+    def test_custom_joint_names(self, episode_metrics) -> None:
+        names = [f"j{i}" for i in range(1, 7)]
+        summary = plot_aggregate_summary(episode_metrics, joint_names=names)
+        # The top-right axes (index 1) carries the joint labels on its x axis.
+        rendered = [label.get_text() for label in summary.axes[1].get_xticklabels()]
+        assert rendered == names
+
+    def test_uses_default_joint_names(self, episode_metrics) -> None:
+        summary = plot_aggregate_summary(episode_metrics)
+        # The top-right axes (index 1) carries the joint labels on its x axis.
+        rendered = [label.get_text() for label in summary.axes[1].get_xticklabels()]
+        assert rendered == list(JOINT_NAMES)
diff --git a/evaluation/tests/test_policy_evaluation.py b/evaluation/tests/test_policy_evaluation.py
new file mode 100644
index 00000000..d317bd40
--- /dev/null
+++ b/evaluation/tests/test_policy_evaluation.py
@@ -0,0 +1,533 @@
+"""Unit tests for ``sil.policy_evaluation``."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+torch = pytest.importorskip("torch")
+
+from sil import policy_evaluation  # noqa: E402
+from sil.policy_evaluation import (  # noqa: E402
+    Metrics,
+    ModelMetadata,
+    _build_parser,
+    _load_rsl_rl,
+    _load_skrl,
+    evaluate,
+    find_checkpoint,
+    load_agent,
+    load_metadata,
+    main,
+)
+
+
+class TestModelMetadata:  # construction defaults and explicit overrides
+    def test_defaults(self) -> None:
+        default_meta = ModelMetadata()
+        assert default_meta.success_threshold == 0.7
+        assert default_meta.framework == "skrl"
+        assert default_meta.task == ""
+
+    def test_custom_values(self) -> None:
+        custom_meta = ModelMetadata(task="Lift-v0", framework="rsl_rl", success_threshold=0.9)
+        assert custom_meta.success_threshold == 0.9
+        assert custom_meta.framework == "rsl_rl"
+        assert custom_meta.task == "Lift-v0"
+
+
+class TestLoadMetadata:  # how load_metadata normalises task, framework and threshold
+    def test_auto_task_becomes_empty(self) -> None:
+        resolved = load_metadata(task="auto", framework="skrl", success_threshold=0.7)
+        assert resolved.task == ""
+
+    def test_empty_task_stays_empty(self) -> None:
+        resolved = load_metadata(task="", framework="skrl", success_threshold=0.7)
+        assert resolved.task == ""
+
+    def test_explicit_task_preserved(self) -> None:
+        resolved = load_metadata(task="Reach-v0", framework="rsl_rl", success_threshold=0.5)
+        assert resolved.success_threshold == 0.5
+        assert resolved.framework == "rsl_rl"
+        assert resolved.task == "Reach-v0"
+
+    def test_negative_threshold_uses_default(self) -> None:
+        resolved = load_metadata(task="Lift-v0", framework="skrl", success_threshold=-1.0)
+        assert resolved.success_threshold == 0.7
+
+    def test_zero_threshold_kept(self) -> None:  # 0.0 is a real threshold, unlike a negative value
+        resolved = load_metadata(task="Lift-v0", framework="skrl", success_threshold=0.0)
+        assert resolved.success_threshold == 0.0
+
+    def test_auto_framework_becomes_default(self) -> None:
+        resolved = load_metadata(task="Lift-v0", framework="auto", success_threshold=0.5)
+        assert resolved.framework == "skrl"
+
+    def test_empty_framework_becomes_default(self) -> None:
+        resolved = load_metadata(task="Lift-v0", framework="", success_threshold=0.5)
+        assert resolved.framework == "skrl"
+
+
+class TestMetrics:  # episode accumulation and the summary produced by to_dict()
+    def test_empty_to_dict_returns_error(self) -> None:
+        tracker = Metrics()
+        summary = tracker.to_dict()
+        assert "error" in summary
+        assert summary["error"] == "No episodes completed"
+
+    def test_count_starts_at_zero(self) -> None:
+        tracker = Metrics()
+        assert tracker.count == 0
+
+    def test_add_increments_count(self) -> None:
+        tracker = Metrics()
+        tracker.add(reward=10.0, length=50, success=True)
+        assert tracker.count == 1
+        tracker.add(reward=20.0, length=60, success=False)
+        assert tracker.count == 2
+
+    def test_to_dict_single_episode(self) -> None:
+        tracker = Metrics()
+        tracker.add(reward=10.0, length=50, success=True)
+        summary = tracker.to_dict()
+        assert summary["eval_episodes"] == 1
+        assert summary["mean_reward"] == 10.0
+        assert summary["std_reward"] == 0.0
+        assert summary["mean_length"] == 50.0
+        assert summary["success_rate"] == 1.0
+
+    def test_to_dict_multiple_episodes(self) -> None:
+        tracker = Metrics()
+        tracker.add(reward=10.0, length=50, success=True)
+        tracker.add(reward=20.0, length=100, success=False)
+        summary = tracker.to_dict()
+        assert summary["eval_episodes"] == 2
+        assert summary["mean_reward"] == pytest.approx(15.0)
+        assert summary["mean_length"] == pytest.approx(75.0)
+        assert summary["success_rate"] == pytest.approx(0.5)
+
+    def test_rewards_and_lengths_tracked(self) -> None:
+        tracker = Metrics()
+        tracker.add(reward=5.0, length=10, success=False)
+        tracker.add(reward=15.0, length=30, success=True)
+        assert tracker.rewards == [5.0, 15.0]
+        assert tracker.lengths == [10, 30]
+        assert tracker.successes == 1
+
+
+class TestFindCheckpoint:  # checkpoint resolution from a file path or from a run directory
+    def test_file_with_pt_extension(self, tmp_path: Path) -> None:
+        ckpt = tmp_path / "model.pt"
+        ckpt.write_bytes(b"fake")
+        assert find_checkpoint(str(ckpt)) == str(ckpt)
+
+    def test_file_with_pth_extension(self, tmp_path: Path) -> None:
+        ckpt = tmp_path / "model.pth"
+        ckpt.write_bytes(b"fake")
+        assert find_checkpoint(str(ckpt)) == str(ckpt)
+
+    def test_bad_extension_raises(self, tmp_path: Path) -> None:
+        bad = tmp_path / "model.onnx"
+        bad.write_bytes(b"fake")
+        with pytest.raises(FileNotFoundError):
+            find_checkpoint(str(bad))
+
+    def test_nonexistent_file_raises(self, tmp_path: Path) -> None:  # fresh tmp_path: file cannot pre-exist
+        with pytest.raises(FileNotFoundError):
+            find_checkpoint(str(tmp_path / "nonexistent_checkpoint.pt"))
+
+    def test_directory_finds_best_agent(self, tmp_path: Path) -> None:
+        best = tmp_path / "best_agent.pt"
+        best.write_bytes(b"best")
+        assert find_checkpoint(str(tmp_path)) == str(best)
+
+    def test_directory_finds_checkpoint_subdir(self, tmp_path: Path) -> None:
+        ckpt_dir = tmp_path / "checkpoints"
+        ckpt_dir.mkdir()
+        ckpt = ckpt_dir / "step_1000.pt"
+        ckpt.write_bytes(b"data")
+        assert find_checkpoint(str(tmp_path)) == str(ckpt)
+
+    def test_directory_prefers_best_agent_over_glob(self, tmp_path: Path) -> None:
+        best = tmp_path / "best_agent.pt"
+        best.write_bytes(b"best")
+        other = tmp_path / "other.pt"
+        other.write_bytes(b"other")
+        assert find_checkpoint(str(tmp_path)) == str(best)
+
+    def test_directory_selects_newest_by_mtime(self, tmp_path: Path) -> None:
+        import os
+
+        old = tmp_path / "old.pt"
+        old.write_bytes(b"old")
+        os.utime(old, (1_000_000_000, 1_000_000_000))  # backdate: no sleep, no reliance on mtime resolution
+        new = tmp_path / "new.pt"
+        new.write_bytes(b"new")
+        result = find_checkpoint(str(tmp_path))
+        assert result == str(new)
+
+    def test_empty_directory_raises(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            find_checkpoint(str(tmp_path))
+
+
+class TestBuildParser:  # argparse surface exposed by _build_parser
+    def test_returns_parser(self) -> None:
+        cli = _build_parser()
+        assert isinstance(cli, argparse.ArgumentParser)
+
+    def test_model_path_required(self) -> None:
+        cli = _build_parser()
+        with pytest.raises(SystemExit):
+            cli.parse_args([])
+
+    def test_defaults(self) -> None:
+        cli = _build_parser()
+        parsed = cli.parse_args(["--model-path", "/tmp/model"])
+        assert parsed.model_path == "/tmp/model"
+        assert parsed.task == ""
+        assert parsed.framework == ""
+        assert parsed.eval_episodes == 100
+        assert parsed.num_envs == 64
+        assert parsed.success_threshold == -1
+        assert parsed.headless is False
+        assert parsed.seed == 42
+
+    def test_all_flags(self) -> None:
+        cli = _build_parser()
+        parsed = cli.parse_args(
+            [
+                "--model-path",
+                "/m",
+                "--task",
+                "Reach-v0",
+                "--framework",
+                "rsl_rl",
+                "--eval-episodes",
+                "50",
+                "--num-envs",
+                "32",
+                "--success-threshold",
+                "0.8",
+                "--headless",
+                "--seed",
+                "99",
+            ]
+        )
+        assert parsed.task == "Reach-v0"
+        assert parsed.framework == "rsl_rl"
+        assert parsed.eval_episodes == 50
+        assert parsed.num_envs == 32
+        assert parsed.success_threshold == 0.8
+        assert parsed.headless is True
+        assert parsed.seed == 99
+
+
+class TestLoadAgent:  # framework-name dispatch performed by load_agent
+    def test_unsupported_framework_raises(self) -> None:
+        with pytest.raises(ValueError, match="Unsupported framework"):
+            load_agent("/tmp/ckpt.pt", "tensorflow", "Reach-v0", MagicMock(), "cuda")
+
+    def test_dispatches_to_skrl_loader(self) -> None:
+        loaded = object()
+        with patch("sil.policy_evaluation._load_skrl", return_value=loaded) as skrl_loader:
+            returned = load_agent("/tmp/ckpt.pt", "skrl", "Reach-v0", MagicMock(), "cuda")
+        skrl_loader.assert_called_once()
+        assert returned is loaded
+
+    def test_dispatches_to_rsl_rl_loader(self) -> None:
+        loaded = object()
+        with patch("sil.policy_evaluation._load_rsl_rl", return_value=loaded) as rsl_loader:
+            returned = load_agent("/tmp/ckpt.pt", "rsl_rl", "Reach-v0", MagicMock(), "cpu")
+        rsl_loader.assert_called_once_with("/tmp/ckpt.pt", "cpu")
+        assert returned is loaded
+
+
+class TestLoadRslRl:  # _load_rsl_rl with the rsl_rl package and torch.load replaced by mocks
+    def test_loads_actor_critic_and_returns_eval_policy(self) -> None:
+        rsl_rl_modules = MagicMock()
+        policy = MagicMock()
+        policy.to.return_value = policy  # .to(...) hands back the same mock so the identity check can hold
+        rsl_rl_modules.ActorCritic.return_value = policy
+        checkpoint = {"model_cfg": {"a": 1}, "model_state_dict": {"w": 0}}
+
+        with (
+            patch.dict(sys.modules, {"rsl_rl": MagicMock(), "rsl_rl.modules": rsl_rl_modules}),
+            patch("sil.policy_evaluation.torch.load", return_value=checkpoint) as mock_load,
+        ):
+            result = _load_rsl_rl("/tmp/ckpt.pt", "cpu")
+
+        mock_load.assert_called_once_with("/tmp/ckpt.pt", map_location="cpu", weights_only=False)
+        rsl_rl_modules.ActorCritic.assert_called_once_with(a=1)  # model_cfg is expanded into keyword arguments
+        policy.load_state_dict.assert_called_once_with({"w": 0})
+        policy.eval.assert_called_once()
+        policy.to.assert_called_once_with("cpu")
+        assert result is policy
+
+
+class _StubEnv:
+    """Minimal torch-backed env stub for evaluate(): all envs terminate together every ``episode_len`` steps."""
+
+    def __init__(self, num_envs: int, episode_len: int, success: bool = True) -> None:
+        self.num_envs = num_envs
+        self.device = "cpu"
+        self._episode_len = episode_len
+        self._step_count = 0  # steps taken since the last (shared) episode boundary
+        self._success = success  # value reported for every env in info["success"]
+
+    def reset(self) -> tuple[torch.Tensor, dict]:
+        self._step_count = 0
+        return torch.zeros(self.num_envs, 4), {}
+
+    def step(self, actions):  # actions are ignored; outputs depend only on the step counter
+        self._step_count += 1
+        rewards = torch.ones(self.num_envs, 1)  # constant reward of 1 per env per step
+        done = self._step_count >= self._episode_len
+        terminated = torch.full((self.num_envs, 1), done, dtype=torch.bool)
+        truncated = torch.zeros(self.num_envs, 1, dtype=torch.bool)  # this stub never truncates
+        info = {"success": torch.full((self.num_envs,), self._success, dtype=torch.bool)}
+        if done:
+            self._step_count = 0  # begin counting the next episode without an explicit reset()
+        return torch.zeros(self.num_envs, 4), rewards, terminated, truncated, info
+
+
+class TestEvaluate:  # evaluate() driven by _StubEnv and MagicMock agents
+    def test_skrl_path_collects_metrics(self) -> None:
+        env = _StubEnv(num_envs=2, episode_len=3, success=True)
+        agent = MagicMock()
+        agent.act.return_value = (torch.zeros(2, 1),)  # act() is mocked to return a 1-tuple holding a tensor
+
+        metrics = evaluate(env, agent, num_episodes=2, framework="skrl")
+
+        assert metrics.count == 2
+        assert metrics.successes == 2
+        assert all(r == pytest.approx(3.0) for r in metrics.rewards)  # 3 steps x reward 1.0 from _StubEnv
+        assert metrics.lengths == [3, 3]
+        agent.act.assert_called()
+
+    def test_rsl_rl_path_uses_act_inference(self) -> None:
+        env = _StubEnv(num_envs=2, episode_len=2, success=False)
+        agent = MagicMock()
+        agent.act_inference.return_value = torch.zeros(2, 1)  # here the mock returns a bare tensor, not a tuple
+
+        metrics = evaluate(env, agent, num_episodes=2, framework="rsl_rl")
+
+        assert metrics.count == 2
+        assert metrics.successes == 0
+        agent.act_inference.assert_called()
+        agent.act.assert_not_called()
+
+    def test_progress_logging_at_multiples_of_twenty(self) -> None:
+        env = _StubEnv(num_envs=20, episode_len=1, success=True)
+        agent = MagicMock()
+        agent.act.return_value = (torch.zeros(20, 1),)
+
+        metrics = evaluate(env, agent, num_episodes=20, framework="skrl")  # log output itself is not asserted
+        assert metrics.count == 20
+        assert metrics.successes == 20
+
+    def test_break_when_count_reaches_num_episodes_mid_step(self) -> None:
+        # num_envs (3) > num_episodes (2) so the third done index in a single
+        # step trips the early-exit guard inside the done_indices loop.
+        env = _StubEnv(num_envs=3, episode_len=1, success=True)
+        agent = MagicMock()
+        agent.act.return_value = (torch.zeros(3, 1),)
+        metrics = evaluate(env, agent, num_episodes=2, framework="skrl")
+        assert metrics.count == 2
+
+    def test_truncated_episode_not_counted_as_success(self) -> None:
+        class TruncEnv(_StubEnv):  # every step: truncated=True, terminated=False, success flag set
+            def step(self, actions):
+                self._step_count += 1
+                rewards = torch.ones(self.num_envs, 1)
+                terminated = torch.zeros(self.num_envs, 1, dtype=torch.bool)
+                truncated = torch.ones(self.num_envs, 1, dtype=torch.bool)
+                info = {"success": torch.ones(self.num_envs, dtype=torch.bool)}
+                return torch.zeros(self.num_envs, 4), rewards, terminated, truncated, info
+
+        env = TruncEnv(num_envs=2, episode_len=1)
+        agent = MagicMock()
+        agent.act.return_value = (torch.zeros(2, 1),)
+
+        metrics = evaluate(env, agent, num_episodes=2, framework="skrl")
+        assert metrics.count == 2  # truncated episodes still count as finished episodes
+        assert metrics.successes == 0  # ...but not as successes, even though info["success"] is True
+
+
+def _skrl_module_stubs(decorator_calls_inner: bool = True, cfg: object | None = None):
+    """Build ``(sys.modules patch dict, runner module mock)`` for _load_skrl tests."""
+    hydra_mod = MagicMock()
+    if decorator_calls_inner:  # fake decorator: calling the wrapped function runs fn(None, cfg)
+        hydra_mod.hydra_task_config = lambda task, entry: lambda fn: lambda: fn(None, cfg)
+    else:  # fake decorator: the wrapper returns None and never runs the wrapped function
+        hydra_mod.hydra_task_config = lambda task, entry: lambda fn: lambda: None
+    runner_mod = MagicMock()
+    runner_mod.Runner = MagicMock()  # returned separately so tests can assert on Runner calls
+    return {
+        "isaaclab_tasks": MagicMock(),
+        "isaaclab_tasks.utils": MagicMock(),
+        "isaaclab_tasks.utils.hydra": hydra_mod,
+        "skrl": MagicMock(),
+        "skrl.utils": MagicMock(),
+        "skrl.utils.runner": MagicMock(),
+        "skrl.utils.runner.torch": runner_mod,
+    }, runner_mod
+
+
+class TestLoadSkrl:  # _load_skrl with the isaaclab/skrl imports replaced via _skrl_module_stubs
+    def test_to_dict_cfg_creates_runner_and_loads_checkpoint(self) -> None:
+        cfg = MagicMock()
+        cfg.to_dict.return_value = {"a": 1}
+        stubs, runner_mod = _skrl_module_stubs(cfg=cfg)
+        env = MagicMock()
+
+        with patch.dict(sys.modules, stubs):
+            agent = _load_skrl("/tmp/ckpt.pt", "Reach-v0", env, "cuda")
+
+        runner_mod.Runner.assert_called_once_with(env, {"a": 1})  # the to_dict() result is what reaches Runner
+        runner_instance = runner_mod.Runner.return_value
+        runner_instance.agent.load.assert_called_once_with("/tmp/ckpt.pt")
+        runner_instance.agent.enable_training_mode.assert_called_once_with(enabled=False, apply_to_models=True)
+        assert agent is runner_instance.agent
+
+    def test_dict_cfg_used_directly(self) -> None:
+        cfg = {"b": 2}
+        stubs, runner_mod = _skrl_module_stubs(cfg=cfg)
+
+        with patch.dict(sys.modules, stubs):
+            _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda")
+
+        runner_mod.Runner.assert_called_once()
+        assert runner_mod.Runner.call_args.args[1] == {"b": 2}
+
+    def test_unsupported_cfg_type_raises(self) -> None:
+        cfg = object()  # no to_dict, not a dict
+        stubs, _ = _skrl_module_stubs(cfg=cfg)
+
+        with patch.dict(sys.modules, stubs), pytest.raises(ValueError, match="Unexpected agent config type"):
+            _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda")
+
+    def test_missing_cfg_raises(self) -> None:
+        stubs, _ = _skrl_module_stubs(decorator_calls_inner=False)  # wrapped function never runs -> no cfg
+
+        with patch.dict(sys.modules, stubs), pytest.raises(ValueError, match="Could not load agent configuration"):
+            _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda")
+
+    def test_restores_sys_argv_after_call(self) -> None:
+        stubs, _ = _skrl_module_stubs(cfg={"a": 1})
+        expected_argv = ["prog", "--keep", "me"]
+        # Patch in a *copy*: comparing sys.argv with the very list that was installed would still
+        # pass if the loader edited that list in place and never put its contents back.
+        with patch.dict(sys.modules, stubs), patch.object(sys, "argv", list(expected_argv)):
+            _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda")
+            assert sys.argv == expected_argv
+
+
+def _main_module_stubs():
+    """Build fresh MagicMock stand-ins, keyed by module name, for main() tests."""
+    stubbed_modules = (
+        "isaaclab",
+        "isaaclab.app",
+        "isaaclab_tasks",
+        "isaaclab_tasks.utils",
+        "isaaclab_tasks.utils.parse_cfg",
+        "isaaclab_rl",
+        "isaaclab_rl.skrl",
+        "gymnasium",
+    )
+    # One independent mock per entry, so call assertions on one module never
+    # observe calls that went through another.
+    modules = {module_name: MagicMock() for module_name in stubbed_modules}
+    return modules
+
+
+class TestMain:  # main() return/exit codes with heavy modules stubbed and os._exit mocked
+    def test_missing_task_returns_one(self) -> None:
+        argv = ["prog", "--model-path", "/tmp/m"]  # no --task on purpose
+        with (
+            patch.object(sys, "argv", argv),
+            patch.object(policy_evaluation.os, "_exit") as mock_exit,
+        ):
+            rc = main()
+        assert rc == 1
+        # Early return path: os._exit is only invoked from the try/finally
+        # that wraps successful evaluation, not from the missing-task guard.
+        mock_exit.assert_not_called()
+
+    def test_success_path_returns_zero(self) -> None:
+        argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0", "--success-threshold", "0.5"]
+        metrics = MagicMock()
+        metrics.to_dict.return_value = {"success_rate": 0.9}  # above the 0.5 threshold given on argv
+        with (
+            patch.object(sys, "argv", argv),
+            patch.dict(sys.modules, _main_module_stubs()),
+            patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"),
+            patch("sil.policy_evaluation.load_agent", return_value=MagicMock()),
+            patch("sil.policy_evaluation.evaluate", return_value=metrics),
+            patch("sil.policy_evaluation.prepare_for_shutdown"),
+            patch.object(policy_evaluation.os, "_exit") as mock_exit,
+        ):
+            rc = main()
+        assert rc == 0
+        mock_exit.assert_called_once_with(0)  # the same code is also handed to os._exit
+
+    def test_below_threshold_returns_one(self) -> None:
+        argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0", "--success-threshold", "0.9"]
+        metrics = MagicMock()
+        metrics.to_dict.return_value = {"success_rate": 0.1}  # below the 0.9 threshold given on argv
+        with (
+            patch.object(sys, "argv", argv),
+            patch.dict(sys.modules, _main_module_stubs()),
+            patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"),
+            patch("sil.policy_evaluation.load_agent", return_value=MagicMock()),
+            patch("sil.policy_evaluation.evaluate", return_value=metrics),
+            patch("sil.policy_evaluation.prepare_for_shutdown"),
+            patch.object(policy_evaluation.os, "_exit") as mock_exit,
+        ):
+            rc = main()
+        assert rc == 1
+        mock_exit.assert_called_once_with(1)
+
+    def test_rsl_rl_framework_skips_skrl_wrapper(self) -> None:
+        argv = [
+            "prog",
+            "--model-path",
+            "/tmp/m",
+            "--task",
+            "Lift-v0",
+            "--framework",
+            "rsl_rl",
+            "--success-threshold",
+            "0.0",
+        ]
+        metrics = MagicMock()
+        metrics.to_dict.return_value = {"success_rate": 1.0}
+        stubs = _main_module_stubs()  # kept in a local so the skrl wrapper mock can be inspected afterwards
+        with (
+            patch.object(sys, "argv", argv),
+            patch.dict(sys.modules, stubs),
+            patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"),
+            patch("sil.policy_evaluation.load_agent", return_value=MagicMock()),
+            patch("sil.policy_evaluation.evaluate", return_value=metrics),
+            patch("sil.policy_evaluation.prepare_for_shutdown"),
+            patch.object(policy_evaluation.os, "_exit"),
+        ):
+            rc = main()
+        assert rc == 0
+        stubs["isaaclab_rl.skrl"].SkrlVecEnvWrapper.assert_not_called()
+
+    def test_exception_in_try_returns_one(self) -> None:
+        argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0"]
+        with (
+            patch.object(sys, "argv", argv),
+            patch.dict(sys.modules, _main_module_stubs()),
+            patch("sil.policy_evaluation.find_checkpoint", side_effect=RuntimeError("boom")),
+            patch.object(policy_evaluation.os, "_exit") as mock_exit,
+        ):
+            rc = main()
+        assert rc == 1  # the RuntimeError surfaces as a failing return code, not as an exception
+        mock_exit.assert_called_once_with(1)
diff --git a/evaluation/tests/test_policy_runner.py b/evaluation/tests/test_policy_runner.py
new file mode 100644
index 00000000..3ef0eac3
--- /dev/null
+++ b/evaluation/tests/test_policy_runner.py
@@ -0,0 +1,184 @@
+"""Unit tests for ``sil.policy_runner``."""
+
+from __future__ import annotations
+
+import sys
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+
+from sil.policy_runner import InferenceMetrics, PolicyRunner, _resolve_device  # noqa: E402
+from sil.robot_types import NUM_JOINTS, JointPositionCommand, RobotObservation  # noqa: E402
+
+
+class TestResolveDevice:
+    """_resolve_device fallback chain: the requested device, else MPS (for a CUDA request), else CPU."""
+
+    def test_cuda_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)  # pretend a CUDA device exists
+        assert _resolve_device("cuda") == "cuda"
+
+    def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _resolve_device("cuda") == "mps"
+
+    def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _resolve_device("mps") == "mps"
+
+    def test_mps_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False)
+        assert _resolve_device("mps") == "cpu"
+
+    def test_cpu_always_returns_cpu(self) -> None:  # no availability probes are patched for this case
+        assert _resolve_device("cpu") == "cpu"
+
+    def test_all_unavailable_returns_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False)
+        assert _resolve_device("cuda") == "cpu"
+
+
+class TestInferenceMetrics:
+    """Zero-valued defaults and the derived per-step millisecond averages of InferenceMetrics."""
+
+    def test_defaults_are_zero(self) -> None:
+        fresh = InferenceMetrics()
+        assert fresh.chunk_queries == 0
+        assert fresh.total_preprocess_s == 0.0
+        assert fresh.total_inference_s == 0.0
+        assert fresh.steps == 0
+
+    def test_avg_inference_ms(self) -> None:
+        timed = InferenceMetrics(steps=10, total_inference_s=0.5)
+        assert timed.avg_inference_ms == pytest.approx(50.0)
+
+    def test_avg_preprocess_ms(self) -> None:
+        timed = InferenceMetrics(steps=10, total_preprocess_s=0.2)
+        assert timed.avg_preprocess_ms == pytest.approx(20.0)
+
+    def test_zero_steps_avoids_division_by_zero(self) -> None:
+        fresh = InferenceMetrics()
+        assert fresh.avg_preprocess_ms == 0.0
+        assert fresh.avg_inference_ms == 0.0
+
+
+class TestPolicyRunner:
+    """PolicyRunner.step()/reset() exercised with MagicMock policy, preprocessor and postprocessor."""
+
+    @pytest.fixture
+    def action_tensor(self) -> torch.Tensor:
+        return torch.randn(1, NUM_JOINTS)  # unseeded random values; tests only check shape and call counts
+
+    @pytest.fixture
+    def runner(self, action_tensor: torch.Tensor) -> PolicyRunner:
+        policy = MagicMock()
+        policy.select_action.return_value = action_tensor
+        preprocessor = MagicMock(side_effect=lambda x: x)  # identity: passes its input straight through
+        postprocessor = MagicMock(return_value={"action": action_tensor})
+        return PolicyRunner(policy, preprocessor, postprocessor, "cpu")
+
+    def test_device_property(self, runner: PolicyRunner) -> None:
+        assert runner.device == "cpu"
+
+    def test_reset_clears_metrics_and_policy(self, runner: PolicyRunner) -> None:
+        runner._metrics.steps = 5  # dirty the private metrics so reset() has something to clear
+        runner.reset()
+        assert runner.metrics.steps == 0
+        runner._policy.reset.assert_called_once()
+
+    def test_step_null_image_returns_zeros(
+        self,
+        runner: PolicyRunner,
+        joint_positions: np.ndarray,  # fixture not defined in this file; presumably from conftest.py
+    ) -> None:
+        obs = RobotObservation(joint_positions=joint_positions)  # no color_image supplied
+        cmd = runner.step(obs)
+        np.testing.assert_array_equal(cmd.positions, np.zeros(NUM_JOINTS, dtype=np.float32))
+        assert cmd.timestamp_s == 0.0
+
+    def test_step_runs_full_pipeline(
+        self,
+        joint_positions: np.ndarray,
+        color_image: np.ndarray,  # fixture not defined in this file; presumably from conftest.py
+        action_tensor: torch.Tensor,
+    ) -> None:
+        policy = MagicMock()  # built locally (not via the runner fixture) so each stage can be asserted on
+        policy.select_action.return_value = action_tensor
+        preprocessor = MagicMock(side_effect=lambda x: x)
+        postprocessor = MagicMock(return_value={"action": action_tensor})
+        runner = PolicyRunner(policy, preprocessor, postprocessor, "cpu")
+
+        obs = RobotObservation(
+            joint_positions=joint_positions,
+            color_image=color_image,
+            timestamp_s=1.5,
+        )
+        cmd = runner.step(obs)
+
+        assert isinstance(cmd, JointPositionCommand)
+        assert cmd.positions.shape == (NUM_JOINTS,)
+        assert cmd.timestamp_s == 1.5  # the command carries the observation's timestamp
+        preprocessor.assert_called_once()
+        policy.select_action.assert_called_once()
+        postprocessor.assert_called_once()
+
+    def test_step_increments_metrics(
+        self,
+        runner: PolicyRunner,
+        joint_positions: np.ndarray,
+        color_image: np.ndarray,
+    ) -> None:
+        obs = RobotObservation(joint_positions=joint_positions, color_image=color_image)
+        runner.step(obs)
+        runner.step(obs)
+        assert runner.metrics.steps == 2
+        assert runner.metrics.total_inference_s >= 0  # timings are only checked for being non-negative
+        assert runner.metrics.total_preprocess_s >= 0
+
+
+class TestPolicyRunnerFromPretrained:
+    """PolicyRunner.from_pretrained with the lerobot modules replaced by mocks in ``sys.modules``."""
+
+    @pytest.fixture
+    def lerobot_mocks(self, monkeypatch: pytest.MonkeyPatch) -> tuple[MagicMock, MagicMock]:
+        mock_act = MagicMock()
+        mock_pipeline = MagicMock()
+        for mod in ("lerobot", "lerobot.policies", "lerobot.policies.act", "lerobot.processor"):
+            monkeypatch.setitem(sys.modules, mod, MagicMock())  # parent packages get throwaway mocks
+        monkeypatch.setitem(sys.modules, "lerobot.policies.act.modeling_act", mock_act)
+        monkeypatch.setitem(sys.modules, "lerobot.processor.pipeline", mock_pipeline)
+        return mock_act, mock_pipeline  # the two leaf modules the tests assert against
+
+    def test_loads_and_configures_policy(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+        lerobot_mocks: tuple[MagicMock, MagicMock],
+    ) -> None:
+        mock_act, mock_pipeline = lerobot_mocks
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False)
+
+        runner = PolicyRunner.from_pretrained("test/repo", device="cuda")
+
+        assert runner.device == "cpu"  # the CUDA request falls back because nothing is "available"
+        mock_act.ACTPolicy.from_pretrained.assert_called_once_with("test/repo")
+        mock_act.ACTPolicy.from_pretrained.return_value.to.assert_called_once_with("cpu")
+        assert mock_pipeline.PolicyProcessorPipeline.from_pretrained.call_count == 2
+
+    def test_uses_cuda_when_available(
+        self,
+        monkeypatch: pytest.MonkeyPatch,
+        lerobot_mocks: tuple[MagicMock, MagicMock],
+    ) -> None:
+        mock_act, _ = lerobot_mocks
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+
+        runner = PolicyRunner.from_pretrained("test/repo")  # no device argument: the default is used
+
+        assert runner.device == "cuda"
+        mock_act.ACTPolicy.from_pretrained.return_value.to.assert_called_once_with("cuda")
diff --git a/evaluation/tests/test_robot_types.py b/evaluation/tests/test_robot_types.py
new file mode 100644
index 00000000..f0611057
--- /dev/null
+++ b/evaluation/tests/test_robot_types.py
@@ -0,0 +1,118 @@
+"""Unit tests for ``sil.robot_types``."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+from sil.robot_types import (
+    IMAGE_CHANNELS,
+    IMAGE_HEIGHT,
+    IMAGE_WIDTH,
+    JOINT_ORDER,
+    NUM_JOINTS,
+    JointName,
+    JointPositionCommand,
+    RobotObservation,
+    RobotState,
+)
+
+
+class TestJointName:  # enum membership, ordering-table size and value naming
+    def test_enum_members(self) -> None:
+        wanted_names = {
+            "SHOULDER_PAN",
+            "SHOULDER_LIFT",
+            "ELBOW",
+            "WRIST_1",
+            "WRIST_2",
+            "WRIST_3",
+        }
+        assert wanted_names == {joint.name for joint in JointName}
+
+    def test_joint_order_length(self) -> None:
+        assert NUM_JOINTS == 6
+        assert len(JOINT_ORDER) == NUM_JOINTS
+
+    def test_string_values(self) -> None:
+        # Every enum value carries the "_joint" suffix.
+        assert all(joint.value.endswith("_joint") for joint in JointName)
+
+
+class TestRobotObservation:  # shape validation and defaults of RobotObservation
+    def test_valid_construction(self, joint_positions: np.ndarray) -> None:
+        observation = RobotObservation(joint_positions=joint_positions)
+        assert observation.timestamp_s == 0.0
+        assert observation.color_image is None
+        assert observation.joint_positions.shape == (NUM_JOINTS,)
+
+    def test_with_color_image(self, joint_positions: np.ndarray, color_image: np.ndarray) -> None:
+        observation = RobotObservation(joint_positions=joint_positions, color_image=color_image)
+        assert observation.color_image is not None
+        assert observation.color_image.shape == (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
+
+    def test_invalid_joint_shape(self) -> None:
+        with pytest.raises(ValueError, match="joint_positions shape"):
+            RobotObservation(joint_positions=np.zeros(3))
+
+    def test_invalid_image_shape(self, joint_positions: np.ndarray) -> None:
+        with pytest.raises(ValueError, match="color_image shape"):
+            RobotObservation(
+                color_image=np.zeros((10, 10, 3), dtype=np.uint8),
+                joint_positions=joint_positions,
+            )
+
+    def test_none_color_image(self, joint_positions: np.ndarray) -> None:
+        observation = RobotObservation(joint_positions=joint_positions, color_image=None)
+        assert observation.color_image is None
+
+
+class TestJointPositionCommand:  # shape validation and delta-to-absolute conversion
+    def test_valid_construction(self, joint_positions: np.ndarray) -> None:
+        command = JointPositionCommand(positions=joint_positions, timestamp_s=1.5)
+        assert command.timestamp_s == 1.5
+        assert command.positions.shape == (NUM_JOINTS,)
+
+    def test_invalid_shape(self) -> None:
+        with pytest.raises(ValueError, match="positions shape"):
+            JointPositionCommand(positions=np.zeros(4))
+
+    def test_as_absolute(self, rng: np.random.Generator) -> None:
+        offsets = rng.normal(0, 0.1, size=(NUM_JOINTS,))
+        baseline = rng.normal(0, 1.0, size=(NUM_JOINTS,))
+        # The command holds deltas; as_absolute() adds them onto the given joint positions.
+        shifted = JointPositionCommand(positions=offsets).as_absolute(baseline)
+        np.testing.assert_allclose(shifted.positions, baseline + offsets)
+
+    def test_as_absolute_preserves_timestamp(self, joint_positions: np.ndarray) -> None:
+        # Converting to absolute positions must leave the timestamp untouched.
+        shifted = JointPositionCommand(positions=joint_positions, timestamp_s=3.14).as_absolute(np.zeros(NUM_JOINTS))
+        assert shifted.timestamp_s == 3.14
+
+
+class TestRobotState:  # default field values and the effect of clear_episode()
+    def test_default_state(self) -> None:
+        blank = RobotState()
+        assert blank.action_queue == []
+        assert blank.is_episode_active is False
+        assert blank.episode_step == 0
+        assert blank.observation is None
+
+    def test_clear_episode(self, joint_positions: np.ndarray) -> None:
+        running = RobotState(
+            observation=RobotObservation(joint_positions=joint_positions),
+            episode_step=42,
+            is_episode_active=True,
+        )
+        running.clear_episode()
+        assert running.is_episode_active is False
+        assert running.episode_step == 0
+
+    def test_clear_episode_empties_queue(self, joint_positions: np.ndarray) -> None:
+        queued = RobotState(
+            action_queue=[
+                JointPositionCommand(positions=joint_positions),
+                JointPositionCommand(positions=joint_positions),
+            ],
+        )
+        queued.clear_episode()
+        assert queued.action_queue == []
diff --git a/evaluation/tests/test_run_local_lerobot_eval.py b/evaluation/tests/test_run_local_lerobot_eval.py
new file mode 100644
index 00000000..dc012a83
--- /dev/null
+++ b/evaluation/tests/test_run_local_lerobot_eval.py
@@ -0,0 +1,546 @@
+"""Tests for sil/scripts/run-local-lerobot-eval.py."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+import types
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+
+# Stub heavy / external deps before script import.
+if "pyarrow" not in sys.modules:
+    _pa = types.ModuleType("pyarrow")
+    _pq = types.ModuleType("pyarrow.parquet")
+    _pq.read_table = MagicMock()
+    _pa.parquet = _pq
+    sys.modules["pyarrow"] = _pa
+    sys.modules["pyarrow.parquet"] = _pq
+
+if "av" not in sys.modules:
+    sys.modules["av"] = types.ModuleType("av")
+
+# setdefault keeps any module that is already loaded; only missing names get an empty stub.
+for _n in ("lerobot", "lerobot.policies", "lerobot.policies.act", "lerobot.policies.act.modeling_act"):
+    sys.modules.setdefault(_n, types.ModuleType(_n))
+sys.modules.setdefault("safetensors", types.ModuleType("safetensors"))
+sys.modules.setdefault("safetensors.torch", types.ModuleType("safetensors.torch"))
+
+# The script file name contains hyphens, so it is loaded by path instead of a normal import.
+_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "run-local-lerobot-eval.py"
+_spec = importlib.util.spec_from_file_location("run_local_lerobot_eval", _SCRIPT)
+# spec_from_file_location may return None; fail here instead of with an AttributeError below.
+assert _spec and _spec.loader
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)
+
+
+# ---------------- helpers ----------------
+
+
+def _make_args(**overrides) -> SimpleNamespace:
+    """Return an argparse-like namespace of default CLI values with ``overrides`` applied."""
+    base = {
+        "policy_path": "/tmp/policy",
+        "model_name": None,
+        "model_version": None,
+        "dataset_dir": "/tmp/ds",
+        "episodes": 1,
+        "output_dir": "outputs/local-eval",
+        "device": "cpu",
+    }
+    return SimpleNamespace(**{**base, **overrides})
+
+
+def _patch_av(monkeypatch: pytest.MonkeyPatch, frames: list[np.ndarray]) -> None:
+    """Install a fake ``av`` module whose ``open`` decodes to the given ``frames``."""
+
+    class _FakeFrame:
+        def __init__(self, pixels):
+            self._pixels = pixels
+
+        def to_ndarray(self, format="rgb24"):
+            return self._pixels
+
+    class _FakeContainer:
+        def __init__(self):
+            # A single placeholder video stream; ``decode`` ignores which stream is passed.
+            self.streams = SimpleNamespace(video=[SimpleNamespace()])
+
+        def decode(self, _stream):
+            return [_FakeFrame(pixels) for pixels in frames]
+
+        def close(self):
+            pass
+
+    # Only ``open`` is provided on the fake module.
+    fake_av = types.ModuleType("av")
+    fake_av.open = lambda _path: _FakeContainer()
+    monkeypatch.setitem(sys.modules, "av", fake_av)
+
+
+def _write_info(dataset_dir: Path, fps: int = 30, action_dim: int = 6, state_dim: int = 6) -> dict:
+    """Write a minimal ``meta/info.json`` under ``dataset_dir`` and return the written mapping."""
+    # One video feature plus state/action vectors sized by the arguments.
+    features = {
+        "observation.images.color": {"dtype": "video", "shape": [96, 96, 3]},
+        "observation.state": {"dtype": "float32", "shape": [state_dim]},
+        "action": {"dtype": "float32", "shape": [action_dim]},
+    }
+    info = {"fps": fps, "chunks_size": 1000, "total_episodes": 1, "features": features}
+
+    # Create the meta directory on demand so repeated calls are safe.
+    target = dataset_dir / "meta" / "info.json"
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(json.dumps(info))
+    return info
+
+
+def _setup_run_evaluation(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+    *,
+    n_frames: int = 4,
+    n_dims: int = 6,
+    write_episodes_jsonl: bool = False,
+) -> dict:
+    """Build a synthetic dataset under ``tmp_path`` and stub everything ``run_evaluation`` loads.
+
+    All patches go through ``monkeypatch`` so they are undone when the calling test ends.
+    Returns a dict with the stub ``policy``, the written ``info`` and the fake ``frames``.
+    """
+    info = _write_info(tmp_path, action_dim=n_dims, state_dim=n_dims)
+
+    if write_episodes_jsonl:
+        (tmp_path / "meta" / "episodes.jsonl").write_text('{"episode_index": 0}\n')
+
+    # Create chunk-000/episode_000000.parquet (first candidate).
+    data_dir = tmp_path / "data" / "chunk-000"
+    data_dir.mkdir(parents=True)
+    (data_dir / "episode_000000.parquet").write_bytes(b"")
+
+    video_dir = tmp_path / "videos" / "observation.images.color" / "chunk-000"
+    video_dir.mkdir(parents=True)
+    (video_dir / "episode_000000.mp4").write_bytes(b"")
+
+    # Mock pq.read_table to return synthetic columnar data.
+    table = MagicMock()
+    table.column_names = ["timestamp", "observation.state", "action"]
+
+    state_list = [np.zeros(n_dims, dtype=np.float32).tolist() for _ in range(n_frames)]
+    action_list = [np.zeros(n_dims, dtype=np.float32).tolist() for _ in range(n_frames)]
+    ts_list = list(range(n_frames))
+    columns = {"timestamp": ts_list, "observation.state": state_list, "action": action_list}
+
+    def _getitem(col):
+        # Any column name other than the three above falls back to the action data.
+        m = MagicMock()
+        m.to_pylist.return_value = columns.get(col, action_list)
+        return m
+
+    table.__getitem__ = lambda self, col: _getitem(col)
+    monkeypatch.setattr(_mod.pq, "read_table", lambda _path: table)
+
+    # Stub video decoding.
+    frames = [np.zeros((96, 96, 3), dtype=np.uint8) for _ in range(n_frames)]
+    monkeypatch.setattr(_mod, "load_video_frames", lambda _p: frames)
+
+    # Stub policy loader.
+    policy = MagicMock()
+    policy.parameters.return_value = [torch.zeros(1)]
+    policy.select_action.return_value = torch.zeros(1, n_dims)
+    policy.to.return_value = policy
+    policy.reset = MagicMock()
+
+    # Patch through monkeypatch so the shared stub module is restored after each test.
+    act_mod = sys.modules["lerobot.policies.act.modeling_act"]
+    monkeypatch.setattr(act_mod, "ACTPolicy", SimpleNamespace(from_pretrained=lambda _path: policy), raising=False)
+
+    monkeypatch.setattr(_mod, "_load_normalizer_stats", lambda *_a, **_k: None)
+
+    return {"policy": policy, "info": info, "frames": frames}
+
+
+# ---------------- TestResolveDevice ----------------
+
+
+class TestResolveDevice:
+    def test_cuda_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+        assert _mod.resolve_device("cuda") == "cuda"  # request honoured as-is
+
+    def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _mod.resolve_device("cuda") == "mps"  # cuda missing, mps present
+
+    def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _mod.resolve_device("mps") == "mps"  # explicit mps request
+
+    def test_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False)
+        assert _mod.resolve_device("cuda") == "cpu"  # no accelerator reported at all
+
+    def test_cpu_explicit(self) -> None:
+        assert _mod.resolve_device("cpu") == "cpu"  # no backend patching needed
+
+
+# ---------------- TestFindDataFile ----------------
+
+
+class TestFindDataFile:
+    def test_first_candidate(self, tmp_path: Path) -> None:
+        # Only the chunk-000/episode_000000.parquet layout exists.
+        expected = tmp_path / "data" / "chunk-000" / "episode_000000.parquet"
+        expected.parent.mkdir(parents=True)
+        expected.write_bytes(b"")
+        assert _mod.find_data_file(str(tmp_path), 0, {"chunks_size": 1000}) == str(expected)
+
+    def test_second_candidate(self, tmp_path: Path) -> None:
+        # Only the alternative file-007.parquet name exists for episode 7.
+        expected = tmp_path / "data" / "chunk-007" / "file-007.parquet"
+        expected.parent.mkdir(parents=True)
+        expected.write_bytes(b"")
+        assert _mod.find_data_file(str(tmp_path), 7, {"chunks_size": 1000}) == str(expected)
+
+    def test_no_candidate_returns_none(self, tmp_path: Path) -> None:
+        assert _mod.find_data_file(str(tmp_path), 0, {}) is None
+
+
+# ---------------- TestFindVideoFile ----------------
+
+
+class TestFindVideoFile:
+    def test_first_candidate(self, tmp_path: Path) -> None:
+        # Only the chunk-000/episode_000000.mp4 layout exists for the "key" feature.
+        expected = tmp_path / "videos" / "key" / "chunk-000" / "episode_000000.mp4"
+        expected.parent.mkdir(parents=True)
+        expected.write_bytes(b"")
+        assert _mod.find_video_file(str(tmp_path), "key", 0, {"chunks_size": 1000}) == str(expected)
+
+    def test_second_candidate(self, tmp_path: Path) -> None:
+        # Only the alternative file-005.mp4 name exists for episode 5.
+        expected = tmp_path / "videos" / "key" / "chunk-005" / "file-005.mp4"
+        expected.parent.mkdir(parents=True)
+        expected.write_bytes(b"")
+        assert _mod.find_video_file(str(tmp_path), "key", 5, {"chunks_size": 1000}) == str(expected)
+
+    def test_no_candidate_returns_none(self, tmp_path: Path) -> None:
+        assert _mod.find_video_file(str(tmp_path), "key", 0, {}) is None
+
+
+# ---------------- TestLoadVideoFrames ----------------
+
+
+class TestLoadVideoFrames:
+    def test_decodes_frames(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        # Two distinct 4x4 RGB frames served by the fake ``av`` module.
+        fake_frames = [np.zeros((4, 4, 3), dtype=np.uint8), np.ones((4, 4, 3), dtype=np.uint8)]
+        _patch_av(monkeypatch, fake_frames)
+        decoded = _mod.load_video_frames("/tmp/x.mp4")
+        assert (len(decoded), decoded[0].shape) == (2, (4, 4, 3))
+
+
+# ---------------- TestDownloadAmlModel ----------------
+
+
+class TestDownloadAmlModel:
+    def test_finds_safetensors_in_subdir(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml) -> None:
+        download_root = tmp_path / "tmp" / "aml-model-download"
+        sub = download_root / "my-model" / "pretrained_model"
+        sub.mkdir(parents=True)
+        (sub / "model.safetensors").write_bytes(b"")
+        # The expected result below is cwd-relative, so run from tmp_path to reach this tree.
+        monkeypatch.chdir(tmp_path)
+
+        mock_ml, _ = mock_azure_ml
+        client = MagicMock()
+        monkeypatch.setattr(mock_ml, "MLClient", MagicMock(return_value=client), raising=False)
+
+        # Patch through monkeypatch so the azure.identity module is restored after the test.
+        monkeypatch.setattr(sys.modules["azure.identity"], "DefaultAzureCredential", MagicMock(), raising=False)
+
+        result = _mod.download_aml_model("my-model", "1")
+        assert result == Path("tmp/aml-model-download/my-model/pretrained_model")
+        client.models.download.assert_called_once()
+
+    def test_finds_bin_files(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml) -> None:
+        download_root = tmp_path / "tmp" / "aml-model-download"
+        sub = download_root / "m" / "checkpoint"
+        sub.mkdir(parents=True)
+        (sub / "weights.bin").write_bytes(b"")
+        monkeypatch.chdir(tmp_path)
+
+        mock_ml, _ = mock_azure_ml
+        monkeypatch.setattr(mock_ml, "MLClient", MagicMock(return_value=MagicMock()), raising=False)
+        monkeypatch.setattr(sys.modules["azure.identity"], "DefaultAzureCredential", MagicMock(), raising=False)
+
+        result = _mod.download_aml_model("m", "2")
+        assert result == Path("tmp/aml-model-download/m/checkpoint")
+
+    def test_returns_download_dir_when_no_match(
+        self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml
+    ) -> None:
+        # No model_name dir created → model_path = download_dir; iterdir on download_dir
+        # yields nothing matching → loop ends; returns download_dir.
+        monkeypatch.chdir(tmp_path)
+        mock_ml, _ = mock_azure_ml
+        monkeypatch.setattr(mock_ml, "MLClient", MagicMock(return_value=MagicMock()), raising=False)
+        monkeypatch.setattr(sys.modules["azure.identity"], "DefaultAzureCredential", MagicMock(), raising=False)
+
+        result = _mod.download_aml_model("missing", "9")
+        # Falls through to download_dir which has no matching files; the loop iterates
+        # over [download_dir] only because model_path.is_dir() is True but no glob
+        # matches; result remains download_dir.
+        assert result == Path("tmp/aml-model-download")
+
+
+# ---------------- TestLoadNormalizerStats ----------------
+
+
+class TestLoadNormalizerStats:
+    def test_no_files_returns_early(self, tmp_path: Path) -> None:
+        policy = MagicMock()
+        _mod._load_normalizer_stats(policy, tmp_path)
+        policy.load_state_dict.assert_not_called()
+
+    def test_skips_non_processor_files(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+        (tmp_path / "model.safetensors").write_bytes(b"")
+        fake_load = MagicMock(return_value={"observation.state.mean": torch.zeros(6)})
+        monkeypatch.setattr(sys.modules["safetensors.torch"], "load_file", fake_load, raising=False)
+        policy = MagicMock()
+        _mod._load_normalizer_stats(policy, tmp_path)
+        # File present but lacks "processor" → stats stays empty → early return.
+        policy.load_state_dict.assert_not_called()
+
+    def test_loads_matching_buffers(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+        (tmp_path / "preprocessor.safetensors").write_bytes(b"")
+        st = sys.modules["safetensors.torch"]
+        mean_t = torch.ones(6)
+        std_t = torch.full((6,), 2.0)
+        bad_short = torch.zeros(1)
+        loaded = {
+            "observation.state.mean": mean_t,
+            "observation.state.std": std_t,
+            "observation.state.median": bad_short,  # not in {mean,std,min,max}
+            "noseparator": bad_short,  # rsplit gives 1 part
+        }
+        # Patched through monkeypatch so the shared safetensors stub is restored afterwards.
+        monkeypatch.setattr(st, "load_file", MagicMock(return_value=loaded), raising=False)
+        policy = MagicMock()
+        policy.state_dict.return_value = {
+            "normalize_inputs.buffer_observation_state.mean": torch.zeros(6),
+            "normalize_inputs.buffer_observation_state.std": torch.zeros(6),
+        }
+        _mod._load_normalizer_stats(policy, tmp_path)
+        policy.load_state_dict.assert_called_once()
+        call = policy.load_state_dict.call_args
+        assert call.kwargs.get("strict") is False or call.args[1:] == (False,)
+
+    def test_no_matching_buffers_skips_load(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+        (tmp_path / "preprocessor.safetensors").write_bytes(b"")
+        fake_load = MagicMock(return_value={"observation.state.mean": torch.ones(6)})
+        monkeypatch.setattr(sys.modules["safetensors.torch"], "load_file", fake_load, raising=False)
+        policy = MagicMock()
+        # state_dict has no matching buffer name.
+        policy.state_dict.return_value = {"unrelated.weight": torch.zeros(2)}
+        _mod._load_normalizer_stats(policy, tmp_path)
+        policy.load_state_dict.assert_not_called()
+
+
+# ---------------- TestRunEvaluation ----------------
+
+
+class TestRunEvaluation:
+    def test_happy_path_writes_results(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5)
+        out = tmp_path / "out"
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(out), episodes=1)
+        _mod.run_evaluation(args)
+        assert (out / "eval_results.json").exists()
+        assert (out / "ep000_predictions.npz").exists()
+        assert (out / "plots" / "ep000_action_deltas.png").exists()
+
+    def test_strips_config_fields(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5)
+        policy_dir = tmp_path / "policy"
+        policy_dir.mkdir()
+        cfg = {"use_peft": True, "pretrained_path": "x", "peft_config": {}, "keep": 1}
+        (policy_dir / "config.json").write_text(json.dumps(cfg))
+        args = _make_args(
+            dataset_dir=str(tmp_path),
+            output_dir=str(tmp_path / "out"),
+            policy_path=str(policy_dir),
+        )
+        _mod.run_evaluation(args)
+        new_cfg = json.loads((policy_dir / "config.json").read_text())
+        assert "use_peft" not in new_cfg
+        assert new_cfg["keep"] == 1
+
+    def test_config_without_strip_fields_unchanged(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5)
+        policy_dir = tmp_path / "policy"
+        policy_dir.mkdir()
+        (policy_dir / "config.json").write_text(json.dumps({"keep": 1}))
+        args = _make_args(
+            dataset_dir=str(tmp_path),
+            output_dir=str(tmp_path / "out"),
+            policy_path=str(policy_dir),
+        )
+        _mod.run_evaluation(args)
+        assert json.loads((policy_dir / "config.json").read_text()) == {"keep": 1}
+
+    def test_skip_no_data(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4)
+        # Remove the data file → find_data_file returns None.
+        (tmp_path / "data" / "chunk-000" / "episode_000000.parquet").unlink()
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        # No metrics → early return, no eval_results.json.
+        assert not (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_skip_no_video(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4)
+        (tmp_path / "videos" / "observation.images.color" / "chunk-000" / "episode_000000.mp4").unlink()
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        assert not (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_image_key_fallback(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Write info with no video/image features → falls back to default key.
+        features = {
+            "observation.state": {"dtype": "float32", "shape": [6]},
+            "action": {"dtype": "float32", "shape": [6]},
+        }
+        info = {"fps": 30, "chunks_size": 1000, "total_episodes": 1, "features": features}
+        (tmp_path / "meta").mkdir()
+        (tmp_path / "meta" / "info.json").write_text(json.dumps(info))
+        # No data file → skipped, but path through image_key fallback must execute.
+        policy = MagicMock()
+        policy.parameters.return_value = [torch.zeros(1)]
+        policy.to.return_value = policy
+        # Patch through monkeypatch so the shared lerobot stub is restored after the test.
+        act_mod = sys.modules["lerobot.policies.act.modeling_act"]
+        monkeypatch.setattr(act_mod, "ACTPolicy", SimpleNamespace(from_pretrained=lambda _p: policy), raising=False)
+        monkeypatch.setattr(_mod, "_load_normalizer_stats", lambda *_a, **_k: None)
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        # Same skip path as test_skip_no_data: nothing evaluated, so no results file.
+        assert not (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_episodes_jsonl_total(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, write_episodes_jsonl=True)
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"), episodes=10)
+        _mod.run_evaluation(args)
+        assert (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_n_dims_one_branch(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, n_dims=1)
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        assert (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_many_dims_small_labelsize(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, n_dims=10)
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        assert (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_step_print_at_end(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # n_frames=5 → num_steps=4 → step iterates 0..3, both branches of `step<3 or last` fire.
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5)
+        args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"))
+        _mod.run_evaluation(args)
+        assert (tmp_path / "out" / "eval_results.json").exists()
+
+    def test_aml_branch_invokes_download(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4)
+        called = {}
+
+        def fake_download(name, version):
+            called["name"] = name
+            called["version"] = version
+            return tmp_path / "downloaded"
+
+        monkeypatch.setattr(_mod, "download_aml_model", fake_download)
+        args = _make_args(
+            dataset_dir=str(tmp_path),
+            output_dir=str(tmp_path / "out"),
+            policy_path=None,
+            model_name="m",
+            model_version="1",
+        )
+        _mod.run_evaluation(args)
+        assert called["name"] == "m" and called["version"] == "1"
+
+
+# ---------------- TestMain ----------------
+
+
+class TestMain:
+    """CLI entry point: argument validation and hand-off to ``run_evaluation``."""
+
+    def test_no_policy_source_errors(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Neither --policy-path nor --model-name is given.
+        argv = ["run-local-lerobot-eval", "--dataset-dir", str(tmp_path)]
+        monkeypatch.setattr(sys, "argv", argv)
+        with pytest.raises(SystemExit):
+            _mod.main()
+
+    def test_missing_dataset_exits(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # The dataset directory passed on the command line does not exist.
+        argv = [
+            "run-local-lerobot-eval",
+            "--policy-path",
+            "/x",
+            "--dataset-dir",
+            str(tmp_path / "missing"),
+        ]
+        monkeypatch.setattr(sys, "argv", argv)
+        with pytest.raises(SystemExit) as raised:
+            _mod.main()
+        assert raised.value.code == 1
+
+    def test_invokes_run_evaluation_policy_path(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Capture the namespace handed to run_evaluation instead of running it.
+        seen = {}
+        monkeypatch.setattr(_mod, "run_evaluation", lambda ns: seen.setdefault("ns", ns))
+        argv = [
+            "run-local-lerobot-eval",
+            "--policy-path",
+            "/x",
+            "--dataset-dir",
+            str(tmp_path),
+        ]
+        monkeypatch.setattr(sys, "argv", argv)
+        _mod.main()
+        # The parsed policy path must reach run_evaluation unchanged.
+        assert seen["ns"].policy_path == "/x"
+
+    def test_invokes_run_evaluation_aml(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Capture the namespace handed to run_evaluation instead of running it.
+        seen = {}
+        monkeypatch.setattr(_mod, "run_evaluation", lambda ns: seen.setdefault("ns", ns))
+        argv = [
+            "run-local-lerobot-eval",
+            "--model-name",
+            "m",
+            "--model-version",
+            "2",
+            "--dataset-dir",
+            str(tmp_path),
+        ]
+        monkeypatch.setattr(sys, "argv", argv)
+        _mod.main()
+        # The model name/version pair is accepted as the policy source.
+        assert seen["ns"].model_name == "m"
diff --git a/evaluation/tests/test_test_lerobot_eval.py b/evaluation/tests/test_test_lerobot_eval.py
new file mode 100644
index 00000000..219f3c21
--- /dev/null
+++ b/evaluation/tests/test_test_lerobot_eval.py
@@ -0,0 +1,246 @@
+"""Unit tests for ``sil/scripts/test-lerobot-eval.py``."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+import types
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+
+# Stub heavy/native deps before loading the script.
+if "pyarrow" not in sys.modules:
+    _pa = types.ModuleType("pyarrow")
+    _pa_pq = types.ModuleType("pyarrow.parquet")
+    _pa_pq.read_table = MagicMock()
+    _pa.parquet = _pa_pq  # type: ignore[attr-defined]
+    sys.modules["pyarrow"] = _pa
+    sys.modules["pyarrow.parquet"] = _pa_pq
+
+if "av" not in sys.modules:
+    sys.modules["av"] = types.ModuleType("av")
+
+# lerobot.* imports happen inside run_inference_test only; ensure stubs exist.
+for _name in ("lerobot", "lerobot.policies", "lerobot.policies.act", "lerobot.processor"):
+    sys.modules.setdefault(_name, types.ModuleType(_name))
+sys.modules.setdefault("lerobot.policies.act.modeling_act", types.ModuleType("lerobot.policies.act.modeling_act"))
+sys.modules.setdefault("lerobot.processor.pipeline", types.ModuleType("lerobot.processor.pipeline"))
+
+_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "test-lerobot-eval.py"
+_spec = importlib.util.spec_from_file_location("test_lerobot_eval_script", _SCRIPT)
+assert _spec and _spec.loader
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)
+
+
+# ---------------------------------------------------------------------------
+# Pure helpers
+# ---------------------------------------------------------------------------
+
+
+class TestResolveDevice:
+    def test_cuda_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+        assert _mod.resolve_device("cuda") == "cuda"  # request honoured as-is
+
+    def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _mod.resolve_device("cuda") == "mps"  # cuda missing, mps present
+
+    def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True)
+        assert _mod.resolve_device("mps") == "mps"  # explicit mps request
+
+    def test_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+        monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False)
+        assert _mod.resolve_device("cuda") == "cpu"  # no accelerator reported at all
+
+    def test_cpu_explicit(self) -> None:
+        assert _mod.resolve_device("cpu") == "cpu"  # no backend patching needed
+
+
+class TestBuildObservation:
+    def test_returns_expected_keys_and_shapes(self) -> None:
+        # A (4, 4, 3) uint8 image goes in; the observation image comes back as (3, 4, 4), max ≤ 1.0.
+        built = _mod.build_observation(np.zeros(6, dtype=np.float32), np.zeros((4, 4, 3), dtype=np.uint8))
+        assert set(built.keys()) == {"observation.state", "observation.images.color"}
+        assert built["observation.state"].shape == (6,)
+        color = built["observation.images.color"]
+        assert color.shape == (3, 4, 4)
+        assert float(color.max()) <= 1.0
+
+
+class TestLoadEpisodeData:
+    def test_returns_dict_of_columns(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        fake_table = MagicMock(column_names=["timestamp", "action"])
+        # Every column lookup yields an object whose to_pylist() returns the same three values.
+        fake_table.__getitem__.side_effect = lambda _name: MagicMock(to_pylist=lambda: [1, 2, 3])
+        monkeypatch.setattr(_mod.pq, "read_table", lambda _p: fake_table)
+        columns = _mod.load_episode_data(str(tmp_path), 0)
+        assert columns == {"timestamp": [1, 2, 3], "action": [1, 2, 3]}
+
+
+class TestLoadVideoFrame:
+    def _patch_av(self, monkeypatch: pytest.MonkeyPatch, frames: list[np.ndarray]) -> MagicMock:
+        """Install a MagicMock ``av`` module and return the container its ``open`` yields."""
+        decoded = []
+        for pixels in frames:
+            av_frame = MagicMock()
+            av_frame.to_ndarray.return_value = pixels
+            decoded.append(av_frame)
+
+        container = MagicMock()
+        container.streams.video = [MagicMock()]
+        container.decode.return_value = decoded
+        av_mod = MagicMock()
+        av_mod.open.return_value = container
+        monkeypatch.setitem(sys.modules, "av", av_mod)
+        return container
+
+    def test_returns_target_frame(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # Frame i is filled with the value i, so the pixel value identifies the frame index.
+        container = self._patch_av(monkeypatch, [np.full((2, 2, 3), i, dtype=np.uint8) for i in range(3)])
+        picked = _mod.load_video_frame(str(tmp_path), 0, 1)
+        assert picked[0, 0, 0] == 1
+        container.close.assert_called()
+
+    def test_missing_frame_raises(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        self._patch_av(monkeypatch, [np.zeros((2, 2, 3), dtype=np.uint8)])
+        with pytest.raises(IndexError):
+            _mod.load_video_frame(str(tmp_path), 0, 99)
+
+
+# ---------------------------------------------------------------------------
+# main / run_inference_test
+# ---------------------------------------------------------------------------
+
+
+def _make_args(**overrides: object) -> types.SimpleNamespace:  # argparse-like namespace of defaults
+    defaults = dict(
+        policy_repo="repo",
+        dataset_dir="/tmp/ds",
+        episode=0,
+        start_frame=0,
+        num_steps=2,
+        device="cpu",
+        output=None,
+    )
+    defaults.update(overrides)  # keyword arguments win over the defaults above
+    return types.SimpleNamespace(**defaults)
+
+
+def _setup_run_inference_test(monkeypatch: pytest.MonkeyPatch, tmp_path: Path, num_frames: int = 5) -> dict:
+    """Stub the heavy dependencies of run_inference_test; return the policy and processor mocks."""
+    # Mock ACTPolicy + PolicyProcessorPipeline via the modules already in sys.modules.
+    act_mod = sys.modules["lerobot.policies.act.modeling_act"]
+    pipeline_mod = sys.modules["lerobot.processor.pipeline"]
+
+    policy = MagicMock()
+    # parameters() must yield tensor-likes with .numel()
+    param = MagicMock()
+    param.numel.return_value = 1_000_000
+    policy.parameters.return_value = [param, param]
+    act_policy_cls = MagicMock()
+    act_policy_cls.from_pretrained.return_value = policy
+    monkeypatch.setattr(act_mod, "ACTPolicy", act_policy_cls, raising=False)  # stub may not define it yet
+
+    preprocessor = MagicMock(side_effect=lambda x: x)  # passes its input through unchanged
+    preprocessor.steps = []
+    postprocessor = MagicMock(return_value={"action": torch.zeros(1, 6)})
+    postprocessor.steps = []
+    pipeline_cls = MagicMock()
+    pipeline_cls.from_pretrained.side_effect = [preprocessor, postprocessor]  # returned in call order
+    monkeypatch.setattr(pipeline_mod, "PolicyProcessorPipeline", pipeline_cls, raising=False)
+
+    # Dataset info file written to tmp_path/meta/info.json.
+    info = {
+        "fps": 30,
+        "features": {
+            "action": {"shape": [6]},
+            "observation.state": {"shape": [6]},
+            "observation.images.color": {"shape": [3, 480, 640]},
+        },
+    }
+    meta_dir = tmp_path / "meta"
+    meta_dir.mkdir()
+    (meta_dir / "info.json").write_text(json.dumps(info))
+
+    # Episode parquet data: the loader is replaced, so no parquet file is needed.
+    ep_data = {
+        "timestamp": list(range(num_frames)),
+        "observation.state": [[0.0] * 6 for _ in range(num_frames)],
+        "action": [[0.1] * 6 for _ in range(num_frames)],
+    }
+    monkeypatch.setattr(_mod, "load_episode_data", lambda d, e: ep_data)
+    monkeypatch.setattr(
+        _mod,
+        "load_video_frame",
+        lambda d, e, f: np.zeros((480, 640, 3), dtype=np.uint8),
+    )
+
+    # Force device to cpu.
+    monkeypatch.setattr(_mod, "resolve_device", lambda r: "cpu")
+    return {"policy": policy, "preprocessor": preprocessor, "postprocessor": postprocessor}
+
+
+class TestRunInferenceTest:
+    def test_completes_and_writes_output(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        # num_steps=10 exceeds the five stubbed frames; the run must still finish and save.
+        _setup_run_inference_test(monkeypatch, tmp_path)
+        target = tmp_path / "preds.npz"
+        _mod.run_inference_test(_make_args(dataset_dir=str(tmp_path), num_steps=10, output=str(target)))
+        assert target.exists()
+
+    def test_no_output_skips_save(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_inference_test(monkeypatch, tmp_path)
+        # With output=None the run should simply complete without raising.
+        _mod.run_inference_test(_make_args(dataset_dir=str(tmp_path), num_steps=2, output=None))
+
+    def test_warns_on_degenerate_outputs(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys) -> None:
+        postprocessor = _setup_run_inference_test(monkeypatch, tmp_path)["postprocessor"]
+        # Every predicted value is NaN except a single Inf entry.
+        degenerate = torch.full((1, 6), float("nan"))
+        degenerate[0, 0] = float("inf")
+        postprocessor.return_value = {"action": degenerate}
+        _mod.run_inference_test(_make_args(dataset_dir=str(tmp_path), num_steps=3))
+        printed = capsys.readouterr().out
+        # Both kinds of non-finite value must be reported on stdout.
+        assert "NaN" in printed
+        assert "Inf" in printed
+
+    def test_zero_variance_warning(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys) -> None:
+        _setup_run_inference_test(monkeypatch, tmp_path)
+        # The default postprocessor stub returns all-zero actions at every step.
+        _mod.run_inference_test(_make_args(dataset_dir=str(tmp_path), num_steps=3))
+        printed = capsys.readouterr().out
+        # Identical predictions across steps should trigger the collapse warning.
+        assert "mode collapse" in printed
+
+
+class TestMain:
+    def test_missing_dataset_exits(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        argv = ["test-lerobot-eval", "--dataset-dir", str(tmp_path / "missing")]
+        monkeypatch.setattr(sys, "argv", argv)
+        with pytest.raises(SystemExit) as raised:
+            _mod.main()
+        assert raised.value.code == 1
+
+    def test_invokes_run_inference(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+        _setup_run_inference_test(monkeypatch, tmp_path)
+        # Record the namespace main() hands to run_inference_test instead of running it.
+        received = []
+        monkeypatch.setattr(_mod, "run_inference_test", received.append)
+
+        argv = ["test-lerobot-eval", "--dataset-dir", str(tmp_path), "--num-steps", "1"]
+        monkeypatch.setattr(sys, "argv", argv)
+        _mod.main()
+        assert received[-1].dataset_dir == str(tmp_path)
diff --git a/evaluation/tests/test_upload_artifacts.py b/evaluation/tests/test_upload_artifacts.py
new file mode 100644
index 00000000..ca758ccc
--- /dev/null
+++ b/evaluation/tests/test_upload_artifacts.py
@@ -0,0 +1,656 @@
+"""Unit tests for ``metrics.upload_artifacts``."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from metrics.upload_artifacts import (
+    get_video_search_paths,
+    load_metrics,
+    main,
+    upload_to_blob_fallback,
+    upload_to_mlflow,
+)
+
+
+class TestLoadMetrics:
+    """``load_metrics`` yields an ``(onnx, jit)`` pair; these tests expect ``{}`` for a missing file."""
+
+    def test_both_files_exist(self, tmp_path: Path) -> None:
+        onnx_expected, jit_expected = {"latency_ms": 12.3}, {"latency_ms": 45.6}
+        (tmp_path / "onnx_metrics.json").write_text(json.dumps(onnx_expected))
+        (tmp_path / "jit_metrics.json").write_text(json.dumps(jit_expected))
+
+        onnx_loaded, jit_loaded = load_metrics(tmp_path)
+
+        assert (onnx_loaded, jit_loaded) == (onnx_expected, jit_expected)
+
+    def test_only_onnx(self, tmp_path: Path) -> None:
+        (tmp_path / "onnx_metrics.json").write_text(json.dumps({"a": 1}))
+
+        onnx_loaded, jit_loaded = load_metrics(tmp_path)
+
+        assert jit_loaded == {}
+        assert onnx_loaded == {"a": 1}
+
+    def test_only_jit(self, tmp_path: Path) -> None:
+        (tmp_path / "jit_metrics.json").write_text(json.dumps({"b": 2}))
+
+        onnx_loaded, jit_loaded = load_metrics(tmp_path)
+
+        assert jit_loaded == {"b": 2}
+        assert onnx_loaded == {}
+
+    def test_neither_exists(self, tmp_path: Path) -> None:
+        # Nothing is written first, so neither metrics file exists in ``tmp_path``.
+        onnx_loaded, jit_loaded = load_metrics(tmp_path)
+
+        assert (onnx_loaded, jit_loaded) == ({}, {})
+
+
+class TestGetVideoSearchPaths:
+    def test_returns_five_paths(self, tmp_path: Path) -> None:
+        candidates = get_video_search_paths(tmp_path / "export")
+        assert len(candidates) == 5
+        assert not [entry for entry in candidates if not isinstance(entry, Path)]
+
+    def test_first_path_is_relative(self, tmp_path: Path) -> None:
+        """The first candidate is the ``videos`` folder inside the export directory."""
+        first = get_video_search_paths(tmp_path / "export")[0]
+        assert first == tmp_path / "export" / "videos"
+
+    def test_parent_videos_path(self, tmp_path: Path) -> None:
+        """The second candidate is ``videos`` next to (not inside) the export directory."""
+        second = get_video_search_paths(tmp_path / "nested" / "export")[1]
+        assert second == tmp_path / "nested" / "videos"
+
+
+class TestUploadToMlflow:
+    @staticmethod
+    def _inject_mock_modules(monkeypatch):
+        """Install stand-ins for ``mlflow``, ``training`` and ``training.utils`` in ``sys.modules``.
+
+        Returns ``(mlflow_mock, training_utils_mock, AzureConfigError_class)`` so a test can
+        adjust behaviour or inspect calls; ``monkeypatch`` reverts the entries afterwards.
+        """
+        # ``with mlflow.start_run() as run`` yields an object whose ``info.run_id`` is "run-123".
+        active_run = MagicMock(**{"info.run_id": "run-123"})
+        fake_mlflow = MagicMock()
+        fake_mlflow.start_run.return_value.__enter__.return_value = active_run
+        fake_mlflow.start_run.return_value.__exit__.return_value = False
+
+        azure_error_cls = type("AzureConfigError", (RuntimeError,), {})
+        # Bootstrap context without a storage client (``storage=None``).
+        azure_context = MagicMock(workspace_name="ws", tracking_uri="https://tracking", storage=None)
+        fake_utils = MagicMock(AzureConfigError=azure_error_cls)
+        fake_utils.bootstrap_azure_ml.return_value = azure_context
+
+        replacements = {"mlflow": fake_mlflow, "training": MagicMock(), "training.utils": fake_utils}
+        for module_name, stand_in in replacements.items():
+            monkeypatch.setitem(sys.modules, module_name, stand_in)
+
+        return fake_mlflow, fake_utils, azure_error_cls
+
+    def test_import_error_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.setitem(sys.modules, "mlflow", None)  # None entry: ``import mlflow`` raises ImportError
+        uploaded = upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            onnx_success=False,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert uploaded is False
+
+    def test_azure_config_error_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        _, fake_utils, azure_error_cls = self._inject_mock_modules(monkeypatch)
+        fake_utils.bootstrap_azure_ml.side_effect = azure_error_cls("nope")  # bootstrap raises the config error
+        uploaded = upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            onnx_success=False,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert uploaded is False
+
+    def test_connection_error_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        _, mock_utils, _ = self._inject_mock_modules(monkeypatch)  # use the returned mock, like sibling tests
+        mock_utils.bootstrap_azure_ml.side_effect = ConnectionError("refused")
+        result = upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="",
+            onnx_success=False,
+            jit_success=False,
+            onnx_metrics={},
+            jit_metrics={},
+            timestamp="20240101_000000",
+        )
+        assert result is False
+
+    def test_success_returns_true(self, tmp_path: Path, monkeypatch) -> None:
+        """Happy path: one MLflow run is started and tags/params are each logged exactly once."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        fake_mlflow, _, _ = self._inject_mock_modules(monkeypatch)
+        uploaded = upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=True,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert uploaded is True
+        fake_mlflow.start_run.assert_called_once()
+        fake_mlflow.set_tags.assert_called_once()
+        fake_mlflow.log_params.assert_called_once()
+
+    def test_onnx_metrics_logged(self, tmp_path: Path, monkeypatch) -> None:
+        """Non-empty ONNX metrics lead to at least two ``log_metrics`` calls and an artifact upload."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "onnx"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        fake_mlflow, _, _ = self._inject_mock_modules(monkeypatch)
+        upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=True,
+            onnx_metrics={"mean_episode_reward": 5.0, "total_episodes": 10},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert fake_mlflow.log_metrics.call_count >= 2
+        fake_mlflow.log_artifact.assert_called()
+
+    def test_storage_upload_called(self, tmp_path: Path, monkeypatch) -> None:
+        """With a storage client on the context and a model file present, one batch upload is issued."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        _, fake_utils, _ = self._inject_mock_modules(monkeypatch)
+        storage_client = MagicMock()
+        fake_utils.bootstrap_azure_ml.return_value.storage = storage_client
+
+        (tmp_path / "policy.onnx").write_bytes(b"data")
+        upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=True,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        storage_client.upload_files_batch.assert_called_once()
+
+    def test_storage_no_files_to_upload(self, tmp_path: Path, monkeypatch) -> None:
+        """An empty export directory still succeeds but never triggers a batch upload."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        _, fake_utils, _ = self._inject_mock_modules(monkeypatch)
+        storage_client = MagicMock()
+        fake_utils.bootstrap_azure_ml.return_value.storage = storage_client
+
+        empty_export = tmp_path / "empty_export"
+        empty_export.mkdir()
+        uploaded = upload_to_mlflow(
+            task="t",
+            export_dir=empty_export,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=False,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert uploaded is True
+        storage_client.upload_files_batch.assert_not_called()
+
+    def test_jit_metrics_logged(self, tmp_path: Path, monkeypatch) -> None:
+        """JIT metrics are logged under the ``jit/`` key prefix and an artifact is uploaded."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "jit"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        fake_mlflow, _, _ = self._inject_mock_modules(monkeypatch)
+        upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=False,
+            onnx_metrics={},
+            jit_success=True,
+            jit_metrics={"mean_episode_reward": 7.0, "total_episodes": 5},
+        )
+
+        # The first positional argument of each ``log_metrics`` call is iterated for its keys;
+        # pool the keys from all calls before looking for the prefixed JIT entry.
+        metric_names = {key for call in fake_mlflow.log_metrics.call_args_list for key in call[0][0]}
+        assert "jit/mean_episode_reward" in metric_names
+        fake_mlflow.log_artifact.assert_called()
+
+    def test_video_path_classification(self, tmp_path: Path, monkeypatch) -> None:
+        """Expect ``videos/onnx``, ``videos/jit`` and ``videos`` artifact paths for the three clips."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        export_dir = tmp_path / "export"
+        videos_dir = export_dir / "videos"
+        videos_dir.mkdir(parents=True)
+        for clip_name in ("onnx_run.mp4", "jit_run.mp4", "general_run.mp4"):
+            (videos_dir / clip_name).write_bytes(b"\x00")
+
+        mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch)
+        upload_to_mlflow(
+            task="t",
+            export_dir=export_dir,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            onnx_success=True,
+            jit_success=True,
+            onnx_metrics={},
+            jit_metrics={},
+            timestamp="20240101_000000",
+        )
+
+        # ``.get``: a ``log_artifact`` call without an ``artifact_path`` keyword must not raise KeyError here.
+        artifact_paths = [c.kwargs.get("artifact_path") for c in mock_mlflow.log_artifact.call_args_list]
+        assert "videos/onnx" in artifact_paths
+        assert "videos/jit" in artifact_paths
+        assert "videos" in artifact_paths
+
+    def test_storage_upload_includes_videos(self, tmp_path: Path, monkeypatch) -> None:
+        """The storage batch is expected to list both the exported model and the recorded video."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        _, mock_utils, _ = self._inject_mock_modules(monkeypatch)
+        mock_storage = MagicMock()
+        mock_utils.bootstrap_azure_ml.return_value.storage = mock_storage
+
+        export_dir = tmp_path / "export"
+        videos_dir = export_dir / "videos"
+        videos_dir.mkdir(parents=True)
+        (export_dir / "policy.onnx").write_bytes(b"data")
+        (videos_dir / "test.mp4").write_bytes(b"\x00")
+
+        upload_to_mlflow(
+            task="t",
+            export_dir=export_dir,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            onnx_success=True,
+            jit_success=False,
+            onnx_metrics={},
+            jit_metrics={},
+            timestamp="20240101_000000",
+        )
+
+        mock_storage.upload_files_batch.assert_called()  # otherwise ``call_args`` below would be ``None``
+        files = mock_storage.upload_files_batch.call_args[0][0]
+        blob_names = [f[1] for f in files]
+        assert any("models/policy.onnx" in n for n in blob_names)
+        assert any("videos/test.mp4" in n for n in blob_names)
+
+    def test_mlflow_run_exception_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        """An exception raised by ``mlflow.start_run`` is reported as a ``False`` result."""
+        run_env = {"NUM_ENVS": "4", "MAX_STEPS": "500", "VIDEO_LENGTH": "200", "INFERENCE_FORMAT": "both"}
+        for name, value in run_env.items():
+            monkeypatch.setenv(name, value)
+
+        fake_mlflow, _, _ = self._inject_mock_modules(monkeypatch)
+        fake_mlflow.start_run.side_effect = RuntimeError("boom")
+
+        uploaded = upload_to_mlflow(
+            task="t",
+            export_dir=tmp_path,
+            metrics_dir=tmp_path,
+            checkpoint_uri="uri",
+            timestamp="20240101_000000",
+            onnx_success=True,
+            onnx_metrics={},
+            jit_success=False,
+            jit_metrics={},
+        )
+        assert uploaded is False
+
+
+class TestUploadToBlobFallback:
+    @staticmethod
+    def _inject_azure_mocks(monkeypatch):
+        """Replace the ``azure`` package tree in ``sys.modules`` with mocks.
+
+        Returns ``(blob_module_mock, container_client_mock)``; tests inspect how
+        ``BlobServiceClient`` was constructed and which ``upload_blob`` calls were made.
+        """
+        fake_identity = MagicMock()
+        fake_identity.DefaultAzureCredential.return_value = MagicMock()
+
+        # BlobServiceClient(...) -> service mock; service.get_container_client(...) -> container mock.
+        container_client = MagicMock()
+        fake_blob_module = MagicMock()
+        fake_blob_module.BlobServiceClient.return_value.get_container_client.return_value = container_client
+
+        monkeypatch.setitem(sys.modules, "azure", MagicMock())
+        monkeypatch.setitem(sys.modules, "azure.identity", fake_identity)
+        monkeypatch.setitem(sys.modules, "azure.storage", MagicMock())
+        monkeypatch.setitem(sys.modules, "azure.storage.blob", fake_blob_module)
+        return fake_blob_module, container_client
+
+    def test_no_storage_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.delenv("AZURE_STORAGE_ACCOUNT_NAME", raising=False)  # no account from the environment
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="",
+            blob_container="",
+        )
+        assert uploaded is False
+
+    def test_import_error_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        """Without an importable Azure SDK the fallback reports ``False`` instead of raising."""
+        # ``None`` entries in ``sys.modules`` make the corresponding imports raise ImportError.
+        for module_name in ("azure", "azure.identity", "azure.storage", "azure.storage.blob"):
+            monkeypatch.setitem(sys.modules, module_name, None)
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="myaccount",
+            blob_container="mycontainer",
+        )
+        assert uploaded is False
+
+    def test_url_parsing_extracts_account(self, tmp_path: Path, monkeypatch) -> None:
+        """With no explicit account, the one in the checkpoint URL must reach ``BlobServiceClient``."""
+        fake_blob_module, _ = self._inject_azure_mocks(monkeypatch)
+        upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="https://myaccount.blob.core.windows.net/mycontainer/path/model.pt",
+            timestamp="20240101_000000",
+            blob_account="",
+            blob_container="",
+        )
+        assert "myaccount" in str(fake_blob_module.BlobServiceClient.call_args)
+
+    def test_https_url_non_blob_falls_through(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.delenv("AZURE_STORAGE_CONTAINER_NAME", raising=False)
+        monkeypatch.delenv("AZURE_STORAGE_ACCOUNT_NAME", raising=False)
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="https://example.com/not-a-blob/file",  # HTTPS, but not a blob-storage host
+            timestamp="20240101_000000",
+            blob_account="",
+            blob_container="",
+        )
+        assert uploaded is False
+
+    def test_env_var_fallback(self, tmp_path: Path, monkeypatch) -> None:
+        """With no explicit account or URL, ``AZURE_STORAGE_ACCOUNT_NAME`` reaches ``BlobServiceClient``."""
+        monkeypatch.setenv("AZURE_STORAGE_ACCOUNT_NAME", "envaccount")
+        monkeypatch.setenv("AZURE_STORAGE_CONTAINER_NAME", "envcontainer")
+        fake_blob_module, _ = self._inject_azure_mocks(monkeypatch)
+        upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="",
+            blob_container="",
+        )
+        assert "envaccount" in str(fake_blob_module.BlobServiceClient.call_args)
+
+    def test_success_with_policy_files(self, tmp_path: Path, monkeypatch) -> None:
+        _, container_client = self._inject_azure_mocks(monkeypatch)
+        (tmp_path / "policy.onnx").write_bytes(b"model-data")  # the single file available for upload
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="acct",
+            blob_container="ctr",
+        )
+        assert uploaded is True
+        container_client.upload_blob.assert_called()
+
+    def test_no_files_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        self._inject_azure_mocks(monkeypatch)  # SDK mocks in place, but ``tmp_path`` holds nothing to upload
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="acct",
+            blob_container="ctr",
+        )
+        assert uploaded is False
+
+    def test_per_file_upload_exception_continues(self, tmp_path: Path, monkeypatch) -> None:
+        for file_name, payload in (("policy.onnx", b"model-onnx"), ("policy.jit", b"model-jit")):
+            (tmp_path / file_name).write_bytes(payload)
+        _, container_client = self._inject_azure_mocks(monkeypatch)
+        container_client.upload_blob.side_effect = [Exception("fail"), None]  # 1st call raises, 2nd succeeds
+
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="acct",
+            blob_container="ctr",
+        )
+        assert uploaded is True
+        assert container_client.upload_blob.call_count == 2
+
+    def test_video_upload_success_and_exception(self, tmp_path: Path, monkeypatch) -> None:
+        videos_dir = tmp_path / "videos"
+        videos_dir.mkdir()
+        (tmp_path / "policy.onnx").write_bytes(b"model-data")
+        (videos_dir / "ep1.mp4").write_bytes(b"video-1")
+        (videos_dir / "ep2.mp4").write_bytes(b"video-2")
+        _, container_client = self._inject_azure_mocks(monkeypatch)
+
+        # Upload outcomes in call order: model OK, one video OK, the other video raises.
+        container_client.upload_blob.side_effect = [None, None, Exception("video-fail")]
+
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="acct",
+            blob_container="ctr",
+        )
+        assert uploaded is True
+        # Three attempts in total: the model plus both videos, even though one video failed.
+        assert container_client.upload_blob.call_count == 3
+        names = [attempt.kwargs.get("name", "") for attempt in container_client.upload_blob.call_args_list]
+        assert any("videos/ep1.mp4" in blob_name for blob_name in names)
+        assert any("videos/ep2.mp4" in blob_name for blob_name in names)
+
+    def test_credential_exception_returns_false(self, tmp_path: Path, monkeypatch) -> None:
+        """A failure while constructing ``DefaultAzureCredential`` is reported as ``False``."""
+        failing_identity = MagicMock()
+        failing_identity.DefaultAzureCredential.side_effect = Exception("auth failed")
+        for module_name in ("azure", "azure.storage", "azure.storage.blob"):
+            monkeypatch.setitem(sys.modules, module_name, MagicMock())
+        monkeypatch.setitem(sys.modules, "azure.identity", failing_identity)
+
+        uploaded = upload_to_blob_fallback(
+            task="t",
+            export_dir=tmp_path,
+            checkpoint_uri="",
+            timestamp="20240101_000000",
+            blob_account="acct",
+            blob_container="ctr",
+        )
+        assert uploaded is False
+
+
+class TestMain:
+    """Tests for the main() entry point."""
+
+    @staticmethod
+    def _env_vars(tmp_path: Path) -> dict[str, str]:
+        return dict(  # baseline environment for ``main()``; tests copy and tweak it
+            TASK="pick_place",
+            EXPORT_DIR=str(tmp_path / "exported"),  # paths are only named here, not created
+            METRICS_DIR=str(tmp_path / "metrics"),
+            ONNX_SUCCESS="1",
+            JIT_SUCCESS="0",
+            NUM_ENVS="4",
+            MAX_STEPS="500",
+            VIDEO_LENGTH="200",
+            INFERENCE_FORMAT="both",
+            CHECKPOINT_URI="https://example.com/model.pt",
+            BLOB_STORAGE_ACCOUNT="acct",
+            BLOB_CONTAINER="ctr",
+        )
+
+    def test_mlflow_success_skips_blob_fallback(self, tmp_path: Path, monkeypatch) -> None:
+        set_defaults_spy = MagicMock()
+        monkeypatch.setitem(sys.modules, "training", MagicMock())
+        monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=set_defaults_spy))
+
+        for name, value in self._env_vars(tmp_path).items():
+            monkeypatch.setenv(name, value)
+
+        with (
+            patch("metrics.upload_artifacts.upload_to_blob_fallback") as fallback_spy,
+            patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True) as mlflow_spy,
+            patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})) as load_spy,
+        ):
+            main()
+
+            for spy in (set_defaults_spy, load_spy, mlflow_spy):
+                spy.assert_called_once()
+            # MLflow reported success, so the blob fallback must not have been touched.
+            fallback_spy.assert_not_called()
+
+    def test_mlflow_failure_triggers_blob_fallback(self, tmp_path: Path, monkeypatch) -> None:
+        monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock()))
+        monkeypatch.setitem(sys.modules, "training", MagicMock())
+
+        for name, value in self._env_vars(tmp_path).items():
+            monkeypatch.setenv(name, value)
+
+        with (
+            patch("metrics.upload_artifacts.upload_to_blob_fallback") as fallback_spy,
+            patch("metrics.upload_artifacts.upload_to_mlflow", return_value=False),
+            patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})),
+        ):
+            main()
+
+            fallback_spy.assert_called_once()
+            forwarded = fallback_spy.call_args.kwargs
+            assert forwarded["task"] == "pick_place"
+            assert forwarded["blob_account"] == "acct"
+            assert forwarded["blob_container"] == "ctr"
+
+    def test_env_defaults_passed_correctly(self, tmp_path: Path, monkeypatch) -> None:
+        set_defaults_spy = MagicMock()
+        monkeypatch.setitem(sys.modules, "training", MagicMock())
+        monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=set_defaults_spy))
+
+        for name, value in self._env_vars(tmp_path).items():
+            monkeypatch.setenv(name, value)
+
+        with (
+            patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True),
+            patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})),
+        ):
+            main()
+
+            passed_defaults = set_defaults_spy.call_args.args[0]  # first positional argument of the call
+            assert passed_defaults["TASK"] == "unknown"
+            assert passed_defaults["EXPORT_DIR"] == "/tmp/exported"
+            assert passed_defaults["NUM_ENVS"] == "4"
+
+    def test_onnx_and_jit_flags_parsed(self, tmp_path: Path, monkeypatch) -> None:
+        """``*_SUCCESS="1"`` flags arrive at ``upload_to_mlflow`` as ``True``, with the loaded metrics."""
+        monkeypatch.setitem(sys.modules, "training", MagicMock())
+        monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock()))
+
+        # Start from the baseline environment and force both export flags on.
+        env = {**self._env_vars(tmp_path), "ONNX_SUCCESS": "1", "JIT_SUCCESS": "1"}
+        for name, value in env.items():
+            monkeypatch.setenv(name, value)
+
+        with (
+            patch("metrics.upload_artifacts.load_metrics", return_value=({"a": 1}, {"b": 2})),
+            patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True) as mlflow_spy,
+        ):
+            main()
+
+            forwarded = mlflow_spy.call_args.kwargs
+            assert forwarded["onnx_success"] is True
+            assert forwarded["jit_success"] is True
+            assert forwarded["onnx_metrics"] == {"a": 1}
+            assert forwarded["jit_metrics"] == {"b": 2}
+
+    def test_optional_env_vars_default_to_empty(self, tmp_path: Path, monkeypatch) -> None:
+        """Unset optional variables must reach the blob fallback as empty strings."""
+        monkeypatch.setitem(sys.modules, "training", MagicMock())
+        monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock()))
+
+        # Strip the optional variables from the process environment and keep them out of
+        # the baseline, so ``main()`` has to supply its own fallback values for them.
+        optional = ("CHECKPOINT_URI", "BLOB_STORAGE_ACCOUNT", "BLOB_CONTAINER", "METRICS_DIR")
+        for key in optional:
+            monkeypatch.delenv(key, raising=False)
+        for key, val in self._env_vars(tmp_path).items():
+            if key not in optional:
+                monkeypatch.setenv(key, val)
+
+        with (
+            patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})),
+            patch("metrics.upload_artifacts.upload_to_mlflow", return_value=False),
+            patch("metrics.upload_artifacts.upload_to_blob_fallback") as mock_blob,
+        ):
+            main()
+
+            # Guard first: with no call, ``call_args`` is ``None`` and the lookup below raises TypeError.
+            mock_blob.assert_called_once()
+            call_kwargs = mock_blob.call_args[1]
+            assert call_kwargs["checkpoint_uri"] == ""
+            assert call_kwargs["blob_account"] == ""
+            assert call_kwargs["blob_container"] == ""
diff --git a/evaluation/uv.lock b/evaluation/uv.lock
index 6ef6c0f3..c21f2286 100644
--- a/evaluation/uv.lock
+++ b/evaluation/uv.lock
@@ -698,6 +698,90 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" },
 ]
 
+[[package]]
+name = "coverage"
+version = "7.13.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" },
+    { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159, upload-time = "2026-03-17T10:30:47.204Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270, upload-time = "2026-03-17T10:30:48.812Z" },
+    { url = "https://files.pythonhosted.org/packages/93/89/7ffc4ba0f5d0a55c1e84ea7cee39c9fc06af7b170513d83fbf3bbefce280/coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256", size = 257538, upload-time = "2026-03-17T10:30:50.77Z" },
+    { url = "https://files.pythonhosted.org/packages/81/bd/73ddf85f93f7e6fa83e77ccecb6162d9415c79007b4bc124008a4995e4a7/coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c", size = 251821, upload-time = "2026-03-17T10:30:52.5Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/81/278aff4e8dec4926a0bcb9486320752811f543a3ce5b602cc7a29978d073/coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5", size = 253191, upload-time = "2026-03-17T10:30:54.543Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ee/fe1621488e2e0a58d7e94c4800f0d96f79671553488d401a612bebae324b/coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09", size = 251337, upload-time = "2026-03-17T10:30:56.663Z" },
+    { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404, upload-time = "2026-03-17T10:30:58.427Z" },
+    { url = "https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903, upload-time = "2026-03-17T10:31:00.093Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780, upload-time = "2026-03-17T10:31:01.916Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093, upload-time = "2026-03-17T10:31:03.642Z" },
+    { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900, upload-time = "2026-03-17T10:31:05.651Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515, upload-time = "2026-03-17T10:31:07.293Z" },
+    { url = "https://files.pythonhosted.org/packages/74/8c/74fedc9663dcf168b0a059d4ea756ecae4da77a489048f94b5f512a8d0b3/coverage-7.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ec4af212df513e399cf11610cc27063f1586419e814755ab362e50a85ea69c1", size = 219576, upload-time = "2026-03-17T10:31:09.045Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/c9/44fb661c55062f0818a6ffd2685c67aa30816200d5f2817543717d4b92eb/coverage-7.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:941617e518602e2d64942c88ec8499f7fbd49d3f6c4327d3a71d43a1973032f3", size = 219942, upload-time = "2026-03-17T10:31:10.708Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/13/93419671cee82b780bab7ea96b67c8ef448f5f295f36bf5031154ec9a790/coverage-7.13.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:da305e9937617ee95c2e39d8ff9f040e0487cbf1ac174f777ed5eddd7a7c1f26", size = 250935, upload-time = "2026-03-17T10:31:12.392Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/68/1666e3a4462f8202d836920114fa7a5ee9275d1fa45366d336c551a162dd/coverage-7.13.5-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:78e696e1cc714e57e8b25760b33a8b1026b7048d270140d25dafe1b0a1ee05a3", size = 253541, upload-time = "2026-03-17T10:31:14.247Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/5e/3ee3b835647be646dcf3c65a7c6c18f87c27326a858f72ab22c12730773d/coverage-7.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02ca0eed225b2ff301c474aeeeae27d26e2537942aa0f87491d3e147e784a82b", size = 254780, upload-time = "2026-03-17T10:31:16.193Z" },
+    { url = "https://files.pythonhosted.org/packages/44/b3/cb5bd1a04cfcc49ede6cd8409d80bee17661167686741e041abc7ee1b9a9/coverage-7.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:04690832cbea4e4663d9149e05dba142546ca05cb1848816760e7f58285c970a", size = 256912, upload-time = "2026-03-17T10:31:17.89Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/66/c1dceb7b9714473800b075f5c8a84f4588f887a90eb8645282031676e242/coverage-7.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0590e44dd2745c696a778f7bab6aa95256de2cbc8b8cff4f7db8ff09813d6969", size = 251165, upload-time = "2026-03-17T10:31:19.605Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/62/5502b73b97aa2e53ea22a39cf8649ff44827bef76d90bf638777daa27a9d/coverage-7.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d7cfad2d6d81dd298ab6b89fe72c3b7b05ec7544bdda3b707ddaecff8d25c161", size = 252908, upload-time = "2026-03-17T10:31:21.312Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/37/7792c2d69854397ca77a55c4646e5897c467928b0e27f2d235d83b5d08c6/coverage-7.13.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e092b9499de38ae0fbfbc603a74660eb6ff3e869e507b50d85a13b6db9863e15", size = 250873, upload-time = "2026-03-17T10:31:23.565Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/23/bc866fb6163be52a8a9e5d708ba0d3b1283c12158cefca0a8bbb6e247a43/coverage-7.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:48c39bc4a04d983a54a705a6389512883d4a3b9862991b3617d547940e9f52b1", size = 255030, upload-time = "2026-03-17T10:31:25.58Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/8b/ef67e1c222ef49860701d346b8bbb70881bef283bd5f6cbba68a39a086c7/coverage-7.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2d3807015f138ffea1ed9afeeb8624fd781703f2858b62a8dd8da5a0994c57b6", size = 250694, upload-time = "2026-03-17T10:31:27.316Z" },
+    { url = "https://files.pythonhosted.org/packages/46/0d/866d1f74f0acddbb906db212e096dee77a8e2158ca5e6bb44729f9d93298/coverage-7.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee2aa19e03161671ec964004fb74b2257805d9710bf14a5c704558b9d8dbaf17", size = 252469, upload-time = "2026-03-17T10:31:29.472Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/f5/be742fec31118f02ce42b21c6af187ad6a344fed546b56ca60caacc6a9a0/coverage-7.13.5-cp313-cp313-win32.whl", hash = "sha256:ce1998c0483007608c8382f4ff50164bfc5bd07a2246dd272aa4043b75e61e85", size = 222112, upload-time = "2026-03-17T10:31:31.526Z" },
+    { url = "https://files.pythonhosted.org/packages/66/40/7732d648ab9d069a46e686043241f01206348e2bbf128daea85be4d6414b/coverage-7.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:631efb83f01569670a5e866ceb80fe483e7c159fac6f167e6571522636104a0b", size = 222923, upload-time = "2026-03-17T10:31:33.633Z" },
+    { url = "https://files.pythonhosted.org/packages/48/af/fea819c12a095781f6ccd504890aaddaf88b8fab263c4940e82c7b770124/coverage-7.13.5-cp313-cp313-win_arm64.whl", hash = "sha256:f4cd16206ad171cbc2470dbea9103cf9a7607d5fe8c242fdf1edf36174020664", size = 221540, upload-time = "2026-03-17T10:31:35.445Z" },
+    { url = "https://files.pythonhosted.org/packages/23/d2/17879af479df7fbbd44bd528a31692a48f6b25055d16482fdf5cdb633805/coverage-7.13.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0428cbef5783ad91fe240f673cc1f76b25e74bbfe1a13115e4aa30d3f538162d", size = 220262, upload-time = "2026-03-17T10:31:37.184Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/4c/d20e554f988c8f91d6a02c5118f9abbbf73a8768a3048cb4962230d5743f/coverage-7.13.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e0b216a19534b2427cc201a26c25da4a48633f29a487c61258643e89d28200c0", size = 220617, upload-time = "2026-03-17T10:31:39.245Z" },
+    { url = "https://files.pythonhosted.org/packages/29/9c/f9f5277b95184f764b24e7231e166dfdb5780a46d408a2ac665969416d61/coverage-7.13.5-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:972a9cd27894afe4bc2b1480107054e062df08e671df7c2f18c205e805ccd806", size = 261912, upload-time = "2026-03-17T10:31:41.324Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/f6/7f1ab39393eeb50cfe4747ae8ef0e4fc564b989225aa1152e13a180d74f8/coverage-7.13.5-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4b59148601efcd2bac8c4dbf1f0ad6391693ccf7a74b8205781751637076aee3", size = 263987, upload-time = "2026-03-17T10:31:43.724Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/d7/62c084fb489ed9c6fbdf57e006752e7c516ea46fd690e5ed8b8617c7d52e/coverage-7.13.5-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:505d7083c8b0c87a8fa8c07370c285847c1f77739b22e299ad75a6af6c32c5c9", size = 266416, upload-time = "2026-03-17T10:31:45.769Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/f6/df63d8660e1a0bff6125947afda112a0502736f470d62ca68b288ea762d8/coverage-7.13.5-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:60365289c3741e4db327e7baff2a4aaacf22f788e80fa4683393891b70a89fbd", size = 267558, upload-time = "2026-03-17T10:31:48.293Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/02/353ca81d36779bd108f6d384425f7139ac3c58c750dcfaafe5d0bee6436b/coverage-7.13.5-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b88c69c8ef5d4b6fe7dea66d6636056a0f6a7527c440e890cf9259011f5e606", size = 261163, upload-time = "2026-03-17T10:31:50.125Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/16/2e79106d5749bcaf3aee6d309123548e3276517cd7851faa8da213bc61bf/coverage-7.13.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5b13955d31d1633cf9376908089b7cebe7d15ddad7aeaabcbe969a595a97e95e", size = 263981, upload-time = "2026-03-17T10:31:51.961Z" },
+    { url = "https://files.pythonhosted.org/packages/29/c7/c29e0c59ffa6942030ae6f50b88ae49988e7e8da06de7ecdbf49c6d4feae/coverage-7.13.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f70c9ab2595c56f81a89620e22899eea8b212a4041bd728ac6f4a28bf5d3ddd0", size = 261604, upload-time = "2026-03-17T10:31:53.872Z" },
+    { url = "https://files.pythonhosted.org/packages/40/48/097cdc3db342f34006a308ab41c3a7c11c3f0d84750d340f45d88a782e00/coverage-7.13.5-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:084b84a8c63e8d6fc7e3931b316a9bcafca1458d753c539db82d31ed20091a87", size = 265321, upload-time = "2026-03-17T10:31:55.997Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/1f/4994af354689e14fd03a75f8ec85a9a68d94e0188bbdab3fc1516b55e512/coverage-7.13.5-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad14385487393e386e2ea988b09d62dd42c397662ac2dabc3832d71253eee479", size = 260502, upload-time = "2026-03-17T10:31:58.308Z" },
+    { url = "https://files.pythonhosted.org/packages/22/c6/9bb9ef55903e628033560885f5c31aa227e46878118b63ab15dc7ba87797/coverage-7.13.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f2c47b36fe7709a6e83bfadf4eefb90bd25fbe4014d715224c4316f808e59a2", size = 262688, upload-time = "2026-03-17T10:32:00.141Z" },
+    { url = "https://files.pythonhosted.org/packages/14/4f/f5df9007e50b15e53e01edea486814783a7f019893733d9e4d6caad75557/coverage-7.13.5-cp313-cp313t-win32.whl", hash = "sha256:67e9bc5449801fad0e5dff329499fb090ba4c5800b86805c80617b4e29809b2a", size = 222788, upload-time = "2026-03-17T10:32:02.246Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/98/aa7fccaa97d0f3192bec013c4e6fd6d294a6ed44b640e6bb61f479e00ed5/coverage-7.13.5-cp313-cp313t-win_amd64.whl", hash = "sha256:da86cdcf10d2519e10cabb8ac2de03da1bcb6e4853790b7fbd48523332e3a819", size = 223851, upload-time = "2026-03-17T10:32:04.416Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/8b/e5c469f7352651e5f013198e9e21f97510b23de957dd06a84071683b4b60/coverage-7.13.5-cp313-cp313t-win_arm64.whl", hash = "sha256:0ecf12ecb326fe2c339d93fc131816f3a7367d223db37817208905c89bded911", size = 222104, upload-time = "2026-03-17T10:32:06.65Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/77/39703f0d1d4b478bfd30191d3c14f53caf596fac00efb3f8f6ee23646439/coverage-7.13.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fbabfaceaeb587e16f7008f7795cd80d20ec548dc7f94fbb0d4ec2e038ce563f", size = 219621, upload-time = "2026-03-17T10:32:08.589Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/3e/51dff36d99ae14639a133d9b164d63e628532e2974d8b1edb99dd1ebc733/coverage-7.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9bb2a28101a443669a423b665939381084412b81c3f8c0fcfbac57f4e30b5b8e", size = 219953, upload-time = "2026-03-17T10:32:10.507Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/6c/1f1917b01eb647c2f2adc9962bd66c79eb978951cab61bdc1acab3290c07/coverage-7.13.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bd3a2fbc1c6cccb3c5106140d87cc6a8715110373ef42b63cf5aea29df8c217a", size = 250992, upload-time = "2026-03-17T10:32:12.41Z" },
+    { url = "https://files.pythonhosted.org/packages/22/e5/06b1f88f42a5a99df42ce61208bdec3bddb3d261412874280a19796fc09c/coverage-7.13.5-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6c36ddb64ed9d7e496028d1d00dfec3e428e0aabf4006583bb1839958d280510", size = 253503, upload-time = "2026-03-17T10:32:14.449Z" },
+    { url = "https://files.pythonhosted.org/packages/80/28/2a148a51e5907e504fa7b85490277734e6771d8844ebcc48764a15e28155/coverage-7.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:380e8e9084d8eb38db3a9176a1a4f3c0082c3806fa0dc882d1d87abc3c789247", size = 254852, upload-time = "2026-03-17T10:32:16.56Z" },
+    { url = "https://files.pythonhosted.org/packages/61/77/50e8d3d85cc0b7ebe09f30f151d670e302c7ff4a1bf6243f71dd8b0981fa/coverage-7.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e808af52a0513762df4d945ea164a24b37f2f518cbe97e03deaa0ee66139b4d6", size = 257161, upload-time = "2026-03-17T10:32:19.004Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/c4/b5fd1d4b7bf8d0e75d997afd3925c59ba629fc8616f1b3aae7605132e256/coverage-7.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e301d30dd7e95ae068671d746ba8c34e945a82682e62918e41b2679acd2051a0", size = 251021, upload-time = "2026-03-17T10:32:21.344Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/66/6ea21f910e92d69ef0b1c3346ea5922a51bad4446c9126db2ae96ee24c4c/coverage-7.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:800bc829053c80d240a687ceeb927a94fd108bbdc68dfbe505d0d75ab578a882", size = 252858, upload-time = "2026-03-17T10:32:23.506Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/ea/879c83cb5d61aa2a35fb80e72715e92672daef8191b84911a643f533840c/coverage-7.13.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:0b67af5492adb31940ee418a5a655c28e48165da5afab8c7fa6fd72a142f8740", size = 250823, upload-time = "2026-03-17T10:32:25.516Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/fb/616d95d3adb88b9803b275580bdeee8bd1b69a886d057652521f83d7322f/coverage-7.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c9136ff29c3a91e25b1d1552b5308e53a1e0653a23e53b6366d7c2dcbbaf8a16", size = 255099, upload-time = "2026-03-17T10:32:27.944Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/93/25e6917c90ec1c9a56b0b26f6cad6408e5f13bb6b35d484a0d75c9cf000d/coverage-7.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:cff784eef7f0b8f6cb28804fbddcfa99f89efe4cc35fb5627e3ac58f91ed3ac0", size = 250638, upload-time = "2026-03-17T10:32:29.914Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/7b/dc1776b0464145a929deed214aef9fb1493f159b59ff3c7eeeedf91eddd0/coverage-7.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:68a4953be99b17ac3c23b6efbc8a38330d99680c9458927491d18700ef23ded0", size = 252295, upload-time = "2026-03-17T10:32:31.981Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/fb/99cbbc56a26e07762a2740713f3c8f9f3f3106e3a3dd8cc4474954bccd34/coverage-7.13.5-cp314-cp314-win32.whl", hash = "sha256:35a31f2b1578185fbe6aa2e74cea1b1d0bbf4c552774247d9160d29b80ed56cc", size = 222360, upload-time = "2026-03-17T10:32:34.233Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/b7/4758d4f73fb536347cc5e4ad63662f9d60ba9118cb6785e9616b2ce5d7fa/coverage-7.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:2aa055ae1857258f9e0045be26a6d62bdb47a72448b62d7b55f4820f361a2633", size = 223174, upload-time = "2026-03-17T10:32:36.369Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/f2/24d84e1dfe70f8ac9fdf30d338239860d0d1d5da0bda528959d0ebc9da28/coverage-7.13.5-cp314-cp314-win_arm64.whl", hash = "sha256:1b11eef33edeae9d142f9b4358edb76273b3bfd30bc3df9a4f95d0e49caf94e8", size = 221739, upload-time = "2026-03-17T10:32:38.736Z" },
+    { url = "https://files.pythonhosted.org/packages/60/5b/4a168591057b3668c2428bff25dd3ebc21b629d666d90bcdfa0217940e84/coverage-7.13.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10a0c37f0b646eaff7cce1874c31d1f1ccb297688d4c747291f4f4c70741cc8b", size = 220351, upload-time = "2026-03-17T10:32:41.196Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/21/1fd5c4dbfe4a58b6b99649125635df46decdfd4a784c3cd6d410d303e370/coverage-7.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b5db73ba3c41c7008037fa731ad5459fc3944cb7452fc0aa9f822ad3533c583c", size = 220612, upload-time = "2026-03-17T10:32:43.204Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/fe/2a924b3055a5e7e4512655a9d4609781b0d62334fa0140c3e742926834e2/coverage-7.13.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:750db93a81e3e5a9831b534be7b1229df848b2e125a604fe6651e48aa070e5f9", size = 261985, upload-time = "2026-03-17T10:32:45.514Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/0d/c8928f2bd518c45990fe1a2ab8db42e914ef9b726c975facc4282578c3eb/coverage-7.13.5-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9ddb4f4a5479f2539644be484da179b653273bca1a323947d48ab107b3ed1f29", size = 264107, upload-time = "2026-03-17T10:32:47.971Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/ae/4ae35bbd9a0af9d820362751f0766582833c211224b38665c0f8de3d487f/coverage-7.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8a7a2049c14f413163e2bdabd37e41179b1d1ccb10ffc6ccc4b7a718429c607", size = 266513, upload-time = "2026-03-17T10:32:50.1Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/20/d326174c55af36f74eac6ae781612d9492f060ce8244b570bb9d50d9d609/coverage-7.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1c85e0b6c05c592ea6d8768a66a254bfb3874b53774b12d4c89c481eb78cb90", size = 267650, upload-time = "2026-03-17T10:32:52.391Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/5e/31484d62cbd0eabd3412e30d74386ece4a0837d4f6c3040a653878bfc019/coverage-7.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:777c4d1eff1b67876139d24288aaf1817f6c03d6bae9c5cc8d27b83bcfe38fe3", size = 261089, upload-time = "2026-03-17T10:32:54.544Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/d8/49a72d6de146eebb0b7e48cc0f4bc2c0dd858e3d4790ab2b39a2872b62bd/coverage-7.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6697e29b93707167687543480a40f0db8f356e86d9f67ddf2e37e2dfd91a9dab", size = 263982, upload-time = "2026-03-17T10:32:56.803Z" },
+    { url = "https://files.pythonhosted.org/packages/06/3b/0351f1bd566e6e4dd39e978efe7958bde1d32f879e85589de147654f57bb/coverage-7.13.5-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8fdf453a942c3e4d99bd80088141c4c6960bb232c409d9c3558e2dbaa3998562", size = 261579, upload-time = "2026-03-17T10:32:59.466Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/ce/796a2a2f4017f554d7810f5c573449b35b1e46788424a548d4d19201b222/coverage-7.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:32ca0c0114c9834a43f045a87dcebd69d108d8ffb666957ea65aa132f50332e2", size = 265316, upload-time = "2026-03-17T10:33:01.847Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/16/d5ae91455541d1a78bc90abf495be600588aff8f6db5c8b0dae739fa39c9/coverage-7.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:8769751c10f339021e2638cd354e13adeac54004d1941119b2c96fe5276d45ea", size = 260427, upload-time = "2026-03-17T10:33:03.945Z" },
+    { url = "https://files.pythonhosted.org/packages/48/11/07f413dba62db21fb3fad5d0de013a50e073cc4e2dc4306e770360f6dfc8/coverage-7.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cec2d83125531bd153175354055cdb7a09987af08a9430bd173c937c6d0fba2a", size = 262745, upload-time = "2026-03-17T10:33:06.285Z" },
+    { url = "https://files.pythonhosted.org/packages/91/15/d792371332eb4663115becf4bad47e047d16234b1aff687b1b18c58d60ae/coverage-7.13.5-cp314-cp314t-win32.whl", hash = "sha256:0cd9ed7a8b181775459296e402ca4fb27db1279740a24e93b3b41942ebe4b215", size = 223146, upload-time = "2026-03-17T10:33:08.756Z" },
+    { url = "https://files.pythonhosted.org/packages/db/51/37221f59a111dca5e85be7dbf09696323b5b9f13ff65e0641d535ed06ea8/coverage-7.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:301e3b7dfefecaca37c9f1aa6f0049b7d4ab8dd933742b607765d757aca77d43", size = 224254, upload-time = "2026-03-17T10:33:11.174Z" },
+    { url = "https://files.pythonhosted.org/packages/54/83/6acacc889de8987441aa7d5adfbdbf33d288dad28704a67e574f1df9bcbb/coverage-7.13.5-cp314-cp314t-win_arm64.whl", hash = "sha256:9dacc2ad679b292709e0f5fc1ac74a6d4d5562e424058962c7bb0c658ad25e45", size = 222276, upload-time = "2026-03-17T10:33:13.466Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" },
+]
+
 [[package]]
 name = "cryptography"
 version = "46.0.6"
@@ -1360,6 +1444,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5e/c9/4c1e1216b24bcab140c83acdf8bc89a846ea17cd8a06cd18e3fd308a297f/huggingface_hub-1.10.2-py3-none-any.whl", hash = "sha256:c26c908767cc711493978dc0b4f5747ba7841602997cc98bfd628450a28cf9bc", size = 642581, upload-time = "2026-04-14T10:42:26.563Z" },
 ]
 
+[[package]]
+name = "hypothesis"
+version = "6.151.13"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "sortedcontainers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/39/da/d9cc191b2fd31a138e9e803c967ec59496e991290d1c986cb74963e577d0/hypothesis-6.151.13.tar.gz", hash = "sha256:ca85e59454d7f36276a7ee99c775acd95e56495d4028b01e5b606a316771890c", size = 463886, upload-time = "2026-04-13T06:32:48.382Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/43/4d/06c2149d3aa1a0877db55f5dabb0070e046ac0a4b3795397d7c6477e0789/hypothesis-6.151.13-py3-none-any.whl", hash = "sha256:642508683cd59f2b0cd049bbee5029a61104f69621e2652bd2a894221ee424a9", size = 529610, upload-time = "2026-04-13T06:32:46.83Z" },
+]
+
 [[package]]
 name = "idna"
 version = "3.11"
@@ -1414,6 +1510,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
 [[package]]
 name = "isodate"
 version = "0.7.2"
@@ -2777,6 +2882,17 @@ dependencies = [
     { name = "torch" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "hypothesis" },
+    { name = "matplotlib" },
+    { name = "numpy" },
+    { name = "pytest" },
+    { name = "pytest-cov" },
+    { name = "pytest-mock" },
+    { name = "torch" },
+]
+
 [package.metadata]
 requires-dist = [
     { name = "azure-ai-ml", specifier = "==1.32.0" },
@@ -2800,6 +2916,17 @@ requires-dist = [
     { name = "torch", specifier = "==2.10.0" },
 ]
 
+[package.metadata.requires-dev]
+dev = [
+    { name = "hypothesis", specifier = "==6.151.13" },
+    { name = "matplotlib", specifier = "==3.10.8" },
+    { name = "numpy", specifier = "==2.2.6" },
+    { name = "pytest", specifier = "==9.0.3" },
+    { name = "pytest-cov", specifier = "==7.1.0" },
+    { name = "pytest-mock", specifier = "==3.15.1" },
+    { name = "torch", specifier = "==2.10.0" },
+]
+
 [[package]]
 name = "pillow"
 version = "12.2.0"
@@ -2878,6 +3005,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
 [[package]]
 name = "prettytable"
 version = "3.17.0"
@@ -3349,6 +3485,48 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/07/bc/587a445451b253b285629263eb51c2d8e9bcea4fc97826266d186f96f558/pyserial-3.5-py2.py3-none-any.whl", hash = "sha256:c4451db6ba391ca6ca299fb3ec7bae67a5c55dde170964c7a14ceefec02f2cf0", size = 90585, upload-time = "2020-11-23T03:59:13.41Z" },
 ]
 
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+]
+
+[[package]]
+name = "pytest-cov"
+version = "7.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "coverage" },
+    { name = "pluggy" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" },
+]
+
+[[package]]
+name = "pytest-mock"
+version = "3.15.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/68/14/eb014d26be205d38ad5ad20d9a80f7d201472e08167f0bb4361e251084a9/pytest_mock-3.15.1.tar.gz", hash = "sha256:1849a238f6f396da19762269de72cb1814ab44416fa73a8686deac10b0d87a0f", size = 34036, upload-time = "2025-09-16T16:37:27.081Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/cc/06253936f4a7fa2e0f48dfe6d851d9c56df896a9ab09ac019d70b760619c/pytest_mock-3.15.1-py3-none-any.whl", hash = "sha256:0a25e2eb88fe5168d535041d09a4529a188176ae608a6d249ee65abc0949630d", size = 10095, upload-time = "2025-09-16T16:37:25.734Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -3909,6 +4087,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c1/d4/59e74daffcb57a07668852eeeb6035af9f32cbfd7a1d2511f17d2fe6a738/smmap-5.0.3-py3-none-any.whl", hash = "sha256:c106e05d5a61449cf6ba9a1e650227ecfb141590d2a98412103ff35d89fc7b2f", size = 24390, upload-time = "2026-03-09T03:43:24.361Z" },
 ]
 
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.49"
diff --git a/package.json b/package.json
index 10808e9c..266bedca 100644
--- a/package.json
+++ b/package.json
@@ -40,7 +40,7 @@
     "markdownlint-cli2": "0.22.0"
   },
   "overrides": {
-    "smol-toml": ">=1.6.1"
+    "smol-toml": "1.6.1"
   },
   "repository": {
     "type": "git",