diff --git a/.cspell.json b/.cspell.json index e74e589b..128c516c 100644 --- a/.cspell.json +++ b/.cspell.json @@ -6,6 +6,15 @@ "flagWords": [ "TODO" ], + "words": [ + "envaccount", + "envcontainer", + "myacct", + "mycontainer", + "noseparator", + "preds", + "xticklabels" + ], "ignorePaths": [ "**/.github/chatmodes/**", "**/node_modules/**", diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt index 5b92a679..c3c80224 100644 --- a/.cspell/general-technical.txt +++ b/.cspell/general-technical.txt @@ -593,6 +593,7 @@ interoperates interpretability interpretml interzone +intoto intracloud intrazone intune @@ -2025,3 +2026,6 @@ WEBADM WEBACCESS vsmagent vsmserver +envaccount +envcontainer +mycontainer diff --git a/.github/workflows/check-binary-integrity.yml b/.github/workflows/check-binary-integrity.yml index 26f70da2..e2a2aff3 100644 --- a/.github/workflows/check-binary-integrity.yml +++ b/.github/workflows/check-binary-integrity.yml @@ -8,7 +8,6 @@ on: permissions: contents: read - security-events: write concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/.github/workflows/evaluation-pytests.yml b/.github/workflows/evaluation-pytests.yml new file mode 100644 index 00000000..297af9a3 --- /dev/null +++ b/.github/workflows/evaluation-pytests.yml @@ -0,0 +1,62 @@ +name: Evaluation Pytest Tests + +on: + workflow_call: + inputs: + code-coverage: + description: 'Enable Codecov coverage upload' + required: false + default: false + type: boolean + +permissions: + contents: read + +jobs: + pytest: + name: Evaluation Pytest + runs-on: ubuntu-latest + defaults: + run: + working-directory: evaluation + permissions: + contents: read + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Setup Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.12' + + - 
name: Setup uv + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + + - name: Install dependencies + run: uv sync --only-group dev + + - name: Run pytest + run: uv run --only-group dev pytest -v + + - name: Upload coverage.xml artifact + if: ${{ inputs.code-coverage && !cancelled() }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: evaluation-pytest-coverage-xml + path: evaluation/logs/coverage.xml + retention-days: 30 + + - name: Upload coverage to Codecov + if: ${{ inputs.code-coverage && !cancelled() }} + uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v6.0.0 + with: + files: evaluation/logs/coverage.xml + use_oidc: true + fail_ci_if_error: false + verbose: true + flags: pytest-evaluation + name: evaluation-pytest-coverage diff --git a/.github/workflows/pr-validation.yml b/.github/workflows/pr-validation.yml index a3ccc2cb..c8cce712 100644 --- a/.github/workflows/pr-validation.yml +++ b/.github/workflows/pr-validation.yml @@ -156,6 +156,16 @@ jobs: contents: read id-token: write + # Evaluation domain pytest execution + evaluation-pytests: + name: Evaluation Pytest + uses: ./.github/workflows/evaluation-pytests.yml + with: + code-coverage: true + permissions: + contents: read + id-token: write + # Fuzz regression via deterministic corpus-based tests fuzz-regression-tests: name: Fuzz Regression Tests @@ -254,3 +264,54 @@ jobs: contents: read security-events: write actions: read + + # Aggregator status check required by branch protection. + # Reports success only when every upstream PR validation job succeeded or was skipped. 
+ pr-validation-summary: + name: pr-validation-summary + if: ${{ always() }} + runs-on: ubuntu-latest + needs: + - spell-check + - markdown-lint + - table-format + - frontmatter-validation + - msdate-freshness + - psscriptanalyzer + - yaml-lint + - link-lang-check + - markdown-link-check + - dependency-review + - dependency-pinning + - pester-tests + - dataviewer-frontend-tests + - docusaurus-tests + - pytest-tests + - dataviewer-backend-pytests + - evaluation-pytests + - fuzz-regression-tests + - python-lint + - terraform-lint + - terraform-validation + - terraform-tests + - go-lint + - terraform-docs-check + - go-tests + - shellcheck + - codeql-analysis + permissions: + contents: read + steps: + - name: Verify all upstream jobs succeeded + env: + NEEDS_JSON: ${{ toJSON(needs) }} + run: | + echo "Upstream job results:" + echo "$NEEDS_JSON" + failed=$(echo "$NEEDS_JSON" | jq -r 'to_entries[] | select(.value.result != "success" and .value.result != "skipped") | .key') + if [ -n "$failed" ]; then + echo "::error::One or more upstream PR validation jobs did not succeed:" + echo "$failed" + exit 1 + fi + echo "All upstream PR validation jobs succeeded or were skipped." 
diff --git a/codecov.yml b/codecov.yml index c11f6f3d..e784a035 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,6 +5,7 @@ # pester — PowerShell tests covering scripts/ # pytest — Python tests covering training/ # pytest-dataviewer — Dataviewer backend tests covering data-management/viewer/backend/src/ +# pytest-evaluation — Evaluation tests covering evaluation/ # pytest-fuzz — Python fuzz regression tests covering tests/, backend, and training # terraform — Terraform tests covering infrastructure/terraform/ # vitest — Vitest tests covering data-management/viewer/frontend/src/ @@ -57,6 +58,11 @@ coverage: - pytest-fuzz target: auto threshold: 1% + pytest-evaluation: + flags: + - pytest-evaluation + target: auto + threshold: 1% patch: default: target: auto @@ -89,6 +95,10 @@ coverage: flags: - pytest-fuzz informational: true + pytest-evaluation: + flags: + - pytest-evaluation + informational: true flags: go: @@ -121,6 +131,10 @@ flags: - data-management/viewer/backend/src/ - training/ carryforward: true + pytest-evaluation: + paths: + - evaluation/ + carryforward: true parsers: jacoco: diff --git a/data-management/viewer/backend/Dockerfile b/data-management/viewer/backend/Dockerfile index 1b4c1021..47846b47 100644 --- a/data-management/viewer/backend/Dockerfile +++ b/data-management/viewer/backend/Dockerfile @@ -8,8 +8,26 @@ WORKDIR /app RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \ && rm -rf /var/lib/apt/lists/* -# Install uv -RUN pip install --no-cache-dir uv==0.10.9 +# Install uv (pinned by hash for OSSF Scorecard Pinned-Dependencies; covers all uv 0.10.9 wheels) +RUN pip install --no-cache-dir --require-hashes uv==0.10.9 \ + --hash=sha256:0649f83fa0f44f18627c00b2a9a60e5c3486a34799b2c874f2b3945b76048a67 \ + --hash=sha256:880dd4cffe4bd184e8871ddf4c7d3c3b042e1f16d2682310644aa8d61eaea3e6 \ + --hash=sha256:a7a784254380552398a6baf4149faf5b31a4003275f685c28421cf8197178a08 \ + 
--hash=sha256:5ea0e8598fa012cfa4480ecad4d112bc70f514157c3cc1555a7611c7b6b1ab0a \ + --hash=sha256:2d6b5367e9bf87eca51c0f2ecda26a1ff931e41409977b4f0a420de2f3e617cf \ + --hash=sha256:bd04e34db27f9a1d5a0871980edc9f910bb11afbc4abca8234d5a363cbe63c04 \ + --hash=sha256:547deb57311fc64e4a6b8336228fca4cb4dcbeabdc6e85f14f7804dcd0bc8cd2 \ + --hash=sha256:e0091b6d0b666640d7407a433860184f77667077b73564e86d49c2a851f073a8 \ + --hash=sha256:81b2286e6fd869e3507971f39d14829c03e2e31caa8ecc6347b0ffacabb95a5b \ + --hash=sha256:7c9d6deb30edbc22123be75479f99fb476613eaf38a8034c0e98bba24a344179 \ + --hash=sha256:24b1ce6d626e06c4582946b6af07b08a032fcccd81fe54c3db3ed2d1c63a97dc \ + --hash=sha256:fa3401780273d96a2960dbeab58452ce1b387ad8c5da25be6221c0188519e21d \ + --hash=sha256:8f94a31832d2b4c565312ea17a71b8dd2f971e5aa570c5b796a27b2c9fcdb163 \ + --hash=sha256:842c39c19d9072f1ad53c71bb4ecd1c9caa311d5de9d19e09a636274a6c95e2e \ + --hash=sha256:ed44047c602449916ba18a8596715ef7edbbd00859f3db9eac010dc62a0edd30 \ + --hash=sha256:af79552276d8bd622048ab2d67ec22120a6af64d83963c46b1482218c27b571f \ + --hash=sha256:47e18a0521d76293d4f60d129f520b18bddf1976b4a47b50f0fcb04fb6a9d40f \ + --hash=sha256:31e76ae92e70fec47c3efab0c8094035ad7a578454482415b496fa39fc4d685c # Copy dependency manifests COPY pyproject.toml uv.lock ./ diff --git a/docs/security/README.md b/docs/security/README.md index da7485c3..7951a55e 100644 --- a/docs/security/README.md +++ b/docs/security/README.md @@ -24,6 +24,7 @@ Security documentation for the Physical AI Toolchain covering threat analysis, d | [Threat Model](threat-model.md) | STRIDE-based threat analysis and remediation roadmap | | [Deployment Security Guide](../operations/security-guide.md) | Security configuration inventory and deployment responsibilities | | [Release Verification](release-verification.md) | Verify release artifact provenance and SBOM attestations | +| [Workflow Permissions](workflow-permissions.md) | GitHub Actions permission scopes and OSSF Scorecard 
exceptions | | [SECURITY.md](https://github.com/microsoft/physical-ai-toolchain/blob/main/SECURITY.md) | Vulnerability disclosure and reporting process | ## 🔒 Security Posture diff --git a/docs/security/workflow-permissions.md b/docs/security/workflow-permissions.md new file mode 100644 index 00000000..0a7b9c2d --- /dev/null +++ b/docs/security/workflow-permissions.md @@ -0,0 +1,69 @@ +--- +sidebar_position: 4 +title: Workflow Permissions +description: GitHub Actions permission scopes and OSSF Scorecard Token-Permissions exception rationale +author: Microsoft Robotics-AI Team +ms.date: 2026-02-22 +ms.topic: reference +keywords: + - security + - github-actions + - permissions + - ossf-scorecard + - token-permissions +--- + +## 📋 Overview + +All GitHub Actions workflows in this repository follow the [OpenSSF Scorecard Token-Permissions](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions) principle: + +- Top-level `permissions:` is `contents: read` (read-only by default). +- Write-scoped permissions are declared at the **job level** only when a specific step requires them. +- No workflow grants `permissions: write-all` or omits an explicit top-level `permissions:` block. + +This document enumerates every job-scoped write permission across `.github/workflows/`, with the exception of `id-token: write` (used solely to mint OIDC tokens and not tabulated below; see Defense in Depth), and records the justification so security auditors and Scorecard reviewers can verify each exception. + +## 🔒 Job-Scoped Write Permissions + +The 15 write permissions below are required by the action or CLI invoked in the corresponding job. Each grant is the minimum scope needed. 
+ +| Workflow | Job | Permission | Rationale | +|-----------------------------------|--------------------------------|---------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| `check-binary-integrity.yml` | `check-binary-integrity` | `security-events: write` | Required by `github/codeql-action/upload-sarif` to publish binary integrity findings to the Security tab. | +| `codeql-analysis.yml` | `analyze` | `security-events: write` | Required by `github/codeql-action/analyze` to upload CodeQL SARIF results to the Security tab. | +| `dast-zap-scan.yml` | `dast-zap-scan` | `security-events: write` | Required by `github/codeql-action/upload-sarif` to publish ZAP DAST findings to the Security tab. | +| `dependency-pinning-scan.yml` | `dependency-pinning-scan` | `security-events: write` | Required by `github/codeql-action/upload-sarif` to publish SHA-pinning findings to the Security tab. | +| `gitleaks-scan.yml` | `scan` | `security-events: write` | Required by `github/codeql-action/upload-sarif` to publish secret-scanning findings to the Security tab. | +| `main.yml` | `dependency-pinning` | `security-events: write` | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload. | +| `main.yml` | `codeql-analysis` | `security-events: write` | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload. | +| `main.yml` | `generate-dependency-sbom` | `contents: write` | Required by `gh release upload "${TAG}" dependencies.spdx.json --clobber` to attach the dependency SBOM to the release. | +| `main.yml` | `attest-release` | `attestations: write` | Required by `actions/attest-build-provenance` and `actions/attest` to create Sigstore provenance attestations. | +| `main.yml` | `attest-release` | `contents: write` | Required by `gh release upload` to attach `*.sigstore.json` and `*.intoto.jsonl` attestation artifacts to the release. 
| +| `main.yml` | `sbom-diff` | `contents: write` | Required by `gh release upload "${TAG}" dependency-diff.md --clobber` to attach the dependency-change report to the release. | +| `main.yml` | `append-verification-notes` | `contents: write` | Required by `gh release edit` to append artifact-verification instructions to the release body. | +| `pr-validation.yml` | `dependency-pinning` | `security-events: write` | Inherited by reusable `dependency-pinning-scan.yml`; required for SARIF upload. | +| `pr-validation.yml` | `codeql-analysis` | `security-events: write` | Inherited by reusable `codeql-analysis.yml`; required for SARIF upload. | +| `scorecard.yml` | `analysis` | `security-events: write` | Required by `github/codeql-action/upload-sarif` to publish OpenSSF Scorecard findings to the Security tab. | + +## 🛡️ Defense in Depth + +The release-publishing path uses additional hardening beyond minimum permissions: + +- All actions are SHA-pinned (no floating tags). +- `persist-credentials: false` on every `actions/checkout` invocation. +- `id-token: write` is granted only to jobs that mint Sigstore OIDC tokens; the token is never exposed to user-controlled steps. +- Release-gated jobs (`generate-dependency-sbom`, `attest-release`, `sbom-diff`, `append-verification-notes`) run only when `release-please` produces a release (`needs.release-please.outputs.release_created == 'true'`). 
+ +## 🔗 Related Resources + +- [OpenSSF Scorecard Token-Permissions check](https://github.com/ossf/scorecard/blob/main/docs/checks.md#token-permissions) +- [GitHub Actions: Assigning permissions to jobs](https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs) +- [Release Verification](release-verification.md) +- [Threat Model](threat-model.md) + + + + +*🤖 Crafted with precision by ✨Copilot following brilliant human instruction, +then carefully refined by our team of discerning human reviewers.* + diff --git a/evaluation/metrics/bootstrap_mlflow.py b/evaluation/metrics/bootstrap_mlflow.py index 5e19bab8..2374cbe6 100644 --- a/evaluation/metrics/bootstrap_mlflow.py +++ b/evaluation/metrics/bootstrap_mlflow.py @@ -2,6 +2,7 @@ import os import sys +from pathlib import Path import mlflow from azure.ai.ml import MLClient @@ -29,7 +30,8 @@ experiment_name = f"lerobot-{os.environ.get('POLICY_TYPE', 'act')}-inference" mlflow.set_experiment(experiment_name) -with open("/tmp/mlflow_config.env", "w") as f: +config_path = Path(os.environ.get("MLFLOW_CONFIG_PATH", "/tmp/mlflow_config.env")) +with config_path.open("w") as f: f.write(f"MLFLOW_TRACKING_URI={tracking_uri}\n") f.write(f"MLFLOW_EXPERIMENT_NAME={experiment_name}\n") diff --git a/evaluation/metrics/upload_artifacts.py b/evaluation/metrics/upload_artifacts.py index a2c9c00a..0f00d982 100644 --- a/evaluation/metrics/upload_artifacts.py +++ b/evaluation/metrics/upload_artifacts.py @@ -7,7 +7,7 @@ import json import os import traceback -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path from urllib.parse import urlparse @@ -286,7 +286,7 @@ def main() -> None: blob_container = os.environ.get("BLOB_CONTAINER", "") onnx_success = os.environ["ONNX_SUCCESS"] == "1" jit_success = os.environ["JIT_SUCCESS"] == "1" - timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") onnx_metrics, jit_metrics = 
load_metrics(metrics_dir) diff --git a/evaluation/pyproject.toml b/evaluation/pyproject.toml index 2f9ea0dc..b7e0b6d8 100644 --- a/evaluation/pyproject.toml +++ b/evaluation/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "azure-ai-ml==1.32.0", "marshmallow==3.26.2", "mlflow==3.11.1", - "packaging==26.1", + "packaging==25.0", "psutil==7.2.2", "pynvml==13.0.1", "pyperclip==1.11.0", @@ -31,3 +31,55 @@ build-backend = "hatchling.build" [tool.uv] package = false + +[dependency-groups] +dev = [ + "pytest==9.0.3", + "pytest-mock==3.15.1", + "pytest-cov==7.1.0", + "hypothesis==6.151.13", + "matplotlib==3.10.8", + "numpy==2.2.6", + "torch==2.10.0", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["."] +addopts = [ + "-ra", + "--strict-markers", + "--strict-config", + "--cov=sil", + "--cov=metrics", + "--cov-report=term-missing", + "--cov-report=xml:logs/coverage.xml", + "--junitxml=logs/pytest-results.xml", +] +markers = [ + "e2e: marks tests requiring Isaac Sim or GPU hardware (deselect: -m 'not e2e')", +] + +[tool.coverage.run] +source = ["sil", "metrics"] +branch = true +omit = [ + "**/conftest.py", + "**/__init__.py", + "sil/monitor_checkpoints.py", + "sil/play.py", + "sil/play_policy.py", +] + +[tool.coverage.report] +show_missing = true +precision = 2 +exclude_lines = [ + "pragma: no cover", + "if __name__ == .__main__.", + "if TYPE_CHECKING:", + "raise NotImplementedError", +] + +[tool.coverage.xml] +output = "logs/coverage.xml" diff --git a/evaluation/sil/docker/Dockerfile.lerobot-eval b/evaluation/sil/docker/Dockerfile.lerobot-eval index fb468076..48daf584 100644 --- a/evaluation/sil/docker/Dockerfile.lerobot-eval +++ b/evaluation/sil/docker/Dockerfile.lerobot-eval @@ -5,7 +5,26 @@ RUN apt-get update -qq && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY requirements-lerobot-eval.txt . 
-RUN pip install --no-cache-dir "uv==0.10.9" && \ +# Install uv (pinned by hash for OSSF Scorecard Pinned-Dependencies; covers all uv 0.10.9 wheels) +RUN pip install --no-cache-dir --require-hashes "uv==0.10.9" \ + --hash=sha256:0649f83fa0f44f18627c00b2a9a60e5c3486a34799b2c874f2b3945b76048a67 \ + --hash=sha256:880dd4cffe4bd184e8871ddf4c7d3c3b042e1f16d2682310644aa8d61eaea3e6 \ + --hash=sha256:a7a784254380552398a6baf4149faf5b31a4003275f685c28421cf8197178a08 \ + --hash=sha256:5ea0e8598fa012cfa4480ecad4d112bc70f514157c3cc1555a7611c7b6b1ab0a \ + --hash=sha256:2d6b5367e9bf87eca51c0f2ecda26a1ff931e41409977b4f0a420de2f3e617cf \ + --hash=sha256:bd04e34db27f9a1d5a0871980edc9f910bb11afbc4abca8234d5a363cbe63c04 \ + --hash=sha256:547deb57311fc64e4a6b8336228fca4cb4dcbeabdc6e85f14f7804dcd0bc8cd2 \ + --hash=sha256:e0091b6d0b666640d7407a433860184f77667077b73564e86d49c2a851f073a8 \ + --hash=sha256:81b2286e6fd869e3507971f39d14829c03e2e31caa8ecc6347b0ffacabb95a5b \ + --hash=sha256:7c9d6deb30edbc22123be75479f99fb476613eaf38a8034c0e98bba24a344179 \ + --hash=sha256:24b1ce6d626e06c4582946b6af07b08a032fcccd81fe54c3db3ed2d1c63a97dc \ + --hash=sha256:fa3401780273d96a2960dbeab58452ce1b387ad8c5da25be6221c0188519e21d \ + --hash=sha256:8f94a31832d2b4c565312ea17a71b8dd2f971e5aa570c5b796a27b2c9fcdb163 \ + --hash=sha256:842c39c19d9072f1ad53c71bb4ecd1c9caa311d5de9d19e09a636274a6c95e2e \ + --hash=sha256:ed44047c602449916ba18a8596715ef7edbbd00859f3db9eac010dc62a0edd30 \ + --hash=sha256:af79552276d8bd622048ab2d67ec22120a6af64d83963c46b1482218c27b571f \ + --hash=sha256:47e18a0521d76293d4f60d129f520b18bddf1976b4a47b50f0fcb04fb6a9d40f \ + --hash=sha256:31e76ae92e70fec47c3efab0c8094035ad7a578454482415b496fa39fc4d685c && \ uv pip install -r requirements-lerobot-eval.txt --system # Pre-download ResNet18 backbone weights (required by ACT policy, no internet at runtime) diff --git a/evaluation/sil/scripts/download_aml_model.py b/evaluation/sil/scripts/download_aml_model.py index 334bc084..2834e14d 100644 --- 
a/evaluation/sil/scripts/download_aml_model.py +++ b/evaluation/sil/scripts/download_aml_model.py @@ -16,7 +16,7 @@ model_name = os.environ["AML_MODEL_NAME"] model_version = os.environ["AML_MODEL_VERSION"] -download_dir = Path("/tmp/aml-model") +download_dir = Path(os.environ.get("AML_DOWNLOAD_DIR", "/tmp/aml-model")) download_dir.mkdir(parents=True, exist_ok=True) print(f"Downloading {model_name}:{model_version}...") @@ -31,7 +31,8 @@ model_path = candidate break -with open("/tmp/aml_model_path.env", "w") as f: +config_path = Path(os.environ.get("AML_CONFIG_PATH", "/tmp/aml_model_path.env")) +with config_path.open("w") as f: f.write(f"AML_MODEL_PATH={model_path}\n") print(f"Model downloaded to: {model_path}") diff --git a/evaluation/sil/scripts/download_blob_dataset.py b/evaluation/sil/scripts/download_blob_dataset.py index 1eb4d372..63048517 100644 --- a/evaluation/sil/scripts/download_blob_dataset.py +++ b/evaluation/sil/scripts/download_blob_dataset.py @@ -9,7 +9,8 @@ account = os.environ["BLOB_STORAGE_ACCOUNT"] container = os.environ.get("BLOB_STORAGE_CONTAINER", "datasets") prefix = os.environ["BLOB_PREFIX"] -local_root = Path("/workspace/data") / prefix.replace("/", "_") +data_root = Path(os.environ.get("DATA_ROOT", "/workspace/data")) +local_root = data_root / prefix.replace("/", "_") local_root.mkdir(parents=True, exist_ok=True) credential = DefaultAzureCredential() @@ -28,7 +29,8 @@ with open(local_path, "wb") as f: f.write(client.download_blob(blob.name).readall()) -with open("/tmp/dataset_path.env", "w") as f: +config_path = Path(os.environ.get("DATASET_CONFIG_PATH", "/tmp/dataset_path.env")) +with config_path.open("w") as f: f.write(f"DATASET_DIR={local_root}\n") print(f"Dataset downloaded to: {local_root}") diff --git a/evaluation/tests/conftest.py b/evaluation/tests/conftest.py new file mode 100644 index 00000000..d67ebd6b --- /dev/null +++ b/evaluation/tests/conftest.py @@ -0,0 +1,77 @@ +"""Shared fixtures for evaluation tests.""" + +from __future__ 
import annotations + +import sys +from unittest.mock import MagicMock + +# Stub the cross-package training.rl.simulation_shutdown import used by sil.policy_evaluation +# so test collection does not require the training package on PYTHONPATH. +if "training" not in sys.modules: + _training = MagicMock() + _training.rl = MagicMock() + _training.rl.simulation_shutdown = MagicMock() + _training.rl.simulation_shutdown.prepare_for_shutdown = MagicMock() + sys.modules["training"] = _training + sys.modules["training.rl"] = _training.rl + sys.modules["training.rl.simulation_shutdown"] = _training.rl.simulation_shutdown + +import numpy as np +import pytest +from sil.robot_types import IMAGE_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_JOINTS + + +@pytest.fixture +def rng() -> np.random.Generator: + """Seeded random generator for reproducible tests.""" + return np.random.default_rng(42) + + +@pytest.fixture +def joint_positions() -> np.ndarray: + """Valid joint position array of shape ``(NUM_JOINTS,)``.""" + return np.zeros(NUM_JOINTS, dtype=np.float64) + + +@pytest.fixture +def random_joint_positions(rng: np.random.Generator) -> np.ndarray: + """Random joint positions in ``[-pi, pi]``.""" + return rng.uniform(-np.pi, np.pi, size=(NUM_JOINTS,)) + + +@pytest.fixture +def color_image() -> np.ndarray: + """Valid color image array of shape ``(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)``.""" + return np.zeros((IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), dtype=np.uint8) + + +@pytest.fixture +def action_arrays(rng: np.random.Generator) -> tuple[np.ndarray, np.ndarray]: + """Predicted and ground truth action delta arrays of shape ``(100, NUM_JOINTS)``.""" + predicted = rng.normal(0, 0.01, size=(100, NUM_JOINTS)) + ground_truth = rng.normal(0, 0.01, size=(100, NUM_JOINTS)) + return predicted, ground_truth + + +@pytest.fixture +def inference_times(rng: np.random.Generator) -> np.ndarray: + """Per-step inference times in seconds, shape ``(100,)``.""" + return rng.uniform(0.001, 0.01, 
size=(100,)) + + +@pytest.fixture +def mock_azure_ml(monkeypatch: pytest.MonkeyPatch) -> tuple[MagicMock, MagicMock]: + """Inject mock Azure ML and Identity modules into ``sys.modules`` and set env vars.""" + mock_ml = MagicMock() + mock_identity = MagicMock() + + for mod in ("azure", "azure.ai"): + monkeypatch.setitem(sys.modules, mod, MagicMock()) + monkeypatch.setitem(sys.modules, "azure.ai.ml", mock_ml) + monkeypatch.setitem(sys.modules, "azure.identity", mock_identity) + + monkeypatch.setenv("AZURE_SUBSCRIPTION_ID", "test-sub-id") + monkeypatch.setenv("AZURE_RESOURCE_GROUP", "test-rg") + monkeypatch.setenv("AZUREML_WORKSPACE_NAME", "test-ws") + + return mock_ml, mock_identity diff --git a/evaluation/tests/test_batch_lerobot_eval.py b/evaluation/tests/test_batch_lerobot_eval.py new file mode 100644 index 00000000..ac45ef61 --- /dev/null +++ b/evaluation/tests/test_batch_lerobot_eval.py @@ -0,0 +1,314 @@ +"""Unit tests for ``sil/scripts/batch-lerobot-eval.py``.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock + +import numpy as np +import pytest + +# The script imports ``inference.plotting`` at module level. Provide a +# lightweight stub so the module can be loaded without the full +# inference package. 
+_inference = types.ModuleType("inference") +_inference_plotting = types.ModuleType("inference.plotting") +for _name in ( + "plot_action_deltas", + "plot_aggregate_summary", + "plot_cumulative_positions", + "plot_error_heatmap", + "plot_summary_panel", +): + setattr(_inference_plotting, _name, lambda *a, **kw: None) +_inference.plotting = _inference_plotting # type: ignore[attr-defined] +sys.modules.setdefault("inference", _inference) +sys.modules.setdefault("inference.plotting", _inference_plotting) + +_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "batch-lerobot-eval.py" +_spec = importlib.util.spec_from_file_location("batch_lerobot_eval", _SCRIPT) +assert _spec and _spec.loader +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) + +parse_episode_range = _mod.parse_episode_range +run_inference = _mod.run_inference +plot_episode = _mod.plot_episode + + +class TestParseEpisodeRange: + def test_single_number(self) -> None: + assert parse_episode_range("5") == [5] + + def test_comma_separated(self) -> None: + assert parse_episode_range("0,2,5") == [0, 2, 5] + + def test_range(self) -> None: + assert parse_episode_range("1-3") == [1, 2, 3] + + def test_mixed_range_and_numbers(self) -> None: + assert parse_episode_range("0,2,5-7") == [0, 2, 5, 6, 7] + + def test_deduplication(self) -> None: + assert parse_episode_range("1,1,2") == [1, 2] + + def test_overlapping_range_and_number(self) -> None: + assert parse_episode_range("3,1-5") == [1, 2, 3, 4, 5] + + def test_single_element_range(self) -> None: + assert parse_episode_range("4-4") == [4] + + def test_result_is_sorted(self) -> None: + assert parse_episode_range("9,1,5") == [1, 5, 9] + + def test_whitespace_handling(self) -> None: + assert parse_episode_range(" 1 , 3 , 5 ") == [1, 3, 5] + + def test_invalid_value_raises(self) -> None: + with pytest.raises(ValueError): + parse_episode_range("abc") + + +class TestRunInference: + def test_cached_predictions_returned(self, 
tmp_path: Path) -> None: + out_path = tmp_path / "ep001_predictions.npz" + out_path.write_bytes(b"cached") + result = run_inference(1, "repo", "dataset", "cpu", tmp_path) + assert result == out_path + + def test_successful_inference_returns_path(self, tmp_path: Path, monkeypatch) -> None: + mock_result = MagicMock(returncode=0, stdout="MSE: 0.01\n", stderr="") + monkeypatch.setattr(_mod.subprocess, "run", lambda *a, **kw: mock_result) + result = run_inference(1, "repo", "dataset", "cpu", tmp_path) + assert result == tmp_path / "ep001_predictions.npz" + + def test_failed_inference_returns_none(self, tmp_path: Path, monkeypatch) -> None: + mock_result = MagicMock(returncode=1, stdout="", stderr="Error occurred") + monkeypatch.setattr(_mod.subprocess, "run", lambda *a, **kw: mock_result) + result = run_inference(1, "repo", "dataset", "cpu", tmp_path) + assert result is None + + def test_metric_lines_printed(self, tmp_path: Path, monkeypatch, capsys) -> None: + mock_result = MagicMock(returncode=0, stdout="Loading model...\nMSE: 0.01\nMAE: 0.05\nDone.", stderr="") + monkeypatch.setattr(_mod.subprocess, "run", lambda *a, **kw: mock_result) + run_inference(1, "repo", "dataset", "cpu", tmp_path) + captured = capsys.readouterr() + assert "MSE: 0.01" in captured.out + assert "MAE: 0.05" in captured.out + + +class TestPlotEpisode: + @staticmethod + def _stub_plotting(monkeypatch) -> MagicMock: + mock_fig = MagicMock() + for name in ( + "plot_action_deltas", + "plot_cumulative_positions", + "plot_error_heatmap", + "plot_summary_panel", + ): + monkeypatch.setattr(_mod, name, lambda *a, _f=mock_fig, **kw: _f) + monkeypatch.setattr(_mod.plt, "close", lambda *a: None) + return mock_fig + + def test_returns_metrics_dict(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + pred = np.array([[1.0, 2.0], [3.0, 4.0]]) + gt = np.array([[1.1, 2.1], [3.1, 4.1]]) + inf_times = np.array([0.01, 0.02]) + npz_path = tmp_path / "predictions.npz" + np.savez(npz_path, 
predicted=pred, ground_truth=gt, inference_times=inf_times) + + metrics = plot_episode(1, npz_path, tmp_path, fps=30.0, dpi=150) + + assert metrics is not None + assert metrics["episode"] == 1 + assert metrics["steps"] == 2 + assert "mse" in metrics + assert "mae" in metrics + assert "per_joint_mae" in metrics + assert "avg_inference_ms" in metrics + assert "throughput_hz" in metrics + + def test_creates_episode_directory(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + pred = np.array([[1.0]]) + gt = np.array([[1.1]]) + inf_times = np.array([0.01]) + npz_path = tmp_path / "predictions.npz" + np.savez(npz_path, predicted=pred, ground_truth=gt, inference_times=inf_times) + + plot_episode(5, npz_path, tmp_path, fps=30.0, dpi=150) + assert (tmp_path / "episode_005").is_dir() + + def test_metric_values(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + pred = np.array([[0.0, 0.0]]) + gt = np.array([[1.0, 0.0]]) + inf_times = np.array([0.01]) + npz_path = tmp_path / "predictions.npz" + np.savez(npz_path, predicted=pred, ground_truth=gt, inference_times=inf_times) + + metrics = plot_episode(1, npz_path, tmp_path, fps=30.0, dpi=150) + + assert metrics["mse"] == pytest.approx(0.5) + assert metrics["mae"] == pytest.approx(0.5) + assert metrics["avg_inference_ms"] == pytest.approx(10.0) + assert metrics["throughput_hz"] == pytest.approx(100.0) + + def test_zero_inference_time_throughput(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + pred = np.array([[1.0]]) + gt = np.array([[1.0]]) + inf_times = np.array([0.0]) + npz_path = tmp_path / "predictions.npz" + np.savez(npz_path, predicted=pred, ground_truth=gt, inference_times=inf_times) + + metrics = plot_episode(1, npz_path, tmp_path, fps=30.0, dpi=150) + assert metrics["throughput_hz"] == 0.0 + + +class TestMain: + @staticmethod + def _stub_plotting(monkeypatch) -> MagicMock: + mock_fig = MagicMock() + for name in ( + 
"plot_action_deltas", + "plot_cumulative_positions", + "plot_error_heatmap", + "plot_summary_panel", + "plot_aggregate_summary", + ): + monkeypatch.setattr(_mod, name, lambda *a, _f=mock_fig, **kw: _f) + monkeypatch.setattr(_mod.plt, "close", lambda *a: None) + return mock_fig + + def test_plot_only_generates_metrics_json(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + pred = np.array([[1.0, 2.0]]) + gt = np.array([[1.1, 2.1]]) + inf_times = np.array([0.01]) + for ep in (1, 2): + np.savez( + tmp_path / f"ep{ep:03d}_predictions.npz", + predicted=pred, + ground_truth=gt, + inference_times=inf_times, + ) + + out_dir = tmp_path / "output" + monkeypatch.setattr( + sys, + "argv", + [ + "batch-lerobot-eval", + "--plot-only", + "--npz-dir", + str(tmp_path), + "--episodes", + "1-2", + "--output-dir", + str(out_dir), + ], + ) + _mod.main() + + metrics_path = out_dir / "eval_metrics.json" + assert metrics_path.exists() + metrics = json.loads(metrics_path.read_text()) + assert len(metrics) == 2 + + def test_missing_npz_skipped_in_plot_only(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + out_dir = tmp_path / "output" + monkeypatch.setattr( + sys, + "argv", + [ + "batch-lerobot-eval", + "--plot-only", + "--npz-dir", + str(tmp_path), + "--episodes", + "1", + "--output-dir", + str(out_dir), + ], + ) + _mod.main() + assert not (out_dir / "eval_metrics.json").exists() + + def test_requires_policy_and_dataset_without_plot_only(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setattr( + sys, + "argv", + [ + "batch-lerobot-eval", + "--episodes", + "1", + "--output-dir", + str(tmp_path), + ], + ) + with pytest.raises(SystemExit): + _mod.main() + + def test_plot_episode_returning_none_continues_loop(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + monkeypatch.setattr(_mod, "plot_episode", lambda *a, **kw: None) + pred = np.array([[1.0, 2.0]]) + gt = np.array([[1.1, 2.1]]) + 
inf_times = np.array([0.01]) + for ep in (1, 2): + np.savez( + tmp_path / f"ep{ep:03d}_predictions.npz", + predicted=pred, + ground_truth=gt, + inference_times=inf_times, + ) + + out_dir = tmp_path / "output" + monkeypatch.setattr( + sys, + "argv", + [ + "batch-lerobot-eval", + "--plot-only", + "--npz-dir", + str(tmp_path), + "--episodes", + "1-2", + "--output-dir", + str(out_dir), + ], + ) + _mod.main() + assert not (out_dir / "eval_metrics.json").exists() + + def test_inference_failure_skips_plotting(self, tmp_path: Path, monkeypatch) -> None: + self._stub_plotting(monkeypatch) + mock_result = MagicMock(returncode=1, stdout="", stderr="Error") + monkeypatch.setattr(_mod.subprocess, "run", lambda *a, **kw: mock_result) + + out_dir = tmp_path / "output" + monkeypatch.setattr( + sys, + "argv", + [ + "batch-lerobot-eval", + "--policy-repo", + "repo", + "--dataset-dir", + str(tmp_path), + "--episodes", + "1-2", + "--output-dir", + str(out_dir), + ], + ) + _mod.main() + assert not (out_dir / "eval_metrics.json").exists() diff --git a/evaluation/tests/test_bootstrap_mlflow.py b/evaluation/tests/test_bootstrap_mlflow.py new file mode 100644 index 00000000..c0969087 --- /dev/null +++ b/evaluation/tests/test_bootstrap_mlflow.py @@ -0,0 +1,75 @@ +"""Unit tests for ``metrics.bootstrap_mlflow`` module-level script.""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +_EVAL_ROOT = Path(__file__).resolve().parent.parent +_SCRIPT_PATH = _EVAL_ROOT / "metrics" / "bootstrap_mlflow.py" + + +class TestBootstrapMlflow: + """Execute the bootstrap script via importlib with mocked Azure and MLflow.""" + + @pytest.fixture(autouse=True) + def _setup( + self, + monkeypatch: pytest.MonkeyPatch, + mock_azure_ml: tuple[MagicMock, MagicMock], + tmp_path: Path, + ) -> None: + mock_ml, _ = mock_azure_ml + self.mock_mlflow = MagicMock() + monkeypatch.setitem(sys.modules, "mlflow", 
self.mock_mlflow) + + self.mock_workspace = MagicMock() + self.mock_workspace.mlflow_tracking_uri = "azureml://test-tracking" + mock_ml.MLClient.return_value.workspaces.get.return_value = self.mock_workspace + + self.config_path = tmp_path / "mlflow_config.env" + monkeypatch.setenv("MLFLOW_CONFIG_PATH", str(self.config_path)) + + def _load_script(self) -> None: + spec = importlib.util.spec_from_file_location("bootstrap_mlflow", _SCRIPT_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + def test_writes_config_with_tracking_uri(self) -> None: + self._load_script() + content = self.config_path.read_text() + assert "MLFLOW_TRACKING_URI=azureml://test-tracking\n" in content + + def test_default_experiment_name_uses_policy_type( + self, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("POLICY_TYPE", "diffusion") + self._load_script() + content = self.config_path.read_text() + assert "MLFLOW_EXPERIMENT_NAME=lerobot-diffusion-inference\n" in content + + def test_custom_experiment_name(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("EXPERIMENT_NAME", "my-experiment") + self._load_script() + content = self.config_path.read_text() + assert "MLFLOW_EXPERIMENT_NAME=my-experiment\n" in content + + def test_none_experiment_falls_back_to_default( + self, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("EXPERIMENT_NAME", "none") + self._load_script() + content = self.config_path.read_text() + assert "MLFLOW_EXPERIMENT_NAME=lerobot-act-inference\n" in content + + def test_missing_tracking_uri_exits(self) -> None: + self.mock_workspace.mlflow_tracking_uri = None + with pytest.raises(SystemExit) as exc_info: + self._load_script() + assert exc_info.value.code == 1 diff --git a/evaluation/tests/test_download_aml_model.py b/evaluation/tests/test_download_aml_model.py new file mode 100644 index 00000000..c8c6b358 --- /dev/null +++ b/evaluation/tests/test_download_aml_model.py @@ -0,0 
+1,95 @@ +"""Unit tests for ``sil.scripts.download_aml_model`` module-level script.""" + +from __future__ import annotations + +import importlib.util +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +_EVAL_ROOT = Path(__file__).resolve().parent.parent +_SCRIPT_PATH = _EVAL_ROOT / "sil" / "scripts" / "download_aml_model.py" + + +class TestDownloadAmlModel: + """Execute the download script via importlib with mocked Azure SDK.""" + + @pytest.fixture(autouse=True) + def _setup( + self, + monkeypatch: pytest.MonkeyPatch, + mock_azure_ml: tuple[MagicMock, MagicMock], + tmp_path: Path, + ) -> None: + mock_ml, _ = mock_azure_ml + self.mock_client = mock_ml.MLClient.return_value + + monkeypatch.setenv("AML_MODEL_NAME", "test-model") + monkeypatch.setenv("AML_MODEL_VERSION", "3") + + self.download_dir = tmp_path / "aml-model" + self.config_path = tmp_path / "aml_model_path.env" + monkeypatch.setenv("AML_DOWNLOAD_DIR", str(self.download_dir)) + monkeypatch.setenv("AML_CONFIG_PATH", str(self.config_path)) + + def _load_script(self) -> None: + spec = importlib.util.spec_from_file_location("download_aml_model", _SCRIPT_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + def test_calls_download_with_model_info(self) -> None: + model_dir = self.download_dir / "test-model" + model_dir.mkdir(parents=True, exist_ok=True) + (model_dir / "weights.safetensors").write_bytes(b"\x00" * 64) + + self._load_script() + + self.mock_client.models.download.assert_called_once_with( + name="test-model", + version="3", + download_path=str(self.download_dir), + ) + + def test_writes_config_env(self) -> None: + model_dir = self.download_dir / "test-model" + model_dir.mkdir(parents=True, exist_ok=True) + (model_dir / "weights.safetensors").write_bytes(b"\x00" * 64) + + self._load_script() + + content = self.config_path.read_text() + assert content.startswith("AML_MODEL_PATH=") + + def test_finds_safetensors_directory(self) -> 
None: + model_dir = self.download_dir / "test-model" + model_dir.mkdir(parents=True, exist_ok=True) + sub = model_dir / "checkpoint" + sub.mkdir() + (sub / "model.safetensors").write_bytes(b"\x00" * 64) + + self._load_script() + + content = self.config_path.read_text() + assert "checkpoint" in content + + def test_finds_bin_directory(self) -> None: + model_dir = self.download_dir / "test-model" + model_dir.mkdir(parents=True, exist_ok=True) + sub = model_dir / "ckpt" + sub.mkdir() + (sub / "pytorch_model.bin").write_bytes(b"\x00" * 64) + + self._load_script() + + content = self.config_path.read_text() + assert "ckpt" in content + + def test_falls_back_to_download_dir_when_no_model_name_dir(self) -> None: + self.download_dir.mkdir(parents=True, exist_ok=True) + (self.download_dir / "random_file.txt").write_bytes(b"\x00" * 64) + + self._load_script() + + content = self.config_path.read_text() + assert f"AML_MODEL_PATH={self.download_dir}" in content diff --git a/evaluation/tests/test_download_blob_dataset.py b/evaluation/tests/test_download_blob_dataset.py new file mode 100644 index 00000000..39425df3 --- /dev/null +++ b/evaluation/tests/test_download_blob_dataset.py @@ -0,0 +1,88 @@ +"""Unit tests for ``sil.scripts.download_blob_dataset`` module-level script.""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +_EVAL_ROOT = Path(__file__).resolve().parent.parent +_SCRIPT_PATH = _EVAL_ROOT / "sil" / "scripts" / "download_blob_dataset.py" + + +class TestDownloadBlobDataset: + """Execute the download script with mocked Azure SDK and redirected output paths.""" + + @pytest.fixture(autouse=True) + def _setup(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + # Mock azure.identity and azure.storage.blob via sys.modules. 
+ mock_identity = MagicMock() + mock_blob = MagicMock() + + blob_a = MagicMock() + blob_a.name = "myprefix/sub/file_a.bin" + blob_b = MagicMock() + blob_b.name = "myprefix/file_b.txt" + # Empty rel-path entry should be skipped. + blob_skip = MagicMock() + blob_skip.name = "myprefix/" + + self.client = MagicMock() + self.client.list_blobs.return_value = [blob_a, blob_b, blob_skip] + download_stream = MagicMock() + download_stream.readall.return_value = b"data-bytes" + self.client.download_blob.return_value = download_stream + + mock_blob.ContainerClient.from_container_url.return_value = self.client + self.mock_blob = mock_blob + self.mock_identity = mock_identity + + monkeypatch.setitem(sys.modules, "azure", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.identity", mock_identity) + monkeypatch.setitem(sys.modules, "azure.storage", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.storage.blob", mock_blob) + + monkeypatch.setenv("BLOB_STORAGE_ACCOUNT", "myacct") + monkeypatch.setenv("BLOB_PREFIX", "myprefix") + monkeypatch.delenv("BLOB_STORAGE_CONTAINER", raising=False) + + self.data_root = tmp_path / "workspace_data" + self.config_path = tmp_path / "dataset_path.env" + self.local_root = self.data_root / "myprefix" + monkeypatch.setenv("DATA_ROOT", str(self.data_root)) + monkeypatch.setenv("DATASET_CONFIG_PATH", str(self.config_path)) + + def _run(self) -> None: + spec = importlib.util.spec_from_file_location("download_blob_dataset", _SCRIPT_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + def test_default_container_used(self) -> None: + self._run() + url = self.mock_blob.ContainerClient.from_container_url.call_args[0][0] + assert url == "https://myacct.blob.core.windows.net/datasets" + + def test_custom_container_env(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("BLOB_STORAGE_CONTAINER", "custom-ctr") + self._run() + url = self.mock_blob.ContainerClient.from_container_url.call_args[0][0] 
+ assert url.endswith("/custom-ctr") + + def test_writes_files_and_skips_empty_rel(self) -> None: + self._run() + assert (self.local_root / "sub" / "file_a.bin").read_bytes() == b"data-bytes" + assert (self.local_root / "file_b.txt").read_bytes() == b"data-bytes" + downloaded = [c.args[0] for c in self.client.download_blob.call_args_list] + assert "myprefix/" not in downloaded + + def test_writes_config_env(self) -> None: + self._run() + content = self.config_path.read_text() + assert content == f"DATASET_DIR={self.local_root}\n" + + def test_uses_default_credential(self) -> None: + self._run() + self.mock_identity.DefaultAzureCredential.assert_called_once() diff --git a/evaluation/tests/test_plot_lerobot_trajectories.py b/evaluation/tests/test_plot_lerobot_trajectories.py new file mode 100644 index 00000000..61a9839e --- /dev/null +++ b/evaluation/tests/test_plot_lerobot_trajectories.py @@ -0,0 +1,112 @@ +"""Tests for metrics/plot-lerobot-trajectories.py.""" + +from __future__ import annotations + +import importlib.util +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock + +import numpy as np +import pytest + +_REPO_ROOT = Path(__file__).resolve().parents[1] +_SCRIPT_PATH = _REPO_ROOT / "metrics" / "plot-lerobot-trajectories.py" + +_inference_pkg = types.ModuleType("inference") +_inference_pkg.__path__ = [] # type: ignore[attr-defined] +_plotting_mod = types.ModuleType("inference.plotting") +for _name in ( + "plot_action_deltas", + "plot_cumulative_positions", + "plot_error_heatmap", + "plot_summary_panel", +): + setattr(_plotting_mod, _name, lambda *a, **kw: None) +sys.modules.setdefault("inference", _inference_pkg) +sys.modules["inference.plotting"] = _plotting_mod + +_spec = importlib.util.spec_from_file_location("plot_lerobot_trajectories", _SCRIPT_PATH) +assert _spec is not None and _spec.loader is not None +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) + + +def _stub_plotting(monkeypatch) 
-> MagicMock: + mock_fig = MagicMock() + for name in ( + "plot_action_deltas", + "plot_cumulative_positions", + "plot_error_heatmap", + "plot_summary_panel", + ): + monkeypatch.setattr(_mod, name, lambda *a, _f=mock_fig, **kw: _f) + monkeypatch.setattr(_mod.plt, "close", lambda *a: None) + return mock_fig + + +def _write_npz(path: Path) -> None: + np.savez( + path, + predicted=np.array([[1.0, 2.0], [3.0, 4.0]]), + ground_truth=np.array([[1.1, 2.1], [3.1, 4.1]]), + inference_times=np.array([0.01, 0.02]), + ) + + +class TestMain: + def test_missing_file_exits(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setattr( + sys, + "argv", + [ + "plot-lerobot-trajectories", + str(tmp_path / "nope.npz"), + ], + ) + with pytest.raises(SystemExit) as exc_info: + _mod.main() + assert exc_info.value.code == 1 + + def test_default_output_dir_creates_sibling(self, tmp_path: Path, monkeypatch) -> None: + _stub_plotting(monkeypatch) + npz_path = tmp_path / "predictions.npz" + _write_npz(npz_path) + + monkeypatch.setattr( + sys, + "argv", + [ + "plot-lerobot-trajectories", + str(npz_path), + ], + ) + _mod.main() + assert (tmp_path / "trajectory_plots").is_dir() + + def test_custom_output_dir_used(self, tmp_path: Path, monkeypatch) -> None: + mock_fig = _stub_plotting(monkeypatch) + npz_path = tmp_path / "predictions.npz" + _write_npz(npz_path) + out_dir = tmp_path / "custom_out" + + monkeypatch.setattr( + sys, + "argv", + [ + "plot-lerobot-trajectories", + str(npz_path), + "--output-dir", + str(out_dir), + "--episode", + "5", + "--fps", + "60.0", + "--dpi", + "200", + ], + ) + _mod.main() + assert out_dir.is_dir() + assert mock_fig.savefig.call_count == 4 diff --git a/evaluation/tests/test_plotting.py b/evaluation/tests/test_plotting.py new file mode 100644 index 00000000..8ca04480 --- /dev/null +++ b/evaluation/tests/test_plotting.py @@ -0,0 +1,147 @@ +"""Unit tests for ``metrics.plotting``.""" + +from __future__ import annotations + +import matplotlib.pyplot as plt +import 
numpy as np +import pytest +from metrics.plotting import ( + JOINT_NAMES, + plot_action_deltas, + plot_aggregate_summary, + plot_cumulative_positions, + plot_error_heatmap, + plot_summary_panel, +) +from sil.robot_types import NUM_JOINTS + + +@pytest.fixture(autouse=True) +def _close_figures(): + """Close all matplotlib figures after each test to prevent memory leaks.""" + yield + plt.close("all") + + +class TestPlotActionDeltas: + def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_action_deltas(predicted, ground_truth, episode=1, fps=30.0) + assert isinstance(fig, plt.Figure) + + def test_subplot_count(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_action_deltas(predicted, ground_truth, episode=1, fps=30.0) + assert len(fig.axes) == NUM_JOINTS + + def test_custom_joint_names(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + names = [f"j{i}" for i in range(NUM_JOINTS)] + fig = plot_action_deltas(predicted, ground_truth, episode=1, fps=30.0, joint_names=names) + assert isinstance(fig, plt.Figure) + + +class TestPlotCumulativePositions: + def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_cumulative_positions(predicted, ground_truth, episode=2, fps=30.0) + assert isinstance(fig, plt.Figure) + + def test_subplot_count(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_cumulative_positions(predicted, ground_truth, episode=2, fps=30.0) + assert len(fig.axes) == NUM_JOINTS + + +class TestPlotErrorHeatmap: + def test_returns_figure(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_error_heatmap(predicted, ground_truth, episode=3, fps=30.0) + 
assert isinstance(fig, plt.Figure) + + def test_single_axis(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + fig = plot_error_heatmap(predicted, ground_truth, episode=3, fps=30.0) + # One heatmap axes + one colorbar axes. + assert len(fig.axes) == 2 + + +class TestPlotSummaryPanel: + def test_returns_figure( + self, + action_arrays: tuple[np.ndarray, np.ndarray], + inference_times: np.ndarray, + ) -> None: + predicted, ground_truth = action_arrays + fig = plot_summary_panel(predicted, ground_truth, inference_times, episode=4, fps=30.0) + assert isinstance(fig, plt.Figure) + + def test_2x2_layout( + self, + action_arrays: tuple[np.ndarray, np.ndarray], + inference_times: np.ndarray, + ) -> None: + predicted, ground_truth = action_arrays + fig = plot_summary_panel(predicted, ground_truth, inference_times, episode=4, fps=30.0) + assert len(fig.axes) == 4 + + def test_requires_inference_times(self, action_arrays: tuple[np.ndarray, np.ndarray]) -> None: + predicted, ground_truth = action_arrays + with pytest.raises(TypeError): + plot_summary_panel(predicted, ground_truth, episode=4, fps=30.0) # type: ignore[call-arg] + + +def test_joint_names_default_length() -> None: + assert len(JOINT_NAMES) == NUM_JOINTS + + +class TestPlotAggregateSummary: + @pytest.fixture() + def episode_metrics(self) -> list[dict]: + return [ + { + "episode": 1, + "mse": 0.01, + "mae": 0.05, + "throughput_hz": 120.0, + "avg_inference_ms": 8.3, + "per_joint_mae": [0.04, 0.05, 0.06, 0.03, 0.07, 0.02], + }, + { + "episode": 2, + "mse": 0.02, + "mae": 0.08, + "throughput_hz": 110.0, + "avg_inference_ms": 9.1, + "per_joint_mae": [0.05, 0.06, 0.07, 0.04, 0.08, 0.03], + }, + { + "episode": 3, + "mse": 0.005, + "mae": 0.03, + "throughput_hz": 130.0, + "avg_inference_ms": 7.7, + "per_joint_mae": [0.03, 0.04, 0.05, 0.02, 0.06, 0.01], + }, + ] + + def test_returns_figure(self, episode_metrics) -> None: + fig = plot_aggregate_summary(episode_metrics) + 
assert isinstance(fig, plt.Figure) + + def test_2x2_layout(self, episode_metrics) -> None: + fig = plot_aggregate_summary(episode_metrics) + assert len(fig.axes) == 4 + + def test_custom_joint_names(self, episode_metrics) -> None: + names = ["j1", "j2", "j3", "j4", "j5", "j6"] + fig = plot_aggregate_summary(episode_metrics, joint_names=names) + ax_top_right = fig.axes[1] + tick_labels = [t.get_text() for t in ax_top_right.get_xticklabels()] + assert tick_labels == names + + def test_uses_default_joint_names(self, episode_metrics) -> None: + fig = plot_aggregate_summary(episode_metrics) + ax_top_right = fig.axes[1] + tick_labels = [t.get_text() for t in ax_top_right.get_xticklabels()] + assert tick_labels == list(JOINT_NAMES) diff --git a/evaluation/tests/test_policy_evaluation.py b/evaluation/tests/test_policy_evaluation.py new file mode 100644 index 00000000..d317bd40 --- /dev/null +++ b/evaluation/tests/test_policy_evaluation.py @@ -0,0 +1,533 @@ +"""Unit tests for ``sil.policy_evaluation``.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +torch = pytest.importorskip("torch") + +from sil import policy_evaluation # noqa: E402 +from sil.policy_evaluation import ( # noqa: E402 + Metrics, + ModelMetadata, + _build_parser, + _load_rsl_rl, + _load_skrl, + evaluate, + find_checkpoint, + load_agent, + load_metadata, + main, +) + + +class TestModelMetadata: + def test_defaults(self) -> None: + meta = ModelMetadata() + assert meta.task == "" + assert meta.framework == "skrl" + assert meta.success_threshold == 0.7 + + def test_custom_values(self) -> None: + meta = ModelMetadata(task="Lift-v0", framework="rsl_rl", success_threshold=0.9) + assert meta.task == "Lift-v0" + assert meta.framework == "rsl_rl" + assert meta.success_threshold == 0.9 + + +class TestLoadMetadata: + def test_auto_task_becomes_empty(self) -> None: + meta = load_metadata(task="auto", 
framework="skrl", success_threshold=0.7) + assert meta.task == "" + + def test_empty_task_stays_empty(self) -> None: + meta = load_metadata(task="", framework="skrl", success_threshold=0.7) + assert meta.task == "" + + def test_explicit_task_preserved(self) -> None: + meta = load_metadata(task="Reach-v0", framework="rsl_rl", success_threshold=0.5) + assert meta.task == "Reach-v0" + assert meta.framework == "rsl_rl" + assert meta.success_threshold == 0.5 + + def test_negative_threshold_uses_default(self) -> None: + meta = load_metadata(task="Lift-v0", framework="skrl", success_threshold=-1.0) + assert meta.success_threshold == 0.7 + + def test_zero_threshold_kept(self) -> None: + meta = load_metadata(task="Lift-v0", framework="skrl", success_threshold=0.0) + assert meta.success_threshold == 0.0 + + def test_auto_framework_becomes_default(self) -> None: + meta = load_metadata(task="Lift-v0", framework="auto", success_threshold=0.5) + assert meta.framework == "skrl" + + def test_empty_framework_becomes_default(self) -> None: + meta = load_metadata(task="Lift-v0", framework="", success_threshold=0.5) + assert meta.framework == "skrl" + + +class TestMetrics: + def test_empty_to_dict_returns_error(self) -> None: + m = Metrics() + result = m.to_dict() + assert "error" in result + assert result["error"] == "No episodes completed" + + def test_count_starts_at_zero(self) -> None: + m = Metrics() + assert m.count == 0 + + def test_add_increments_count(self) -> None: + m = Metrics() + m.add(reward=10.0, length=50, success=True) + assert m.count == 1 + m.add(reward=20.0, length=60, success=False) + assert m.count == 2 + + def test_to_dict_single_episode(self) -> None: + m = Metrics() + m.add(reward=10.0, length=50, success=True) + result = m.to_dict() + assert result["eval_episodes"] == 1 + assert result["mean_reward"] == 10.0 + assert result["std_reward"] == 0.0 + assert result["mean_length"] == 50.0 + assert result["success_rate"] == 1.0 + + def 
test_to_dict_multiple_episodes(self) -> None: + m = Metrics() + m.add(reward=10.0, length=50, success=True) + m.add(reward=20.0, length=100, success=False) + result = m.to_dict() + assert result["eval_episodes"] == 2 + assert result["mean_reward"] == pytest.approx(15.0) + assert result["mean_length"] == pytest.approx(75.0) + assert result["success_rate"] == pytest.approx(0.5) + + def test_rewards_and_lengths_tracked(self) -> None: + m = Metrics() + m.add(reward=5.0, length=10, success=False) + m.add(reward=15.0, length=30, success=True) + assert m.rewards == [5.0, 15.0] + assert m.lengths == [10, 30] + assert m.successes == 1 + + +class TestFindCheckpoint: + def test_file_with_pt_extension(self, tmp_path: Path) -> None: + ckpt = tmp_path / "model.pt" + ckpt.write_bytes(b"fake") + assert find_checkpoint(str(ckpt)) == str(ckpt) + + def test_file_with_pth_extension(self, tmp_path: Path) -> None: + ckpt = tmp_path / "model.pth" + ckpt.write_bytes(b"fake") + assert find_checkpoint(str(ckpt)) == str(ckpt) + + def test_bad_extension_raises(self, tmp_path: Path) -> None: + bad = tmp_path / "model.onnx" + bad.write_bytes(b"fake") + with pytest.raises(FileNotFoundError): + find_checkpoint(str(bad)) + + def test_nonexistent_file_raises(self) -> None: + with pytest.raises(FileNotFoundError): + find_checkpoint("/tmp/nonexistent_checkpoint.pt") + + def test_directory_finds_best_agent(self, tmp_path: Path) -> None: + best = tmp_path / "best_agent.pt" + best.write_bytes(b"best") + assert find_checkpoint(str(tmp_path)) == str(best) + + def test_directory_finds_checkpoint_subdir(self, tmp_path: Path) -> None: + ckpt_dir = tmp_path / "checkpoints" + ckpt_dir.mkdir() + ckpt = ckpt_dir / "step_1000.pt" + ckpt.write_bytes(b"data") + assert find_checkpoint(str(tmp_path)) == str(ckpt) + + def test_directory_prefers_best_agent_over_glob(self, tmp_path: Path) -> None: + best = tmp_path / "best_agent.pt" + best.write_bytes(b"best") + other = tmp_path / "other.pt" + 
other.write_bytes(b"other") + assert find_checkpoint(str(tmp_path)) == str(best) + + def test_directory_selects_newest_by_mtime(self, tmp_path: Path) -> None: + import time + + old = tmp_path / "old.pt" + old.write_bytes(b"old") + time.sleep(0.05) + new = tmp_path / "new.pt" + new.write_bytes(b"new") + result = find_checkpoint(str(tmp_path)) + assert result == str(new) + + def test_empty_directory_raises(self, tmp_path: Path) -> None: + with pytest.raises(FileNotFoundError): + find_checkpoint(str(tmp_path)) + + +class TestBuildParser: + def test_returns_parser(self) -> None: + parser = _build_parser() + assert isinstance(parser, argparse.ArgumentParser) + + def test_model_path_required(self) -> None: + parser = _build_parser() + with pytest.raises(SystemExit): + parser.parse_args([]) + + def test_defaults(self) -> None: + parser = _build_parser() + args = parser.parse_args(["--model-path", "/tmp/model"]) + assert args.model_path == "/tmp/model" + assert args.task == "" + assert args.framework == "" + assert args.eval_episodes == 100 + assert args.num_envs == 64 + assert args.success_threshold == -1 + assert args.headless is False + assert args.seed == 42 + + def test_all_flags(self) -> None: + parser = _build_parser() + args = parser.parse_args( + [ + "--model-path", + "/m", + "--task", + "Reach-v0", + "--framework", + "rsl_rl", + "--eval-episodes", + "50", + "--num-envs", + "32", + "--success-threshold", + "0.8", + "--headless", + "--seed", + "99", + ] + ) + assert args.task == "Reach-v0" + assert args.framework == "rsl_rl" + assert args.eval_episodes == 50 + assert args.num_envs == 32 + assert args.success_threshold == 0.8 + assert args.headless is True + assert args.seed == 99 + + +class TestLoadAgent: + def test_unsupported_framework_raises(self) -> None: + with pytest.raises(ValueError, match="Unsupported framework"): + load_agent("/tmp/ckpt.pt", "tensorflow", "Reach-v0", MagicMock(), "cuda") + + def test_dispatches_to_skrl_loader(self) -> None: + sentinel = 
object() + with patch("sil.policy_evaluation._load_skrl", return_value=sentinel) as mock: + result = load_agent("/tmp/ckpt.pt", "skrl", "Reach-v0", MagicMock(), "cuda") + assert result is sentinel + mock.assert_called_once() + + def test_dispatches_to_rsl_rl_loader(self) -> None: + sentinel = object() + with patch("sil.policy_evaluation._load_rsl_rl", return_value=sentinel) as mock: + result = load_agent("/tmp/ckpt.pt", "rsl_rl", "Reach-v0", MagicMock(), "cpu") + assert result is sentinel + mock.assert_called_once_with("/tmp/ckpt.pt", "cpu") + + +class TestLoadRslRl: + def test_loads_actor_critic_and_returns_eval_policy(self) -> None: + rsl_rl_modules = MagicMock() + policy = MagicMock() + policy.to.return_value = policy + rsl_rl_modules.ActorCritic.return_value = policy + checkpoint = {"model_cfg": {"a": 1}, "model_state_dict": {"w": 0}} + + with ( + patch.dict(sys.modules, {"rsl_rl": MagicMock(), "rsl_rl.modules": rsl_rl_modules}), + patch("sil.policy_evaluation.torch.load", return_value=checkpoint) as mock_load, + ): + result = _load_rsl_rl("/tmp/ckpt.pt", "cpu") + + mock_load.assert_called_once_with("/tmp/ckpt.pt", map_location="cpu", weights_only=False) + rsl_rl_modules.ActorCritic.assert_called_once_with(a=1) + policy.load_state_dict.assert_called_once_with({"w": 0}) + policy.eval.assert_called_once() + policy.to.assert_called_once_with("cpu") + assert result is policy + + +class _StubEnv: + """Minimal env stub backed by torch tensors for evaluate() loop.""" + + def __init__(self, num_envs: int, episode_len: int, success: bool = True) -> None: + self.num_envs = num_envs + self.device = "cpu" + self._episode_len = episode_len + self._step_count = 0 + self._success = success + + def reset(self) -> tuple[torch.Tensor, dict]: + self._step_count = 0 + return torch.zeros(self.num_envs, 4), {} + + def step(self, actions): + self._step_count += 1 + rewards = torch.ones(self.num_envs, 1) + done = self._step_count >= self._episode_len + terminated = 
torch.full((self.num_envs, 1), done, dtype=torch.bool) + truncated = torch.zeros(self.num_envs, 1, dtype=torch.bool) + info = {"success": torch.full((self.num_envs,), self._success, dtype=torch.bool)} + if done: + self._step_count = 0 + return torch.zeros(self.num_envs, 4), rewards, terminated, truncated, info + + +class TestEvaluate: + def test_skrl_path_collects_metrics(self) -> None: + env = _StubEnv(num_envs=2, episode_len=3, success=True) + agent = MagicMock() + agent.act.return_value = (torch.zeros(2, 1),) + + metrics = evaluate(env, agent, num_episodes=2, framework="skrl") + + assert metrics.count == 2 + assert metrics.successes == 2 + assert all(r == pytest.approx(3.0) for r in metrics.rewards) + assert metrics.lengths == [3, 3] + agent.act.assert_called() + + def test_rsl_rl_path_uses_act_inference(self) -> None: + env = _StubEnv(num_envs=2, episode_len=2, success=False) + agent = MagicMock() + agent.act_inference.return_value = torch.zeros(2, 1) + + metrics = evaluate(env, agent, num_episodes=2, framework="rsl_rl") + + assert metrics.count == 2 + assert metrics.successes == 0 + agent.act_inference.assert_called() + agent.act.assert_not_called() + + def test_progress_logging_at_multiples_of_twenty(self) -> None: + env = _StubEnv(num_envs=20, episode_len=1, success=True) + agent = MagicMock() + agent.act.return_value = (torch.zeros(20, 1),) + + metrics = evaluate(env, agent, num_episodes=20, framework="skrl") + assert metrics.count == 20 + assert metrics.successes == 20 + + def test_break_when_count_reaches_num_episodes_mid_step(self) -> None: + # num_envs (3) > num_episodes (2) so the third done index in a single + # step trips the early-exit guard inside the done_indices loop. 
+ env = _StubEnv(num_envs=3, episode_len=1, success=True) + agent = MagicMock() + agent.act.return_value = (torch.zeros(3, 1),) + metrics = evaluate(env, agent, num_episodes=2, framework="skrl") + assert metrics.count == 2 + + def test_truncated_episode_not_counted_as_success(self) -> None: + class TruncEnv(_StubEnv): + def step(self, actions): + self._step_count += 1 + rewards = torch.ones(self.num_envs, 1) + terminated = torch.zeros(self.num_envs, 1, dtype=torch.bool) + truncated = torch.ones(self.num_envs, 1, dtype=torch.bool) + info = {"success": torch.ones(self.num_envs, dtype=torch.bool)} + return torch.zeros(self.num_envs, 4), rewards, terminated, truncated, info + + env = TruncEnv(num_envs=2, episode_len=1) + agent = MagicMock() + agent.act.return_value = (torch.zeros(2, 1),) + + metrics = evaluate(env, agent, num_episodes=2, framework="skrl") + assert metrics.count == 2 + assert metrics.successes == 0 + + +def _skrl_module_stubs(decorator_calls_inner: bool = True, cfg: object | None = None): + """Build sys.modules patch dict for _load_skrl tests.""" + hydra_mod = MagicMock() + if decorator_calls_inner: + hydra_mod.hydra_task_config = lambda task, entry: lambda fn: lambda: fn(None, cfg) + else: + hydra_mod.hydra_task_config = lambda task, entry: lambda fn: lambda: None + runner_mod = MagicMock() + runner_mod.Runner = MagicMock() + return { + "isaaclab_tasks": MagicMock(), + "isaaclab_tasks.utils": MagicMock(), + "isaaclab_tasks.utils.hydra": hydra_mod, + "skrl": MagicMock(), + "skrl.utils": MagicMock(), + "skrl.utils.runner": MagicMock(), + "skrl.utils.runner.torch": runner_mod, + }, runner_mod + + +class TestLoadSkrl: + def test_to_dict_cfg_creates_runner_and_loads_checkpoint(self) -> None: + cfg = MagicMock() + cfg.to_dict.return_value = {"a": 1} + stubs, runner_mod = _skrl_module_stubs(cfg=cfg) + env = MagicMock() + + with patch.dict(sys.modules, stubs): + agent = _load_skrl("/tmp/ckpt.pt", "Reach-v0", env, "cuda") + + 
runner_mod.Runner.assert_called_once_with(env, {"a": 1}) + runner_instance = runner_mod.Runner.return_value + runner_instance.agent.load.assert_called_once_with("/tmp/ckpt.pt") + runner_instance.agent.enable_training_mode.assert_called_once_with(enabled=False, apply_to_models=True) + assert agent is runner_instance.agent + + def test_dict_cfg_used_directly(self) -> None: + cfg = {"b": 2} + stubs, runner_mod = _skrl_module_stubs(cfg=cfg) + + with patch.dict(sys.modules, stubs): + _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda") + + runner_mod.Runner.assert_called_once() + assert runner_mod.Runner.call_args.args[1] == {"b": 2} + + def test_unsupported_cfg_type_raises(self) -> None: + cfg = object() # no to_dict, not a dict + stubs, _ = _skrl_module_stubs(cfg=cfg) + + with patch.dict(sys.modules, stubs), pytest.raises(ValueError, match="Unexpected agent config type"): + _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda") + + def test_missing_cfg_raises(self) -> None: + stubs, _ = _skrl_module_stubs(decorator_calls_inner=False) + + with patch.dict(sys.modules, stubs), pytest.raises(ValueError, match="Could not load agent configuration"): + _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda") + + def test_restores_sys_argv_after_call(self) -> None: + cfg = {"a": 1} + stubs, _ = _skrl_module_stubs(cfg=cfg) + sentinel_argv = ["prog", "--keep", "me"] + + with patch.dict(sys.modules, stubs), patch.object(sys, "argv", sentinel_argv): + _load_skrl("/tmp/ckpt.pt", "Reach-v0", MagicMock(), "cuda") + assert sys.argv == sentinel_argv + + +def _main_module_stubs(): + """Build sys.modules patch dict for main() tests.""" + isaaclab_app = MagicMock() + gym_mod = MagicMock() + parse_cfg_mod = MagicMock() + skrl_rl_mod = MagicMock() + return { + "isaaclab": MagicMock(), + "isaaclab.app": isaaclab_app, + "isaaclab_tasks": MagicMock(), + "isaaclab_tasks.utils": MagicMock(), + "isaaclab_tasks.utils.parse_cfg": parse_cfg_mod, + "isaaclab_rl": MagicMock(), + 
"isaaclab_rl.skrl": skrl_rl_mod, + "gymnasium": gym_mod, + } + + +class TestMain: + def test_missing_task_returns_one(self) -> None: + argv = ["prog", "--model-path", "/tmp/m"] + with ( + patch.object(sys, "argv", argv), + patch.object(policy_evaluation.os, "_exit") as mock_exit, + ): + rc = main() + assert rc == 1 + # Early return path: os._exit is only invoked from the try/finally + # that wraps successful evaluation, not from the missing-task guard. + mock_exit.assert_not_called() + + def test_success_path_returns_zero(self) -> None: + argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0", "--success-threshold", "0.5"] + metrics = MagicMock() + metrics.to_dict.return_value = {"success_rate": 0.9} + with ( + patch.object(sys, "argv", argv), + patch.dict(sys.modules, _main_module_stubs()), + patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"), + patch("sil.policy_evaluation.load_agent", return_value=MagicMock()), + patch("sil.policy_evaluation.evaluate", return_value=metrics), + patch("sil.policy_evaluation.prepare_for_shutdown"), + patch.object(policy_evaluation.os, "_exit") as mock_exit, + ): + rc = main() + assert rc == 0 + mock_exit.assert_called_once_with(0) + + def test_below_threshold_returns_one(self) -> None: + argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0", "--success-threshold", "0.9"] + metrics = MagicMock() + metrics.to_dict.return_value = {"success_rate": 0.1} + with ( + patch.object(sys, "argv", argv), + patch.dict(sys.modules, _main_module_stubs()), + patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"), + patch("sil.policy_evaluation.load_agent", return_value=MagicMock()), + patch("sil.policy_evaluation.evaluate", return_value=metrics), + patch("sil.policy_evaluation.prepare_for_shutdown"), + patch.object(policy_evaluation.os, "_exit") as mock_exit, + ): + rc = main() + assert rc == 1 + mock_exit.assert_called_once_with(1) + + def test_rsl_rl_framework_skips_skrl_wrapper(self) 
-> None: + argv = [ + "prog", + "--model-path", + "/tmp/m", + "--task", + "Lift-v0", + "--framework", + "rsl_rl", + "--success-threshold", + "0.0", + ] + metrics = MagicMock() + metrics.to_dict.return_value = {"success_rate": 1.0} + stubs = _main_module_stubs() + with ( + patch.object(sys, "argv", argv), + patch.dict(sys.modules, stubs), + patch("sil.policy_evaluation.find_checkpoint", return_value="/tmp/ckpt.pt"), + patch("sil.policy_evaluation.load_agent", return_value=MagicMock()), + patch("sil.policy_evaluation.evaluate", return_value=metrics), + patch("sil.policy_evaluation.prepare_for_shutdown"), + patch.object(policy_evaluation.os, "_exit"), + ): + rc = main() + assert rc == 0 + stubs["isaaclab_rl.skrl"].SkrlVecEnvWrapper.assert_not_called() + + def test_exception_in_try_returns_one(self) -> None: + argv = ["prog", "--model-path", "/tmp/m", "--task", "Lift-v0"] + with ( + patch.object(sys, "argv", argv), + patch.dict(sys.modules, _main_module_stubs()), + patch("sil.policy_evaluation.find_checkpoint", side_effect=RuntimeError("boom")), + patch.object(policy_evaluation.os, "_exit") as mock_exit, + ): + rc = main() + assert rc == 1 + mock_exit.assert_called_once_with(1) diff --git a/evaluation/tests/test_policy_runner.py b/evaluation/tests/test_policy_runner.py new file mode 100644 index 00000000..3ef0eac3 --- /dev/null +++ b/evaluation/tests/test_policy_runner.py @@ -0,0 +1,184 @@ +"""Unit tests for ``sil.policy_runner``.""" + +from __future__ import annotations + +import sys +from unittest.mock import MagicMock + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") + +from sil.policy_runner import InferenceMetrics, PolicyRunner, _resolve_device # noqa: E402 +from sil.robot_types import NUM_JOINTS, JointPositionCommand, RobotObservation # noqa: E402 + + +class TestResolveDevice: + """Device resolution with CUDA / MPS / CPU fallback chain.""" + + def test_cuda_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + 
monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + assert _resolve_device("cuda") == "cuda" + + def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _resolve_device("cuda") == "mps" + + def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _resolve_device("mps") == "mps" + + def test_mps_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False) + assert _resolve_device("mps") == "cpu" + + def test_cpu_always_returns_cpu(self) -> None: + assert _resolve_device("cpu") == "cpu" + + def test_all_unavailable_returns_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False) + assert _resolve_device("cuda") == "cpu" + + +class TestInferenceMetrics: + """InferenceMetrics dataclass defaults and computed properties.""" + + def test_defaults_are_zero(self) -> None: + m = InferenceMetrics() + assert m.steps == 0 + assert m.total_inference_s == 0.0 + assert m.total_preprocess_s == 0.0 + assert m.chunk_queries == 0 + + def test_avg_inference_ms(self) -> None: + m = InferenceMetrics(steps=10, total_inference_s=0.5) + assert m.avg_inference_ms == pytest.approx(50.0) + + def test_avg_preprocess_ms(self) -> None: + m = InferenceMetrics(steps=10, total_preprocess_s=0.2) + assert m.avg_preprocess_ms == pytest.approx(20.0) + + def test_zero_steps_avoids_division_by_zero(self) -> None: + m = InferenceMetrics() + assert m.avg_inference_ms == 0.0 + assert m.avg_preprocess_ms == 0.0 + + +class TestPolicyRunner: + """PolicyRunner with mock policy and processors.""" + + @pytest.fixture + def 
action_tensor(self) -> torch.Tensor: + return torch.randn(1, NUM_JOINTS) + + @pytest.fixture + def runner(self, action_tensor: torch.Tensor) -> PolicyRunner: + policy = MagicMock() + policy.select_action.return_value = action_tensor + preprocessor = MagicMock(side_effect=lambda x: x) + postprocessor = MagicMock(return_value={"action": action_tensor}) + return PolicyRunner(policy, preprocessor, postprocessor, "cpu") + + def test_device_property(self, runner: PolicyRunner) -> None: + assert runner.device == "cpu" + + def test_reset_clears_metrics_and_policy(self, runner: PolicyRunner) -> None: + runner._metrics.steps = 5 + runner.reset() + assert runner.metrics.steps == 0 + runner._policy.reset.assert_called_once() + + def test_step_null_image_returns_zeros( + self, + runner: PolicyRunner, + joint_positions: np.ndarray, + ) -> None: + obs = RobotObservation(joint_positions=joint_positions) + cmd = runner.step(obs) + np.testing.assert_array_equal(cmd.positions, np.zeros(NUM_JOINTS, dtype=np.float32)) + assert cmd.timestamp_s == 0.0 + + def test_step_runs_full_pipeline( + self, + joint_positions: np.ndarray, + color_image: np.ndarray, + action_tensor: torch.Tensor, + ) -> None: + policy = MagicMock() + policy.select_action.return_value = action_tensor + preprocessor = MagicMock(side_effect=lambda x: x) + postprocessor = MagicMock(return_value={"action": action_tensor}) + runner = PolicyRunner(policy, preprocessor, postprocessor, "cpu") + + obs = RobotObservation( + joint_positions=joint_positions, + color_image=color_image, + timestamp_s=1.5, + ) + cmd = runner.step(obs) + + assert isinstance(cmd, JointPositionCommand) + assert cmd.positions.shape == (NUM_JOINTS,) + assert cmd.timestamp_s == 1.5 + preprocessor.assert_called_once() + policy.select_action.assert_called_once() + postprocessor.assert_called_once() + + def test_step_increments_metrics( + self, + runner: PolicyRunner, + joint_positions: np.ndarray, + color_image: np.ndarray, + ) -> None: + obs = 
RobotObservation(joint_positions=joint_positions, color_image=color_image) + runner.step(obs) + runner.step(obs) + assert runner.metrics.steps == 2 + assert runner.metrics.total_inference_s >= 0 + assert runner.metrics.total_preprocess_s >= 0 + + +class TestPolicyRunnerFromPretrained: + """from_pretrained classmethod with mocked lerobot imports.""" + + @pytest.fixture + def lerobot_mocks(self, monkeypatch: pytest.MonkeyPatch) -> tuple[MagicMock, MagicMock]: + mock_act = MagicMock() + mock_pipeline = MagicMock() + for mod in ("lerobot", "lerobot.policies", "lerobot.policies.act", "lerobot.processor"): + monkeypatch.setitem(sys.modules, mod, MagicMock()) + monkeypatch.setitem(sys.modules, "lerobot.policies.act.modeling_act", mock_act) + monkeypatch.setitem(sys.modules, "lerobot.processor.pipeline", mock_pipeline) + return mock_act, mock_pipeline + + def test_loads_and_configures_policy( + self, + monkeypatch: pytest.MonkeyPatch, + lerobot_mocks: tuple[MagicMock, MagicMock], + ) -> None: + mock_act, mock_pipeline = lerobot_mocks + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False) + + runner = PolicyRunner.from_pretrained("test/repo", device="cuda") + + assert runner.device == "cpu" + mock_act.ACTPolicy.from_pretrained.assert_called_once_with("test/repo") + mock_act.ACTPolicy.from_pretrained.return_value.to.assert_called_once_with("cpu") + assert mock_pipeline.PolicyProcessorPipeline.from_pretrained.call_count == 2 + + def test_uses_cuda_when_available( + self, + monkeypatch: pytest.MonkeyPatch, + lerobot_mocks: tuple[MagicMock, MagicMock], + ) -> None: + mock_act, _ = lerobot_mocks + monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + + runner = PolicyRunner.from_pretrained("test/repo") + + assert runner.device == "cuda" + mock_act.ACTPolicy.from_pretrained.return_value.to.assert_called_once_with("cuda") diff --git a/evaluation/tests/test_robot_types.py 
b/evaluation/tests/test_robot_types.py new file mode 100644 index 00000000..f0611057 --- /dev/null +++ b/evaluation/tests/test_robot_types.py @@ -0,0 +1,118 @@ +"""Unit tests for ``sil.robot_types``.""" + +from __future__ import annotations + +import numpy as np +import pytest +from sil.robot_types import ( + IMAGE_CHANNELS, + IMAGE_HEIGHT, + IMAGE_WIDTH, + JOINT_ORDER, + NUM_JOINTS, + JointName, + JointPositionCommand, + RobotObservation, + RobotState, +) + + +class TestJointName: + def test_enum_members(self) -> None: + expected = { + "SHOULDER_PAN", + "SHOULDER_LIFT", + "ELBOW", + "WRIST_1", + "WRIST_2", + "WRIST_3", + } + assert {member.name for member in JointName} == expected + + def test_joint_order_length(self) -> None: + assert len(JOINT_ORDER) == NUM_JOINTS + assert NUM_JOINTS == 6 + + def test_string_values(self) -> None: + for member in JointName: + assert member.value.endswith("_joint") + + +class TestRobotObservation: + def test_valid_construction(self, joint_positions: np.ndarray) -> None: + obs = RobotObservation(joint_positions=joint_positions) + assert obs.joint_positions.shape == (NUM_JOINTS,) + assert obs.color_image is None + assert obs.timestamp_s == 0.0 + + def test_with_color_image(self, joint_positions: np.ndarray, color_image: np.ndarray) -> None: + obs = RobotObservation(joint_positions=joint_positions, color_image=color_image) + assert obs.color_image is not None + assert obs.color_image.shape == (IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS) + + def test_invalid_joint_shape(self) -> None: + with pytest.raises(ValueError, match="joint_positions shape"): + RobotObservation(joint_positions=np.zeros(3)) + + def test_invalid_image_shape(self, joint_positions: np.ndarray) -> None: + with pytest.raises(ValueError, match="color_image shape"): + RobotObservation( + joint_positions=joint_positions, + color_image=np.zeros((10, 10, 3), dtype=np.uint8), + ) + + def test_none_color_image(self, joint_positions: np.ndarray) -> None: + obs = 
RobotObservation(joint_positions=joint_positions, color_image=None) + assert obs.color_image is None + + +class TestJointPositionCommand: + def test_valid_construction(self, joint_positions: np.ndarray) -> None: + cmd = JointPositionCommand(positions=joint_positions, timestamp_s=1.5) + assert cmd.positions.shape == (NUM_JOINTS,) + assert cmd.timestamp_s == 1.5 + + def test_invalid_shape(self) -> None: + with pytest.raises(ValueError, match="positions shape"): + JointPositionCommand(positions=np.zeros(4)) + + def test_as_absolute(self, rng: np.random.Generator) -> None: + deltas = rng.normal(0, 0.1, size=(NUM_JOINTS,)) + current = rng.normal(0, 1.0, size=(NUM_JOINTS,)) + cmd = JointPositionCommand(positions=deltas) + absolute = cmd.as_absolute(current) + np.testing.assert_allclose(absolute.positions, current + deltas) + + def test_as_absolute_preserves_timestamp(self, joint_positions: np.ndarray) -> None: + cmd = JointPositionCommand(positions=joint_positions, timestamp_s=3.14) + absolute = cmd.as_absolute(np.zeros(NUM_JOINTS)) + assert absolute.timestamp_s == 3.14 + + +class TestRobotState: + def test_default_state(self) -> None: + state = RobotState() + assert state.observation is None + assert state.episode_step == 0 + assert state.is_episode_active is False + assert state.action_queue == [] + + def test_clear_episode(self, joint_positions: np.ndarray) -> None: + state = RobotState( + observation=RobotObservation(joint_positions=joint_positions), + episode_step=42, + is_episode_active=True, + ) + state.clear_episode() + assert state.episode_step == 0 + assert state.is_episode_active is False + + def test_clear_episode_empties_queue(self, joint_positions: np.ndarray) -> None: + state = RobotState( + action_queue=[ + JointPositionCommand(positions=joint_positions), + JointPositionCommand(positions=joint_positions), + ], + ) + state.clear_episode() + assert state.action_queue == [] diff --git a/evaluation/tests/test_run_local_lerobot_eval.py 
b/evaluation/tests/test_run_local_lerobot_eval.py new file mode 100644 index 00000000..dc012a83 --- /dev/null +++ b/evaluation/tests/test_run_local_lerobot_eval.py @@ -0,0 +1,546 @@ +"""Tests for sil/scripts/run-local-lerobot-eval.py.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") + +# Stub heavy / external deps before script import. +if "pyarrow" not in sys.modules: + _pa = types.ModuleType("pyarrow") + _pq = types.ModuleType("pyarrow.parquet") + _pq.read_table = MagicMock() + _pa.parquet = _pq + sys.modules["pyarrow"] = _pa + sys.modules["pyarrow.parquet"] = _pq + +if "av" not in sys.modules: + sys.modules["av"] = types.ModuleType("av") + +for _n in ("lerobot", "lerobot.policies", "lerobot.policies.act"): + sys.modules.setdefault(_n, types.ModuleType(_n)) +sys.modules.setdefault( + "lerobot.policies.act.modeling_act", + types.ModuleType("lerobot.policies.act.modeling_act"), +) +sys.modules.setdefault("safetensors", types.ModuleType("safetensors")) +sys.modules.setdefault("safetensors.torch", types.ModuleType("safetensors.torch")) + +_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "run-local-lerobot-eval.py" +_spec = importlib.util.spec_from_file_location("run_local_lerobot_eval", _SCRIPT) +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) + + +# ---------------- helpers ---------------- + + +def _make_args(**overrides) -> SimpleNamespace: + defaults = dict( + policy_path="/tmp/policy", + model_name=None, + model_version=None, + dataset_dir="/tmp/ds", + episodes=1, + output_dir="outputs/local-eval", + device="cpu", + ) + defaults.update(overrides) + return SimpleNamespace(**defaults) + + +def _patch_av(monkeypatch: pytest.MonkeyPatch, frames: list[np.ndarray]) -> None: + av_mod = 
types.ModuleType("av") + + class _Frame: + def __init__(self, arr): + self._arr = arr + + def to_ndarray(self, format="rgb24"): + return self._arr + + class _Stream: + pass + + class _Container: + def __init__(self): + self.streams = SimpleNamespace(video=[_Stream()]) + + def decode(self, _stream): + return [_Frame(f) for f in frames] + + def close(self): + pass + + av_mod.open = lambda _path: _Container() + monkeypatch.setitem(sys.modules, "av", av_mod) + + +def _write_info(dataset_dir: Path, fps: int = 30, action_dim: int = 6, state_dim: int = 6) -> dict: + meta = dataset_dir / "meta" + meta.mkdir(parents=True, exist_ok=True) + info = { + "fps": fps, + "chunks_size": 1000, + "total_episodes": 1, + "features": { + "observation.images.color": {"dtype": "video", "shape": [96, 96, 3]}, + "observation.state": {"dtype": "float32", "shape": [state_dim]}, + "action": {"dtype": "float32", "shape": [action_dim]}, + }, + } + (meta / "info.json").write_text(json.dumps(info)) + return info + + +def _setup_run_evaluation( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + *, + n_frames: int = 4, + n_dims: int = 6, + write_episodes_jsonl: bool = False, +) -> dict: + """Set up a working run_evaluation environment.""" + info = _write_info(tmp_path, action_dim=n_dims, state_dim=n_dims) + + if write_episodes_jsonl: + (tmp_path / "meta" / "episodes.jsonl").write_text('{"episode_index": 0}\n') + + # Create chunk-000/episode_000000.parquet (first candidate). + data_dir = tmp_path / "data" / "chunk-000" + data_dir.mkdir(parents=True) + data_file = data_dir / "episode_000000.parquet" + data_file.write_bytes(b"") + + video_dir = tmp_path / "videos" / "observation.images.color" / "chunk-000" + video_dir.mkdir(parents=True) + video_file = video_dir / "episode_000000.mp4" + video_file.write_bytes(b"") + + # Mock pq.read_table to return synthetic columnar data. 
+ table = MagicMock() + table.column_names = ["timestamp", "observation.state", "action"] + + state_list = [np.zeros(n_dims, dtype=np.float32).tolist() for _ in range(n_frames)] + action_list = [np.zeros(n_dims, dtype=np.float32).tolist() for _ in range(n_frames)] + ts_list = list(range(n_frames)) + + def _getitem(col): + m = MagicMock() + if col == "timestamp": + m.to_pylist.return_value = ts_list + elif col == "observation.state": + m.to_pylist.return_value = state_list + else: + m.to_pylist.return_value = action_list + return m + + table.__getitem__ = lambda self, col: _getitem(col) + monkeypatch.setattr(_mod.pq, "read_table", lambda _path: table) + + # Stub video decoding. + frames = [np.zeros((96, 96, 3), dtype=np.uint8) for _ in range(n_frames)] + monkeypatch.setattr(_mod, "load_video_frames", lambda _p: frames) + + # Stub policy loader. + policy = MagicMock() + policy.parameters.return_value = [torch.zeros(1)] + policy.select_action.return_value = torch.zeros(1, n_dims) + policy.to.return_value = policy + policy.reset = MagicMock() + + act_mod = sys.modules["lerobot.policies.act.modeling_act"] + act_mod.ACTPolicy = SimpleNamespace(from_pretrained=lambda _path: policy) + + monkeypatch.setattr(_mod, "_load_normalizer_stats", lambda *_a, **_k: None) + + return {"policy": policy, "info": info, "frames": frames} + + +# ---------------- TestResolveDevice ---------------- + + +class TestResolveDevice: + def test_cuda_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + assert _mod.resolve_device("cuda") == "cuda" + + def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _mod.resolve_device("cuda") == "mps" + + def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, 
"is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _mod.resolve_device("mps") == "mps" + + def test_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False) + assert _mod.resolve_device("cuda") == "cpu" + + def test_cpu_explicit(self) -> None: + assert _mod.resolve_device("cpu") == "cpu" + + +# ---------------- TestFindDataFile ---------------- + + +class TestFindDataFile: + def test_first_candidate(self, tmp_path: Path) -> None: + d = tmp_path / "data" / "chunk-000" + d.mkdir(parents=True) + f = d / "episode_000000.parquet" + f.write_bytes(b"") + assert _mod.find_data_file(str(tmp_path), 0, {"chunks_size": 1000}) == str(f) + + def test_second_candidate(self, tmp_path: Path) -> None: + d = tmp_path / "data" / "chunk-007" + d.mkdir(parents=True) + f = d / "file-007.parquet" + f.write_bytes(b"") + assert _mod.find_data_file(str(tmp_path), 7, {"chunks_size": 1000}) == str(f) + + def test_no_candidate_returns_none(self, tmp_path: Path) -> None: + assert _mod.find_data_file(str(tmp_path), 0, {}) is None + + +# ---------------- TestFindVideoFile ---------------- + + +class TestFindVideoFile: + def test_first_candidate(self, tmp_path: Path) -> None: + d = tmp_path / "videos" / "key" / "chunk-000" + d.mkdir(parents=True) + f = d / "episode_000000.mp4" + f.write_bytes(b"") + assert _mod.find_video_file(str(tmp_path), "key", 0, {"chunks_size": 1000}) == str(f) + + def test_second_candidate(self, tmp_path: Path) -> None: + d = tmp_path / "videos" / "key" / "chunk-005" + d.mkdir(parents=True) + f = d / "file-005.mp4" + f.write_bytes(b"") + assert _mod.find_video_file(str(tmp_path), "key", 5, {"chunks_size": 1000}) == str(f) + + def test_no_candidate_returns_none(self, tmp_path: Path) -> None: + assert _mod.find_video_file(str(tmp_path), "key", 0, {}) is None + + +# 
---------------- TestLoadVideoFrames ---------------- + + +class TestLoadVideoFrames: + def test_decodes_frames(self, monkeypatch: pytest.MonkeyPatch) -> None: + frames = [np.zeros((4, 4, 3), dtype=np.uint8), np.ones((4, 4, 3), dtype=np.uint8)] + _patch_av(monkeypatch, frames) + result = _mod.load_video_frames("/tmp/x.mp4") + assert len(result) == 2 + assert result[0].shape == (4, 4, 3) + + +# ---------------- TestDownloadAmlModel ---------------- + + +class TestDownloadAmlModel: + def test_finds_safetensors_in_subdir(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml) -> None: + download_root = tmp_path / "tmp" / "aml-model-download" + sub = download_root / "my-model" / "pretrained_model" + sub.mkdir(parents=True) + (sub / "model.safetensors").write_bytes(b"") + # Also create empty parent so iterdir returns the subdir. + monkeypatch.chdir(tmp_path) + + mock_ml, _ = mock_azure_ml + client = MagicMock() + mock_ml.MLClient = MagicMock(return_value=client) + + ident_mod = sys.modules["azure.identity"] + ident_mod.DefaultAzureCredential = MagicMock() + + result = _mod.download_aml_model("my-model", "1") + assert result == Path("tmp/aml-model-download/my-model/pretrained_model") + client.models.download.assert_called_once() + + def test_finds_bin_files(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml) -> None: + download_root = tmp_path / "tmp" / "aml-model-download" + sub = download_root / "m" / "checkpoint" + sub.mkdir(parents=True) + (sub / "weights.bin").write_bytes(b"") + monkeypatch.chdir(tmp_path) + + mock_ml, _ = mock_azure_ml + mock_ml.MLClient = MagicMock(return_value=MagicMock()) + sys.modules["azure.identity"].DefaultAzureCredential = MagicMock() + + result = _mod.download_aml_model("m", "2") + assert result == Path("tmp/aml-model-download/m/checkpoint") + + def test_returns_download_dir_when_no_match( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, mock_azure_ml + ) -> None: + # No model_name dir created → 
model_path = download_dir; iterdir on download_dir + # yields nothing matching → loop ends; returns download_dir. + monkeypatch.chdir(tmp_path) + mock_ml, _ = mock_azure_ml + mock_ml.MLClient = MagicMock(return_value=MagicMock()) + sys.modules["azure.identity"].DefaultAzureCredential = MagicMock() + + result = _mod.download_aml_model("missing", "9") + # Falls through to download_dir which has no matching files; the loop iterates + # over [download_dir] only because model_path.is_dir() is True but no glob + # matches; result remains download_dir. + assert result == Path("tmp/aml-model-download") + + +# ---------------- TestLoadNormalizerStats ---------------- + + +class TestLoadNormalizerStats: + def test_no_files_returns_early(self, tmp_path: Path) -> None: + policy = MagicMock() + _mod._load_normalizer_stats(policy, tmp_path) + policy.load_state_dict.assert_not_called() + + def test_skips_non_processor_files(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + (tmp_path / "model.safetensors").write_bytes(b"") + st = sys.modules["safetensors.torch"] + st.load_file = MagicMock(return_value={"observation.state.mean": torch.zeros(6)}) + policy = MagicMock() + _mod._load_normalizer_stats(policy, tmp_path) + # File present but lacks "processor" → stats stays empty → early return. 
+ policy.load_state_dict.assert_not_called() + + def test_loads_matching_buffers(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + (tmp_path / "preprocessor.safetensors").write_bytes(b"") + st = sys.modules["safetensors.torch"] + mean_t = torch.ones(6) + std_t = torch.full((6,), 2.0) + bad_short = torch.zeros(1) + st.load_file = MagicMock( + return_value={ + "observation.state.mean": mean_t, + "observation.state.std": std_t, + "observation.state.median": bad_short, # not in {mean,std,min,max} + "noseparator": bad_short, # rsplit gives 1 part + } + ) + policy = MagicMock() + policy.state_dict.return_value = { + "normalize_inputs.buffer_observation_state.mean": torch.zeros(6), + "normalize_inputs.buffer_observation_state.std": torch.zeros(6), + } + _mod._load_normalizer_stats(policy, tmp_path) + policy.load_state_dict.assert_called_once() + kwargs = policy.load_state_dict.call_args + assert kwargs.kwargs.get("strict") is False or kwargs.args[1:] == (False,) + + def test_no_matching_buffers_skips_load(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + (tmp_path / "preprocessor.safetensors").write_bytes(b"") + st = sys.modules["safetensors.torch"] + st.load_file = MagicMock(return_value={"observation.state.mean": torch.ones(6)}) + policy = MagicMock() + # state_dict has no matching buffer name. 
+ policy.state_dict.return_value = {"unrelated.weight": torch.zeros(2)} + _mod._load_normalizer_stats(policy, tmp_path) + policy.load_state_dict.assert_not_called() + + +# ---------------- TestRunEvaluation ---------------- + + +class TestRunEvaluation: + def test_happy_path_writes_results(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5) + out = tmp_path / "out" + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(out), episodes=1) + _mod.run_evaluation(args) + assert (out / "eval_results.json").exists() + assert (out / "ep000_predictions.npz").exists() + assert (out / "plots" / "ep000_action_deltas.png").exists() + + def test_strips_config_fields(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5) + policy_dir = tmp_path / "policy" + policy_dir.mkdir() + cfg = {"use_peft": True, "pretrained_path": "x", "peft_config": {}, "keep": 1} + (policy_dir / "config.json").write_text(json.dumps(cfg)) + args = _make_args( + dataset_dir=str(tmp_path), + output_dir=str(tmp_path / "out"), + policy_path=str(policy_dir), + ) + _mod.run_evaluation(args) + new_cfg = json.loads((policy_dir / "config.json").read_text()) + assert "use_peft" not in new_cfg + assert new_cfg["keep"] == 1 + + def test_config_without_strip_fields_unchanged(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5) + policy_dir = tmp_path / "policy" + policy_dir.mkdir() + (policy_dir / "config.json").write_text(json.dumps({"keep": 1})) + args = _make_args( + dataset_dir=str(tmp_path), + output_dir=str(tmp_path / "out"), + policy_path=str(policy_dir), + ) + _mod.run_evaluation(args) + assert json.loads((policy_dir / "config.json").read_text()) == {"keep": 1} + + def test_skip_no_data(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, 
n_frames=4) + # Remove the data file → find_data_file returns None. + (tmp_path / "data" / "chunk-000" / "episode_000000.parquet").unlink() + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + # No metrics → early return, no eval_results.json. + assert not (tmp_path / "out" / "eval_results.json").exists() + + def test_skip_no_video(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4) + (tmp_path / "videos" / "observation.images.color" / "chunk-000" / "episode_000000.mp4").unlink() + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + assert not (tmp_path / "out" / "eval_results.json").exists() + + def test_image_key_fallback(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + # Write info with no video/image features → falls back to default key. + meta = tmp_path / "meta" + meta.mkdir() + info = { + "fps": 30, + "chunks_size": 1000, + "total_episodes": 1, + "features": { + "observation.state": {"dtype": "float32", "shape": [6]}, + "action": {"dtype": "float32", "shape": [6]}, + }, + } + (meta / "info.json").write_text(json.dumps(info)) + # No data file → skipped, but path through image_key fallback must execute. 
+ policy = MagicMock() + policy.parameters.return_value = [torch.zeros(1)] + policy.to.return_value = policy + sys.modules["lerobot.policies.act.modeling_act"].ACTPolicy = SimpleNamespace(from_pretrained=lambda _p: policy) + monkeypatch.setattr(_mod, "_load_normalizer_stats", lambda *_a, **_k: None) + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + + def test_episodes_jsonl_total(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, write_episodes_jsonl=True) + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out"), episodes=10) + _mod.run_evaluation(args) + assert (tmp_path / "out" / "eval_results.json").exists() + + def test_n_dims_one_branch(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, n_dims=1) + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + assert (tmp_path / "out" / "eval_results.json").exists() + + def test_many_dims_small_labelsize(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4, n_dims=10) + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + assert (tmp_path / "out" / "eval_results.json").exists() + + def test_step_print_at_end(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + # n_frames=5 → num_steps=4 → step iterates 0..3, both branches of `step<3 or last` fire. 
+ _setup_run_evaluation(monkeypatch, tmp_path, n_frames=5) + args = _make_args(dataset_dir=str(tmp_path), output_dir=str(tmp_path / "out")) + _mod.run_evaluation(args) + + def test_aml_branch_invokes_download(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_evaluation(monkeypatch, tmp_path, n_frames=4) + called = {} + + def fake_download(name, version): + called["name"] = name + called["version"] = version + return tmp_path / "downloaded" + + monkeypatch.setattr(_mod, "download_aml_model", fake_download) + args = _make_args( + dataset_dir=str(tmp_path), + output_dir=str(tmp_path / "out"), + policy_path=None, + model_name="m", + model_version="1", + ) + _mod.run_evaluation(args) + assert called["name"] == "m" and called["version"] == "1" + + +# ---------------- TestMain ---------------- + + +class TestMain: + def test_no_policy_source_errors(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setattr(sys, "argv", ["run-local-lerobot-eval", "--dataset-dir", str(tmp_path)]) + with pytest.raises(SystemExit): + _mod.main() + + def test_missing_dataset_exits(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setattr( + sys, + "argv", + [ + "run-local-lerobot-eval", + "--policy-path", + "/x", + "--dataset-dir", + str(tmp_path / "missing"), + ], + ) + with pytest.raises(SystemExit) as exc: + _mod.main() + assert exc.value.code == 1 + + def test_invokes_run_evaluation_policy_path(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + called = {} + monkeypatch.setattr(_mod, "run_evaluation", lambda a: called.setdefault("a", a)) + monkeypatch.setattr( + sys, + "argv", + [ + "run-local-lerobot-eval", + "--policy-path", + "/x", + "--dataset-dir", + str(tmp_path), + ], + ) + _mod.main() + assert called["a"].policy_path == "/x" + + def test_invokes_run_evaluation_aml(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + called = {} + monkeypatch.setattr(_mod, "run_evaluation", 
lambda a: called.setdefault("a", a)) + monkeypatch.setattr( + sys, + "argv", + [ + "run-local-lerobot-eval", + "--model-name", + "m", + "--model-version", + "2", + "--dataset-dir", + str(tmp_path), + ], + ) + _mod.main() + assert called["a"].model_name == "m" diff --git a/evaluation/tests/test_test_lerobot_eval.py b/evaluation/tests/test_test_lerobot_eval.py new file mode 100644 index 00000000..219f3c21 --- /dev/null +++ b/evaluation/tests/test_test_lerobot_eval.py @@ -0,0 +1,246 @@ +"""Unit tests for ``sil/scripts/test-lerobot-eval.py``.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") + +# Stub heavy/native deps before loading the script. +if "pyarrow" not in sys.modules: + _pa = types.ModuleType("pyarrow") + _pa_pq = types.ModuleType("pyarrow.parquet") + _pa_pq.read_table = MagicMock() + _pa.parquet = _pa_pq # type: ignore[attr-defined] + sys.modules["pyarrow"] = _pa + sys.modules["pyarrow.parquet"] = _pa_pq + +if "av" not in sys.modules: + sys.modules["av"] = types.ModuleType("av") + +# lerobot.* imports happen inside run_inference_test only; ensure stubs exist. 
+for _name in ("lerobot", "lerobot.policies", "lerobot.policies.act", "lerobot.processor"): + sys.modules.setdefault(_name, types.ModuleType(_name)) +sys.modules.setdefault("lerobot.policies.act.modeling_act", types.ModuleType("lerobot.policies.act.modeling_act")) +sys.modules.setdefault("lerobot.processor.pipeline", types.ModuleType("lerobot.processor.pipeline")) + +_SCRIPT = Path(__file__).resolve().parents[1] / "sil" / "scripts" / "test-lerobot-eval.py" +_spec = importlib.util.spec_from_file_location("test_lerobot_eval_script", _SCRIPT) +assert _spec and _spec.loader +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) + + +# --------------------------------------------------------------------------- +# Pure helpers +# --------------------------------------------------------------------------- + + +class TestResolveDevice: + def test_cuda_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + assert _mod.resolve_device("cuda") == "cuda" + + def test_cuda_falls_back_to_mps(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _mod.resolve_device("cuda") == "mps" + + def test_mps_when_available(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: True) + assert _mod.resolve_device("mps") == "mps" + + def test_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.backends.mps, "is_available", lambda: False) + assert _mod.resolve_device("cuda") == "cpu" + + def test_cpu_explicit(self) -> None: + assert _mod.resolve_device("cpu") == "cpu" + + +class TestBuildObservation: + def 
test_returns_expected_keys_and_shapes(self) -> None: + state = np.zeros(6, dtype=np.float32) + image = np.zeros((4, 4, 3), dtype=np.uint8) + obs = _mod.build_observation(state, image) + assert set(obs.keys()) == {"observation.state", "observation.images.color"} + assert obs["observation.state"].shape == (6,) + assert obs["observation.images.color"].shape == (3, 4, 4) + assert float(obs["observation.images.color"].max()) <= 1.0 + + +class TestLoadEpisodeData: + def test_returns_dict_of_columns(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + table = MagicMock() + table.column_names = ["timestamp", "action"] + table.__getitem__.side_effect = lambda k: MagicMock(to_pylist=lambda: [1, 2, 3]) + monkeypatch.setattr(_mod.pq, "read_table", lambda _p: table) + result = _mod.load_episode_data(str(tmp_path), 0) + assert result == {"timestamp": [1, 2, 3], "action": [1, 2, 3]} + + +class TestLoadVideoFrame: + def _patch_av(self, monkeypatch: pytest.MonkeyPatch, frames: list[np.ndarray]) -> MagicMock: + av_mod = MagicMock() + container = MagicMock() + stream = MagicMock() + container.streams.video = [stream] + + def make_av_frame(arr: np.ndarray) -> MagicMock: + f = MagicMock() + f.to_ndarray.return_value = arr + return f + + container.decode.return_value = [make_av_frame(a) for a in frames] + av_mod.open.return_value = container + monkeypatch.setitem(sys.modules, "av", av_mod) + return container + + def test_returns_target_frame(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + frames = [np.full((2, 2, 3), i, dtype=np.uint8) for i in range(3)] + container = self._patch_av(monkeypatch, frames) + result = _mod.load_video_frame(str(tmp_path), 0, 1) + assert result[0, 0, 0] == 1 + container.close.assert_called() + + def test_missing_frame_raises(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + self._patch_av(monkeypatch, [np.zeros((2, 2, 3), dtype=np.uint8)]) + with pytest.raises(IndexError): + _mod.load_video_frame(str(tmp_path), 
0, 99) + + +# --------------------------------------------------------------------------- +# main / run_inference_test +# --------------------------------------------------------------------------- + + +def _make_args(**overrides) -> object: + defaults = dict( + policy_repo="repo", + dataset_dir="/tmp/ds", + episode=0, + start_frame=0, + num_steps=2, + device="cpu", + output=None, + ) + defaults.update(overrides) + return types.SimpleNamespace(**defaults) + + +def _setup_run_inference_test(monkeypatch: pytest.MonkeyPatch, tmp_path: Path, num_frames: int = 5) -> dict: + """Patch all heavy dependencies of run_inference_test and return the mocks.""" + # Mock ACTPolicy + PolicyProcessorPipeline via the modules already in sys.modules. + act_mod = sys.modules["lerobot.policies.act.modeling_act"] + pipeline_mod = sys.modules["lerobot.processor.pipeline"] + + policy = MagicMock() + # parameters() must yield tensor-likes with .numel() + param = MagicMock() + param.numel.return_value = 1_000_000 + policy.parameters.return_value = [param, param] + act_policy_cls = MagicMock() + act_policy_cls.from_pretrained.return_value = policy + monkeypatch.setattr(act_mod, "ACTPolicy", act_policy_cls, raising=False) + + preprocessor = MagicMock(side_effect=lambda x: x) + preprocessor.steps = [] + postprocessor = MagicMock(return_value={"action": torch.zeros(1, 6)}) + postprocessor.steps = [] + pipeline_cls = MagicMock() + pipeline_cls.from_pretrained.side_effect = [preprocessor, postprocessor] + monkeypatch.setattr(pipeline_mod, "PolicyProcessorPipeline", pipeline_cls, raising=False) + + # Dataset info file. + info = { + "fps": 30, + "features": { + "action": {"shape": [6]}, + "observation.state": {"shape": [6]}, + "observation.images.color": {"shape": [3, 480, 640]}, + }, + } + meta_dir = tmp_path / "meta" + meta_dir.mkdir() + (meta_dir / "info.json").write_text(json.dumps(info)) + + # Episode parquet data. 
+ ep_data = { + "timestamp": list(range(num_frames)), + "observation.state": [[0.0] * 6 for _ in range(num_frames)], + "action": [[0.1] * 6 for _ in range(num_frames)], + } + monkeypatch.setattr(_mod, "load_episode_data", lambda d, e: ep_data) + monkeypatch.setattr( + _mod, + "load_video_frame", + lambda d, e, f: np.zeros((480, 640, 3), dtype=np.uint8), + ) + + # Force device to cpu. + monkeypatch.setattr(_mod, "resolve_device", lambda r: "cpu") + return {"policy": policy, "preprocessor": preprocessor, "postprocessor": postprocessor} + + +class TestRunInferenceTest: + def test_completes_and_writes_output(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_inference_test(monkeypatch, tmp_path) + out_file = tmp_path / "preds.npz" + args = _make_args(dataset_dir=str(tmp_path), num_steps=10, output=str(out_file)) + _mod.run_inference_test(args) + assert out_file.exists() + + def test_no_output_skips_save(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_inference_test(monkeypatch, tmp_path) + args = _make_args(dataset_dir=str(tmp_path), num_steps=2, output=None) + _mod.run_inference_test(args) + + def test_warns_on_degenerate_outputs(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys) -> None: + mocks = _setup_run_inference_test(monkeypatch, tmp_path) + # Force NaN + Inf + zero-variance predictions. + bad = torch.full((1, 6), float("nan")) + bad[0, 0] = float("inf") + mocks["postprocessor"].return_value = {"action": bad} + args = _make_args(dataset_dir=str(tmp_path), num_steps=3) + _mod.run_inference_test(args) + out = capsys.readouterr().out + assert "NaN" in out + assert "Inf" in out + + def test_zero_variance_warning(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, capsys) -> None: + _setup_run_inference_test(monkeypatch, tmp_path) + # Default postprocessor returns zeros each step → zero variance. 
+ args = _make_args(dataset_dir=str(tmp_path), num_steps=3) + _mod.run_inference_test(args) + out = capsys.readouterr().out + assert "mode collapse" in out + + +class TestMain: + def test_missing_dataset_exits(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setattr(sys, "argv", ["test-lerobot-eval", "--dataset-dir", str(tmp_path / "missing")]) + with pytest.raises(SystemExit) as exc_info: + _mod.main() + assert exc_info.value.code == 1 + + def test_invokes_run_inference(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + _setup_run_inference_test(monkeypatch, tmp_path) + called = {} + + def fake_run(args): + called["args"] = args + + monkeypatch.setattr(_mod, "run_inference_test", fake_run) + monkeypatch.setattr(sys, "argv", ["test-lerobot-eval", "--dataset-dir", str(tmp_path), "--num-steps", "1"]) + _mod.main() + assert called["args"].dataset_dir == str(tmp_path) diff --git a/evaluation/tests/test_upload_artifacts.py b/evaluation/tests/test_upload_artifacts.py new file mode 100644 index 00000000..ca758ccc --- /dev/null +++ b/evaluation/tests/test_upload_artifacts.py @@ -0,0 +1,656 @@ +"""Unit tests for ``metrics.upload_artifacts``.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +from metrics.upload_artifacts import ( + get_video_search_paths, + load_metrics, + main, + upload_to_blob_fallback, + upload_to_mlflow, +) + + +class TestLoadMetrics: + def test_both_files_exist(self, tmp_path: Path) -> None: + onnx_payload = {"latency_ms": 12.3} + jit_payload = {"latency_ms": 45.6} + (tmp_path / "onnx_metrics.json").write_text(json.dumps(onnx_payload)) + (tmp_path / "jit_metrics.json").write_text(json.dumps(jit_payload)) + + onnx, jit = load_metrics(tmp_path) + + assert onnx == onnx_payload + assert jit == jit_payload + + def test_only_onnx(self, tmp_path: Path) -> None: + (tmp_path / "onnx_metrics.json").write_text(json.dumps({"a": 1})) 
+ + onnx, jit = load_metrics(tmp_path) + + assert onnx == {"a": 1} + assert jit == {} + + def test_only_jit(self, tmp_path: Path) -> None: + (tmp_path / "jit_metrics.json").write_text(json.dumps({"b": 2})) + + onnx, jit = load_metrics(tmp_path) + + assert onnx == {} + assert jit == {"b": 2} + + def test_neither_exists(self, tmp_path: Path) -> None: + onnx, jit = load_metrics(tmp_path) + + assert onnx == {} + assert jit == {} + + +class TestGetVideoSearchPaths: + def test_returns_five_paths(self, tmp_path: Path) -> None: + paths = get_video_search_paths(tmp_path / "export") + assert len(paths) == 5 + assert all(isinstance(p, Path) for p in paths) + + def test_first_path_is_relative(self, tmp_path: Path) -> None: + export_dir = tmp_path / "export" + paths = get_video_search_paths(export_dir) + assert paths[0] == export_dir / "videos" + + def test_parent_videos_path(self, tmp_path: Path) -> None: + export_dir = tmp_path / "nested" / "export" + paths = get_video_search_paths(export_dir) + assert paths[1] == tmp_path / "nested" / "videos" + + +class TestUploadToMlflow: + @staticmethod + def _inject_mock_modules(monkeypatch): + """Inject mock mlflow and training.utils into sys.modules.""" + mock_mlflow = MagicMock() + mock_run = MagicMock() + mock_run.info.run_id = "run-123" + mock_mlflow.start_run.return_value.__enter__ = MagicMock(return_value=mock_run) + mock_mlflow.start_run.return_value.__exit__ = MagicMock(return_value=False) + + config_error = type("AzureConfigError", (RuntimeError,), {}) + mock_utils = MagicMock() + mock_utils.AzureConfigError = config_error + + context = MagicMock() + context.workspace_name = "ws" + context.tracking_uri = "https://tracking" + context.storage = None + mock_utils.bootstrap_azure_ml.return_value = context + + monkeypatch.setitem(sys.modules, "mlflow", mock_mlflow) + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", mock_utils) + + return mock_mlflow, mock_utils, 
config_error + + def test_import_error_returns_false(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setitem(sys.modules, "mlflow", None) + result = upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="", + onnx_success=False, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is False + + def test_azure_config_error_returns_false(self, tmp_path: Path, monkeypatch) -> None: + _, mock_utils, config_error = self._inject_mock_modules(monkeypatch) + mock_utils.bootstrap_azure_ml.side_effect = config_error("nope") + result = upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="", + onnx_success=False, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is False + + def test_connection_error_returns_false(self, tmp_path: Path, monkeypatch) -> None: + self._inject_mock_modules(monkeypatch) + sys.modules["training.utils"].bootstrap_azure_ml.side_effect = ConnectionError("refused") + result = upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="", + onnx_success=False, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is False + + def test_success_returns_true(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch) + result = upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is True + mock_mlflow.start_run.assert_called_once() + mock_mlflow.set_tags.assert_called_once() + 
mock_mlflow.log_params.assert_called_once() + + def test_onnx_metrics_logged(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "onnx") + + mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch) + upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=False, + onnx_metrics={"mean_episode_reward": 5.0, "total_episodes": 10}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert mock_mlflow.log_metrics.call_count >= 2 + mock_mlflow.log_artifact.assert_called() + + def test_storage_upload_called(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + _, mock_utils, _ = self._inject_mock_modules(monkeypatch) + mock_storage = MagicMock() + mock_utils.bootstrap_azure_ml.return_value.storage = mock_storage + + (tmp_path / "policy.onnx").write_bytes(b"data") + upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + mock_storage.upload_files_batch.assert_called_once() + + def test_storage_no_files_to_upload(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + _, mock_utils, _ = self._inject_mock_modules(monkeypatch) + mock_storage = MagicMock() + mock_utils.bootstrap_azure_ml.return_value.storage = mock_storage + + export_dir = tmp_path / "empty_export" + export_dir.mkdir() + result = upload_to_mlflow( + task="t", + export_dir=export_dir, + 
metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=False, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is True + mock_storage.upload_files_batch.assert_not_called() + + def test_jit_metrics_logged(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "jit") + + mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch) + upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=False, + jit_success=True, + onnx_metrics={}, + jit_metrics={"mean_episode_reward": 7.0, "total_episodes": 5}, + timestamp="20240101_000000", + ) + + logged_keys = set() + for call in mock_mlflow.log_metrics.call_args_list: + logged_keys.update(call[0][0]) + assert "jit/mean_episode_reward" in logged_keys + mock_mlflow.log_artifact.assert_called() + + def test_video_path_classification(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + export_dir = tmp_path / "export" + videos_dir = export_dir / "videos" + videos_dir.mkdir(parents=True) + (videos_dir / "onnx_run.mp4").write_bytes(b"\x00") + (videos_dir / "jit_run.mp4").write_bytes(b"\x00") + (videos_dir / "general_run.mp4").write_bytes(b"\x00") + + mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch) + upload_to_mlflow( + task="t", + export_dir=export_dir, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=True, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + + artifact_paths = [c.kwargs["artifact_path"] for c in mock_mlflow.log_artifact.call_args_list] + assert "videos/onnx" in artifact_paths + assert "videos/jit" in artifact_paths 
+ assert "videos" in artifact_paths + + def test_storage_upload_includes_videos(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + _, mock_utils, _ = self._inject_mock_modules(monkeypatch) + mock_storage = MagicMock() + mock_utils.bootstrap_azure_ml.return_value.storage = mock_storage + + export_dir = tmp_path / "export" + export_dir.mkdir() + (export_dir / "policy.onnx").write_bytes(b"data") + videos_dir = export_dir / "videos" + videos_dir.mkdir() + (videos_dir / "test.mp4").write_bytes(b"\x00") + + upload_to_mlflow( + task="t", + export_dir=export_dir, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + + files = mock_storage.upload_files_batch.call_args[0][0] + blob_names = [f[1] for f in files] + assert any("models/policy.onnx" in n for n in blob_names) + assert any("videos/test.mp4" in n for n in blob_names) + + def test_mlflow_run_exception_returns_false(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("NUM_ENVS", "4") + monkeypatch.setenv("MAX_STEPS", "500") + monkeypatch.setenv("VIDEO_LENGTH", "200") + monkeypatch.setenv("INFERENCE_FORMAT", "both") + + mock_mlflow, _, _ = self._inject_mock_modules(monkeypatch) + mock_mlflow.start_run.side_effect = RuntimeError("boom") + + result = upload_to_mlflow( + task="t", + export_dir=tmp_path, + metrics_dir=tmp_path, + checkpoint_uri="uri", + onnx_success=True, + jit_success=False, + onnx_metrics={}, + jit_metrics={}, + timestamp="20240101_000000", + ) + assert result is False + + +class TestUploadToBlobFallback: + @staticmethod + def _inject_azure_mocks(monkeypatch): + """Inject mock azure.identity and azure.storage.blob into sys.modules.""" + mock_credential = MagicMock() + mock_identity = MagicMock() + 
mock_identity.DefaultAzureCredential.return_value = mock_credential + + mock_container = MagicMock() + mock_blob_service = MagicMock() + mock_blob_service.get_container_client.return_value = mock_container + + mock_blob = MagicMock() + mock_blob.BlobServiceClient.return_value = mock_blob_service + + monkeypatch.setitem(sys.modules, "azure", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.identity", mock_identity) + monkeypatch.setitem(sys.modules, "azure.storage", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.storage.blob", mock_blob) + + return mock_blob, mock_container + + def test_no_storage_returns_false(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.delenv("AZURE_STORAGE_ACCOUNT_NAME", raising=False) + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="", + blob_container="", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is False + + def test_import_error_returns_false(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setitem(sys.modules, "azure", None) + monkeypatch.setitem(sys.modules, "azure.identity", None) + monkeypatch.setitem(sys.modules, "azure.storage", None) + monkeypatch.setitem(sys.modules, "azure.storage.blob", None) + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="myaccount", + blob_container="mycontainer", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is False + + def test_url_parsing_extracts_account(self, tmp_path: Path, monkeypatch) -> None: + mock_blob, _ = self._inject_azure_mocks(monkeypatch) + upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="", + blob_container="", + checkpoint_uri="https://myaccount.blob.core.windows.net/mycontainer/path/model.pt", + timestamp="20240101_000000", + ) + call_args = str(mock_blob.BlobServiceClient.call_args) + assert "myaccount" in call_args + + def test_https_url_non_blob_falls_through(self, tmp_path: Path, monkeypatch) -> 
None: + monkeypatch.delenv("AZURE_STORAGE_ACCOUNT_NAME", raising=False) + monkeypatch.delenv("AZURE_STORAGE_CONTAINER_NAME", raising=False) + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="", + blob_container="", + checkpoint_uri="https://example.com/not-a-blob/file", + timestamp="20240101_000000", + ) + assert result is False + + def test_env_var_fallback(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setenv("AZURE_STORAGE_ACCOUNT_NAME", "envaccount") + monkeypatch.setenv("AZURE_STORAGE_CONTAINER_NAME", "envcontainer") + mock_blob, _ = self._inject_azure_mocks(monkeypatch) + upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="", + blob_container="", + checkpoint_uri="", + timestamp="20240101_000000", + ) + call_args = str(mock_blob.BlobServiceClient.call_args) + assert "envaccount" in call_args + + def test_success_with_policy_files(self, tmp_path: Path, monkeypatch) -> None: + (tmp_path / "policy.onnx").write_bytes(b"model-data") + _, mock_container = self._inject_azure_mocks(monkeypatch) + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="acct", + blob_container="ctr", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is True + mock_container.upload_blob.assert_called() + + def test_no_files_returns_false(self, tmp_path: Path, monkeypatch) -> None: + self._inject_azure_mocks(monkeypatch) + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="acct", + blob_container="ctr", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is False + + def test_per_file_upload_exception_continues(self, tmp_path: Path, monkeypatch) -> None: + (tmp_path / "policy.onnx").write_bytes(b"model-onnx") + (tmp_path / "policy.jit").write_bytes(b"model-jit") + _, mock_container = self._inject_azure_mocks(monkeypatch) + mock_container.upload_blob.side_effect = [Exception("fail"), None] + + result = 
upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="acct", + blob_container="ctr", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is True + assert mock_container.upload_blob.call_count == 2 + + def test_video_upload_success_and_exception(self, tmp_path: Path, monkeypatch) -> None: + (tmp_path / "policy.onnx").write_bytes(b"model-data") + videos_dir = tmp_path / "videos" + videos_dir.mkdir() + (videos_dir / "ep1.mp4").write_bytes(b"video-1") + (videos_dir / "ep2.mp4").write_bytes(b"video-2") + _, mock_container = self._inject_azure_mocks(monkeypatch) + + # First call (model) succeeds, then one video succeeds and one raises. + mock_container.upload_blob.side_effect = [None, None, Exception("video-fail")] + + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="acct", + blob_container="ctr", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is True + # 1 model upload + 2 video upload attempts. 
+ assert mock_container.upload_blob.call_count == 3 + blob_names = [call.kwargs.get("name", "") for call in mock_container.upload_blob.call_args_list] + assert any("videos/ep1.mp4" in name for name in blob_names) + assert any("videos/ep2.mp4" in name for name in blob_names) + + def test_credential_exception_returns_false(self, tmp_path: Path, monkeypatch) -> None: + mock_identity = MagicMock() + mock_identity.DefaultAzureCredential.side_effect = Exception("auth failed") + monkeypatch.setitem(sys.modules, "azure", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.identity", mock_identity) + monkeypatch.setitem(sys.modules, "azure.storage", MagicMock()) + monkeypatch.setitem(sys.modules, "azure.storage.blob", MagicMock()) + + result = upload_to_blob_fallback( + task="t", + export_dir=tmp_path, + blob_account="acct", + blob_container="ctr", + checkpoint_uri="", + timestamp="20240101_000000", + ) + assert result is False + + +class TestMain: + """Tests for the main() entry point.""" + + @staticmethod + def _env_vars(tmp_path: Path) -> dict[str, str]: + return { + "TASK": "pick_place", + "EXPORT_DIR": str(tmp_path / "exported"), + "METRICS_DIR": str(tmp_path / "metrics"), + "ONNX_SUCCESS": "1", + "JIT_SUCCESS": "0", + "NUM_ENVS": "4", + "MAX_STEPS": "500", + "VIDEO_LENGTH": "200", + "INFERENCE_FORMAT": "both", + "CHECKPOINT_URI": "https://example.com/model.pt", + "BLOB_STORAGE_ACCOUNT": "acct", + "BLOB_CONTAINER": "ctr", + } + + def test_mlflow_success_skips_blob_fallback(self, tmp_path: Path, monkeypatch) -> None: + mock_set_defaults = MagicMock() + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=mock_set_defaults)) + + for key, val in self._env_vars(tmp_path).items(): + monkeypatch.setenv(key, val) + + with ( + patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})) as mock_load, + patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True) as 
mock_mlflow, + patch("metrics.upload_artifacts.upload_to_blob_fallback") as mock_blob, + ): + main() + + mock_set_defaults.assert_called_once() + mock_load.assert_called_once() + mock_mlflow.assert_called_once() + mock_blob.assert_not_called() + + def test_mlflow_failure_triggers_blob_fallback(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock())) + + for key, val in self._env_vars(tmp_path).items(): + monkeypatch.setenv(key, val) + + with ( + patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})), + patch("metrics.upload_artifacts.upload_to_mlflow", return_value=False), + patch("metrics.upload_artifacts.upload_to_blob_fallback") as mock_blob, + ): + main() + + mock_blob.assert_called_once() + call_kwargs = mock_blob.call_args[1] + assert call_kwargs["task"] == "pick_place" + assert call_kwargs["blob_account"] == "acct" + assert call_kwargs["blob_container"] == "ctr" + + def test_env_defaults_passed_correctly(self, tmp_path: Path, monkeypatch) -> None: + mock_set_defaults = MagicMock() + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=mock_set_defaults)) + + for key, val in self._env_vars(tmp_path).items(): + monkeypatch.setenv(key, val) + + with ( + patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})), + patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True), + ): + main() + + defaults = mock_set_defaults.call_args[0][0] + assert defaults["TASK"] == "unknown" + assert defaults["EXPORT_DIR"] == "/tmp/exported" + assert defaults["NUM_ENVS"] == "4" + + def test_onnx_and_jit_flags_parsed(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock())) + + env = 
self._env_vars(tmp_path) + env["ONNX_SUCCESS"] = "1" + env["JIT_SUCCESS"] = "1" + for key, val in env.items(): + monkeypatch.setenv(key, val) + + with ( + patch("metrics.upload_artifacts.load_metrics", return_value=({"a": 1}, {"b": 2})), + patch("metrics.upload_artifacts.upload_to_mlflow", return_value=True) as mock_mlflow, + ): + main() + + call_kwargs = mock_mlflow.call_args[1] + assert call_kwargs["onnx_success"] is True + assert call_kwargs["jit_success"] is True + assert call_kwargs["onnx_metrics"] == {"a": 1} + assert call_kwargs["jit_metrics"] == {"b": 2} + + def test_optional_env_vars_default_to_empty(self, tmp_path: Path, monkeypatch) -> None: + monkeypatch.setitem(sys.modules, "training", MagicMock()) + monkeypatch.setitem(sys.modules, "training.utils", MagicMock(set_env_defaults=MagicMock())) + + env = self._env_vars(tmp_path) + del env["CHECKPOINT_URI"] + del env["BLOB_STORAGE_ACCOUNT"] + del env["BLOB_CONTAINER"] + del env["METRICS_DIR"] + monkeypatch.delenv("CHECKPOINT_URI", raising=False) + monkeypatch.delenv("BLOB_STORAGE_ACCOUNT", raising=False) + monkeypatch.delenv("BLOB_CONTAINER", raising=False) + monkeypatch.delenv("METRICS_DIR", raising=False) + for key, val in env.items(): + monkeypatch.setenv(key, val) + + with ( + patch("metrics.upload_artifacts.load_metrics", return_value=({}, {})), + patch("metrics.upload_artifacts.upload_to_mlflow", return_value=False), + patch("metrics.upload_artifacts.upload_to_blob_fallback") as mock_blob, + ): + main() + + call_kwargs = mock_blob.call_args[1] + assert call_kwargs["checkpoint_uri"] == "" + assert call_kwargs["blob_account"] == "" + assert call_kwargs["blob_container"] == "" diff --git a/evaluation/uv.lock b/evaluation/uv.lock index 6ef6c0f3..c21f2286 100644 --- a/evaluation/uv.lock +++ b/evaluation/uv.lock @@ -698,6 +698,90 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash 
= "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" }, ] +[[package]] +name = "coverage" +version = "7.13.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" }, + { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" }, + { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" }, + { url = "https://files.pythonhosted.org/packages/8c/49/cd14b789536ac6a4778c453c6a2338bc0a2fb60c5a5a41b4008328b9acc1/coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5", size = 254159, upload-time = "2026-03-17T10:30:47.204Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/00/7b0edcfe64e2ed4c0340dac14a52ad0f4c9bd0b8b5e531af7d55b703db7c/coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376", size = 255270, upload-time = "2026-03-17T10:30:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/93/89/7ffc4ba0f5d0a55c1e84ea7cee39c9fc06af7b170513d83fbf3bbefce280/coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256", size = 257538, upload-time = "2026-03-17T10:30:50.77Z" }, + { url = "https://files.pythonhosted.org/packages/81/bd/73ddf85f93f7e6fa83e77ccecb6162d9415c79007b4bc124008a4995e4a7/coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c", size = 251821, upload-time = "2026-03-17T10:30:52.5Z" }, + { url = "https://files.pythonhosted.org/packages/a0/81/278aff4e8dec4926a0bcb9486320752811f543a3ce5b602cc7a29978d073/coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5", size = 253191, upload-time = "2026-03-17T10:30:54.543Z" }, + { url = "https://files.pythonhosted.org/packages/70/ee/fe1621488e2e0a58d7e94c4800f0d96f79671553488d401a612bebae324b/coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09", size = 251337, upload-time = "2026-03-17T10:30:56.663Z" }, + { url = "https://files.pythonhosted.org/packages/37/a6/f79fb37aa104b562207cc23cb5711ab6793608e246cae1e93f26b2236ed9/coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9", size = 255404, upload-time = "2026-03-17T10:30:58.427Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/f0/ed15262a58ec81ce457ceb717b7f78752a1713556b19081b76e90896e8d4/coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf", size = 250903, upload-time = "2026-03-17T10:31:00.093Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e9/9129958f20e7e9d4d56d51d42ccf708d15cac355ff4ac6e736e97a9393d2/coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c", size = 252780, upload-time = "2026-03-17T10:31:01.916Z" }, + { url = "https://files.pythonhosted.org/packages/a4/d7/0ad9b15812d81272db94379fe4c6df8fd17781cc7671fdfa30c76ba5ff7b/coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf", size = 222093, upload-time = "2026-03-17T10:31:03.642Z" }, + { url = "https://files.pythonhosted.org/packages/29/3d/821a9a5799fac2556bcf0bd37a70d1d11fa9e49784b6d22e92e8b2f85f18/coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810", size = 222900, upload-time = "2026-03-17T10:31:05.651Z" }, + { url = "https://files.pythonhosted.org/packages/d4/fa/2238c2ad08e35cf4f020ea721f717e09ec3152aea75d191a7faf3ef009a8/coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de", size = 221515, upload-time = "2026-03-17T10:31:07.293Z" }, + { url = "https://files.pythonhosted.org/packages/74/8c/74fedc9663dcf168b0a059d4ea756ecae4da77a489048f94b5f512a8d0b3/coverage-7.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ec4af212df513e399cf11610cc27063f1586419e814755ab362e50a85ea69c1", size = 219576, upload-time = "2026-03-17T10:31:09.045Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/c9/44fb661c55062f0818a6ffd2685c67aa30816200d5f2817543717d4b92eb/coverage-7.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:941617e518602e2d64942c88ec8499f7fbd49d3f6c4327d3a71d43a1973032f3", size = 219942, upload-time = "2026-03-17T10:31:10.708Z" }, + { url = "https://files.pythonhosted.org/packages/5f/13/93419671cee82b780bab7ea96b67c8ef448f5f295f36bf5031154ec9a790/coverage-7.13.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:da305e9937617ee95c2e39d8ff9f040e0487cbf1ac174f777ed5eddd7a7c1f26", size = 250935, upload-time = "2026-03-17T10:31:12.392Z" }, + { url = "https://files.pythonhosted.org/packages/ac/68/1666e3a4462f8202d836920114fa7a5ee9275d1fa45366d336c551a162dd/coverage-7.13.5-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:78e696e1cc714e57e8b25760b33a8b1026b7048d270140d25dafe1b0a1ee05a3", size = 253541, upload-time = "2026-03-17T10:31:14.247Z" }, + { url = "https://files.pythonhosted.org/packages/4e/5e/3ee3b835647be646dcf3c65a7c6c18f87c27326a858f72ab22c12730773d/coverage-7.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02ca0eed225b2ff301c474aeeeae27d26e2537942aa0f87491d3e147e784a82b", size = 254780, upload-time = "2026-03-17T10:31:16.193Z" }, + { url = "https://files.pythonhosted.org/packages/44/b3/cb5bd1a04cfcc49ede6cd8409d80bee17661167686741e041abc7ee1b9a9/coverage-7.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:04690832cbea4e4663d9149e05dba142546ca05cb1848816760e7f58285c970a", size = 256912, upload-time = "2026-03-17T10:31:17.89Z" }, + { url = "https://files.pythonhosted.org/packages/1b/66/c1dceb7b9714473800b075f5c8a84f4588f887a90eb8645282031676e242/coverage-7.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:0590e44dd2745c696a778f7bab6aa95256de2cbc8b8cff4f7db8ff09813d6969", size = 251165, upload-time = "2026-03-17T10:31:19.605Z" }, + { url = "https://files.pythonhosted.org/packages/b7/62/5502b73b97aa2e53ea22a39cf8649ff44827bef76d90bf638777daa27a9d/coverage-7.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d7cfad2d6d81dd298ab6b89fe72c3b7b05ec7544bdda3b707ddaecff8d25c161", size = 252908, upload-time = "2026-03-17T10:31:21.312Z" }, + { url = "https://files.pythonhosted.org/packages/7d/37/7792c2d69854397ca77a55c4646e5897c467928b0e27f2d235d83b5d08c6/coverage-7.13.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e092b9499de38ae0fbfbc603a74660eb6ff3e869e507b50d85a13b6db9863e15", size = 250873, upload-time = "2026-03-17T10:31:23.565Z" }, + { url = "https://files.pythonhosted.org/packages/a3/23/bc866fb6163be52a8a9e5d708ba0d3b1283c12158cefca0a8bbb6e247a43/coverage-7.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:48c39bc4a04d983a54a705a6389512883d4a3b9862991b3617d547940e9f52b1", size = 255030, upload-time = "2026-03-17T10:31:25.58Z" }, + { url = "https://files.pythonhosted.org/packages/7d/8b/ef67e1c222ef49860701d346b8bbb70881bef283bd5f6cbba68a39a086c7/coverage-7.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2d3807015f138ffea1ed9afeeb8624fd781703f2858b62a8dd8da5a0994c57b6", size = 250694, upload-time = "2026-03-17T10:31:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/46/0d/866d1f74f0acddbb906db212e096dee77a8e2158ca5e6bb44729f9d93298/coverage-7.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee2aa19e03161671ec964004fb74b2257805d9710bf14a5c704558b9d8dbaf17", size = 252469, upload-time = "2026-03-17T10:31:29.472Z" }, + { url = "https://files.pythonhosted.org/packages/7a/f5/be742fec31118f02ce42b21c6af187ad6a344fed546b56ca60caacc6a9a0/coverage-7.13.5-cp313-cp313-win32.whl", hash = "sha256:ce1998c0483007608c8382f4ff50164bfc5bd07a2246dd272aa4043b75e61e85", size = 222112, upload-time = "2026-03-17T10:31:31.526Z" 
}, + { url = "https://files.pythonhosted.org/packages/66/40/7732d648ab9d069a46e686043241f01206348e2bbf128daea85be4d6414b/coverage-7.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:631efb83f01569670a5e866ceb80fe483e7c159fac6f167e6571522636104a0b", size = 222923, upload-time = "2026-03-17T10:31:33.633Z" }, + { url = "https://files.pythonhosted.org/packages/48/af/fea819c12a095781f6ccd504890aaddaf88b8fab263c4940e82c7b770124/coverage-7.13.5-cp313-cp313-win_arm64.whl", hash = "sha256:f4cd16206ad171cbc2470dbea9103cf9a7607d5fe8c242fdf1edf36174020664", size = 221540, upload-time = "2026-03-17T10:31:35.445Z" }, + { url = "https://files.pythonhosted.org/packages/23/d2/17879af479df7fbbd44bd528a31692a48f6b25055d16482fdf5cdb633805/coverage-7.13.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0428cbef5783ad91fe240f673cc1f76b25e74bbfe1a13115e4aa30d3f538162d", size = 220262, upload-time = "2026-03-17T10:31:37.184Z" }, + { url = "https://files.pythonhosted.org/packages/5b/4c/d20e554f988c8f91d6a02c5118f9abbbf73a8768a3048cb4962230d5743f/coverage-7.13.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e0b216a19534b2427cc201a26c25da4a48633f29a487c61258643e89d28200c0", size = 220617, upload-time = "2026-03-17T10:31:39.245Z" }, + { url = "https://files.pythonhosted.org/packages/29/9c/f9f5277b95184f764b24e7231e166dfdb5780a46d408a2ac665969416d61/coverage-7.13.5-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:972a9cd27894afe4bc2b1480107054e062df08e671df7c2f18c205e805ccd806", size = 261912, upload-time = "2026-03-17T10:31:41.324Z" }, + { url = "https://files.pythonhosted.org/packages/d5/f6/7f1ab39393eeb50cfe4747ae8ef0e4fc564b989225aa1152e13a180d74f8/coverage-7.13.5-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4b59148601efcd2bac8c4dbf1f0ad6391693ccf7a74b8205781751637076aee3", size = 263987, upload-time = "2026-03-17T10:31:43.724Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/d7/62c084fb489ed9c6fbdf57e006752e7c516ea46fd690e5ed8b8617c7d52e/coverage-7.13.5-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:505d7083c8b0c87a8fa8c07370c285847c1f77739b22e299ad75a6af6c32c5c9", size = 266416, upload-time = "2026-03-17T10:31:45.769Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f6/df63d8660e1a0bff6125947afda112a0502736f470d62ca68b288ea762d8/coverage-7.13.5-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:60365289c3741e4db327e7baff2a4aaacf22f788e80fa4683393891b70a89fbd", size = 267558, upload-time = "2026-03-17T10:31:48.293Z" }, + { url = "https://files.pythonhosted.org/packages/5b/02/353ca81d36779bd108f6d384425f7139ac3c58c750dcfaafe5d0bee6436b/coverage-7.13.5-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b88c69c8ef5d4b6fe7dea66d6636056a0f6a7527c440e890cf9259011f5e606", size = 261163, upload-time = "2026-03-17T10:31:50.125Z" }, + { url = "https://files.pythonhosted.org/packages/2c/16/2e79106d5749bcaf3aee6d309123548e3276517cd7851faa8da213bc61bf/coverage-7.13.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5b13955d31d1633cf9376908089b7cebe7d15ddad7aeaabcbe969a595a97e95e", size = 263981, upload-time = "2026-03-17T10:31:51.961Z" }, + { url = "https://files.pythonhosted.org/packages/29/c7/c29e0c59ffa6942030ae6f50b88ae49988e7e8da06de7ecdbf49c6d4feae/coverage-7.13.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f70c9ab2595c56f81a89620e22899eea8b212a4041bd728ac6f4a28bf5d3ddd0", size = 261604, upload-time = "2026-03-17T10:31:53.872Z" }, + { url = "https://files.pythonhosted.org/packages/40/48/097cdc3db342f34006a308ab41c3a7c11c3f0d84750d340f45d88a782e00/coverage-7.13.5-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:084b84a8c63e8d6fc7e3931b316a9bcafca1458d753c539db82d31ed20091a87", size = 265321, upload-time = "2026-03-17T10:31:55.997Z" }, 
+ { url = "https://files.pythonhosted.org/packages/bb/1f/4994af354689e14fd03a75f8ec85a9a68d94e0188bbdab3fc1516b55e512/coverage-7.13.5-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad14385487393e386e2ea988b09d62dd42c397662ac2dabc3832d71253eee479", size = 260502, upload-time = "2026-03-17T10:31:58.308Z" }, + { url = "https://files.pythonhosted.org/packages/22/c6/9bb9ef55903e628033560885f5c31aa227e46878118b63ab15dc7ba87797/coverage-7.13.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f2c47b36fe7709a6e83bfadf4eefb90bd25fbe4014d715224c4316f808e59a2", size = 262688, upload-time = "2026-03-17T10:32:00.141Z" }, + { url = "https://files.pythonhosted.org/packages/14/4f/f5df9007e50b15e53e01edea486814783a7f019893733d9e4d6caad75557/coverage-7.13.5-cp313-cp313t-win32.whl", hash = "sha256:67e9bc5449801fad0e5dff329499fb090ba4c5800b86805c80617b4e29809b2a", size = 222788, upload-time = "2026-03-17T10:32:02.246Z" }, + { url = "https://files.pythonhosted.org/packages/e1/98/aa7fccaa97d0f3192bec013c4e6fd6d294a6ed44b640e6bb61f479e00ed5/coverage-7.13.5-cp313-cp313t-win_amd64.whl", hash = "sha256:da86cdcf10d2519e10cabb8ac2de03da1bcb6e4853790b7fbd48523332e3a819", size = 223851, upload-time = "2026-03-17T10:32:04.416Z" }, + { url = "https://files.pythonhosted.org/packages/3d/8b/e5c469f7352651e5f013198e9e21f97510b23de957dd06a84071683b4b60/coverage-7.13.5-cp313-cp313t-win_arm64.whl", hash = "sha256:0ecf12ecb326fe2c339d93fc131816f3a7367d223db37817208905c89bded911", size = 222104, upload-time = "2026-03-17T10:32:06.65Z" }, + { url = "https://files.pythonhosted.org/packages/8e/77/39703f0d1d4b478bfd30191d3c14f53caf596fac00efb3f8f6ee23646439/coverage-7.13.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fbabfaceaeb587e16f7008f7795cd80d20ec548dc7f94fbb0d4ec2e038ce563f", size = 219621, upload-time = "2026-03-17T10:32:08.589Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/3e/51dff36d99ae14639a133d9b164d63e628532e2974d8b1edb99dd1ebc733/coverage-7.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9bb2a28101a443669a423b665939381084412b81c3f8c0fcfbac57f4e30b5b8e", size = 219953, upload-time = "2026-03-17T10:32:10.507Z" }, + { url = "https://files.pythonhosted.org/packages/6a/6c/1f1917b01eb647c2f2adc9962bd66c79eb978951cab61bdc1acab3290c07/coverage-7.13.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bd3a2fbc1c6cccb3c5106140d87cc6a8715110373ef42b63cf5aea29df8c217a", size = 250992, upload-time = "2026-03-17T10:32:12.41Z" }, + { url = "https://files.pythonhosted.org/packages/22/e5/06b1f88f42a5a99df42ce61208bdec3bddb3d261412874280a19796fc09c/coverage-7.13.5-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6c36ddb64ed9d7e496028d1d00dfec3e428e0aabf4006583bb1839958d280510", size = 253503, upload-time = "2026-03-17T10:32:14.449Z" }, + { url = "https://files.pythonhosted.org/packages/80/28/2a148a51e5907e504fa7b85490277734e6771d8844ebcc48764a15e28155/coverage-7.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:380e8e9084d8eb38db3a9176a1a4f3c0082c3806fa0dc882d1d87abc3c789247", size = 254852, upload-time = "2026-03-17T10:32:16.56Z" }, + { url = "https://files.pythonhosted.org/packages/61/77/50e8d3d85cc0b7ebe09f30f151d670e302c7ff4a1bf6243f71dd8b0981fa/coverage-7.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e808af52a0513762df4d945ea164a24b37f2f518cbe97e03deaa0ee66139b4d6", size = 257161, upload-time = "2026-03-17T10:32:19.004Z" }, + { url = "https://files.pythonhosted.org/packages/3b/c4/b5fd1d4b7bf8d0e75d997afd3925c59ba629fc8616f1b3aae7605132e256/coverage-7.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:e301d30dd7e95ae068671d746ba8c34e945a82682e62918e41b2679acd2051a0", size = 251021, upload-time = "2026-03-17T10:32:21.344Z" }, + { url = "https://files.pythonhosted.org/packages/f8/66/6ea21f910e92d69ef0b1c3346ea5922a51bad4446c9126db2ae96ee24c4c/coverage-7.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:800bc829053c80d240a687ceeb927a94fd108bbdc68dfbe505d0d75ab578a882", size = 252858, upload-time = "2026-03-17T10:32:23.506Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ea/879c83cb5d61aa2a35fb80e72715e92672daef8191b84911a643f533840c/coverage-7.13.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:0b67af5492adb31940ee418a5a655c28e48165da5afab8c7fa6fd72a142f8740", size = 250823, upload-time = "2026-03-17T10:32:25.516Z" }, + { url = "https://files.pythonhosted.org/packages/8a/fb/616d95d3adb88b9803b275580bdeee8bd1b69a886d057652521f83d7322f/coverage-7.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c9136ff29c3a91e25b1d1552b5308e53a1e0653a23e53b6366d7c2dcbbaf8a16", size = 255099, upload-time = "2026-03-17T10:32:27.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/93/25e6917c90ec1c9a56b0b26f6cad6408e5f13bb6b35d484a0d75c9cf000d/coverage-7.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:cff784eef7f0b8f6cb28804fbddcfa99f89efe4cc35fb5627e3ac58f91ed3ac0", size = 250638, upload-time = "2026-03-17T10:32:29.914Z" }, + { url = "https://files.pythonhosted.org/packages/fc/7b/dc1776b0464145a929deed214aef9fb1493f159b59ff3c7eeeedf91eddd0/coverage-7.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:68a4953be99b17ac3c23b6efbc8a38330d99680c9458927491d18700ef23ded0", size = 252295, upload-time = "2026-03-17T10:32:31.981Z" }, + { url = "https://files.pythonhosted.org/packages/ea/fb/99cbbc56a26e07762a2740713f3c8f9f3f3106e3a3dd8cc4474954bccd34/coverage-7.13.5-cp314-cp314-win32.whl", hash = "sha256:35a31f2b1578185fbe6aa2e74cea1b1d0bbf4c552774247d9160d29b80ed56cc", size = 222360, upload-time = 
"2026-03-17T10:32:34.233Z" }, + { url = "https://files.pythonhosted.org/packages/8d/b7/4758d4f73fb536347cc5e4ad63662f9d60ba9118cb6785e9616b2ce5d7fa/coverage-7.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:2aa055ae1857258f9e0045be26a6d62bdb47a72448b62d7b55f4820f361a2633", size = 223174, upload-time = "2026-03-17T10:32:36.369Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f2/24d84e1dfe70f8ac9fdf30d338239860d0d1d5da0bda528959d0ebc9da28/coverage-7.13.5-cp314-cp314-win_arm64.whl", hash = "sha256:1b11eef33edeae9d142f9b4358edb76273b3bfd30bc3df9a4f95d0e49caf94e8", size = 221739, upload-time = "2026-03-17T10:32:38.736Z" }, + { url = "https://files.pythonhosted.org/packages/60/5b/4a168591057b3668c2428bff25dd3ebc21b629d666d90bcdfa0217940e84/coverage-7.13.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10a0c37f0b646eaff7cce1874c31d1f1ccb297688d4c747291f4f4c70741cc8b", size = 220351, upload-time = "2026-03-17T10:32:41.196Z" }, + { url = "https://files.pythonhosted.org/packages/f5/21/1fd5c4dbfe4a58b6b99649125635df46decdfd4a784c3cd6d410d303e370/coverage-7.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b5db73ba3c41c7008037fa731ad5459fc3944cb7452fc0aa9f822ad3533c583c", size = 220612, upload-time = "2026-03-17T10:32:43.204Z" }, + { url = "https://files.pythonhosted.org/packages/d6/fe/2a924b3055a5e7e4512655a9d4609781b0d62334fa0140c3e742926834e2/coverage-7.13.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:750db93a81e3e5a9831b534be7b1229df848b2e125a604fe6651e48aa070e5f9", size = 261985, upload-time = "2026-03-17T10:32:45.514Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0d/c8928f2bd518c45990fe1a2ab8db42e914ef9b726c975facc4282578c3eb/coverage-7.13.5-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9ddb4f4a5479f2539644be484da179b653273bca1a323947d48ab107b3ed1f29", size = 264107, upload-time = "2026-03-17T10:32:47.971Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/ae/4ae35bbd9a0af9d820362751f0766582833c211224b38665c0f8de3d487f/coverage-7.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8a7a2049c14f413163e2bdabd37e41179b1d1ccb10ffc6ccc4b7a718429c607", size = 266513, upload-time = "2026-03-17T10:32:50.1Z" }, + { url = "https://files.pythonhosted.org/packages/9c/20/d326174c55af36f74eac6ae781612d9492f060ce8244b570bb9d50d9d609/coverage-7.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1c85e0b6c05c592ea6d8768a66a254bfb3874b53774b12d4c89c481eb78cb90", size = 267650, upload-time = "2026-03-17T10:32:52.391Z" }, + { url = "https://files.pythonhosted.org/packages/7a/5e/31484d62cbd0eabd3412e30d74386ece4a0837d4f6c3040a653878bfc019/coverage-7.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:777c4d1eff1b67876139d24288aaf1817f6c03d6bae9c5cc8d27b83bcfe38fe3", size = 261089, upload-time = "2026-03-17T10:32:54.544Z" }, + { url = "https://files.pythonhosted.org/packages/e9/d8/49a72d6de146eebb0b7e48cc0f4bc2c0dd858e3d4790ab2b39a2872b62bd/coverage-7.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6697e29b93707167687543480a40f0db8f356e86d9f67ddf2e37e2dfd91a9dab", size = 263982, upload-time = "2026-03-17T10:32:56.803Z" }, + { url = "https://files.pythonhosted.org/packages/06/3b/0351f1bd566e6e4dd39e978efe7958bde1d32f879e85589de147654f57bb/coverage-7.13.5-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8fdf453a942c3e4d99bd80088141c4c6960bb232c409d9c3558e2dbaa3998562", size = 261579, upload-time = "2026-03-17T10:32:59.466Z" }, + { url = "https://files.pythonhosted.org/packages/5d/ce/796a2a2f4017f554d7810f5c573449b35b1e46788424a548d4d19201b222/coverage-7.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:32ca0c0114c9834a43f045a87dcebd69d108d8ffb666957ea65aa132f50332e2", size = 265316, upload-time = "2026-03-17T10:33:01.847Z" }, + 
{ url = "https://files.pythonhosted.org/packages/3d/16/d5ae91455541d1a78bc90abf495be600588aff8f6db5c8b0dae739fa39c9/coverage-7.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:8769751c10f339021e2638cd354e13adeac54004d1941119b2c96fe5276d45ea", size = 260427, upload-time = "2026-03-17T10:33:03.945Z" }, + { url = "https://files.pythonhosted.org/packages/48/11/07f413dba62db21fb3fad5d0de013a50e073cc4e2dc4306e770360f6dfc8/coverage-7.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cec2d83125531bd153175354055cdb7a09987af08a9430bd173c937c6d0fba2a", size = 262745, upload-time = "2026-03-17T10:33:06.285Z" }, + { url = "https://files.pythonhosted.org/packages/91/15/d792371332eb4663115becf4bad47e047d16234b1aff687b1b18c58d60ae/coverage-7.13.5-cp314-cp314t-win32.whl", hash = "sha256:0cd9ed7a8b181775459296e402ca4fb27db1279740a24e93b3b41942ebe4b215", size = 223146, upload-time = "2026-03-17T10:33:08.756Z" }, + { url = "https://files.pythonhosted.org/packages/db/51/37221f59a111dca5e85be7dbf09696323b5b9f13ff65e0641d535ed06ea8/coverage-7.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:301e3b7dfefecaca37c9f1aa6f0049b7d4ab8dd933742b607765d757aca77d43", size = 224254, upload-time = "2026-03-17T10:33:11.174Z" }, + { url = "https://files.pythonhosted.org/packages/54/83/6acacc889de8987441aa7d5adfbdbf33d288dad28704a67e574f1df9bcbb/coverage-7.13.5-cp314-cp314t-win_arm64.whl", hash = "sha256:9dacc2ad679b292709e0f5fc1ac74a6d4d5562e424058962c7bb0c658ad25e45", size = 222276, upload-time = "2026-03-17T10:33:13.466Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" }, +] + [[package]] name = "cryptography" version = "46.0.6" @@ -1360,6 +1444,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5e/c9/4c1e1216b24bcab140c83acdf8bc89a846ea17cd8a06cd18e3fd308a297f/huggingface_hub-1.10.2-py3-none-any.whl", hash = "sha256:c26c908767cc711493978dc0b4f5747ba7841602997cc98bfd628450a28cf9bc", size = 642581, upload-time = "2026-04-14T10:42:26.563Z" }, ] +[[package]] +name = "hypothesis" +version = "6.151.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/39/da/d9cc191b2fd31a138e9e803c967ec59496e991290d1c986cb74963e577d0/hypothesis-6.151.13.tar.gz", hash = "sha256:ca85e59454d7f36276a7ee99c775acd95e56495d4028b01e5b606a316771890c", size = 463886, upload-time = "2026-04-13T06:32:48.382Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/4d/06c2149d3aa1a0877db55f5dabb0070e046ac0a4b3795397d7c6477e0789/hypothesis-6.151.13-py3-none-any.whl", hash = "sha256:642508683cd59f2b0cd049bbee5029a61104f69621e2652bd2a894221ee424a9", size = 529610, upload-time = "2026-04-13T06:32:46.83Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -1414,6 +1510,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", 
hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "isodate" version = "0.7.2" @@ -2777,6 +2882,17 @@ dependencies = [ { name = "torch" }, ] +[package.dev-dependencies] +dev = [ + { name = "hypothesis" }, + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pytest" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, + { name = "torch" }, +] + [package.metadata] requires-dist = [ { name = "azure-ai-ml", specifier = "==1.32.0" }, @@ -2800,6 +2916,17 @@ requires-dist = [ { name = "torch", specifier = "==2.10.0" }, ] +[package.metadata.requires-dev] +dev = [ + { name = "hypothesis", specifier = "==6.151.13" }, + { name = "matplotlib", specifier = "==3.10.8" }, + { name = "numpy", specifier = "==2.2.6" }, + { name = "pytest", specifier = "==9.0.3" }, + { name = "pytest-cov", specifier = "==7.1.0" }, + { name = "pytest-mock", specifier = "==3.15.1" }, + { name = "torch", specifier = "==2.10.0" }, +] + [[package]] name = "pillow" version = "12.2.0" @@ -2878,6 +3005,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = 
"sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "prettytable" version = "3.17.0" @@ -3349,6 +3485,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/bc/587a445451b253b285629263eb51c2d8e9bcea4fc97826266d186f96f558/pyserial-3.5-py2.py3-none-any.whl", hash = "sha256:c4451db6ba391ca6ca299fb3ec7bae67a5c55dde170964c7a14ceefec02f2cf0", size = 90585, upload-time = "2020-11-23T03:59:13.41Z" }, ] +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, +] + +[[package]] +name = "pytest-cov" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage" }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" }, +] + +[[package]] +name = "pytest-mock" +version = "3.15.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/14/eb014d26be205d38ad5ad20d9a80f7d201472e08167f0bb4361e251084a9/pytest_mock-3.15.1.tar.gz", hash = "sha256:1849a238f6f396da19762269de72cb1814ab44416fa73a8686deac10b0d87a0f", size = 34036, upload-time = "2025-09-16T16:37:27.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/cc/06253936f4a7fa2e0f48dfe6d851d9c56df896a9ab09ac019d70b760619c/pytest_mock-3.15.1-py3-none-any.whl", hash = "sha256:0a25e2eb88fe5168d535041d09a4529a188176ae608a6d249ee65abc0949630d", size = 10095, upload-time = "2025-09-16T16:37:25.734Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3909,6 +4087,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/d4/59e74daffcb57a07668852eeeb6035af9f32cbfd7a1d2511f17d2fe6a738/smmap-5.0.3-py3-none-any.whl", hash = "sha256:c106e05d5a61449cf6ba9a1e650227ecfb141590d2a98412103ff35d89fc7b2f", size = 24390, upload-time = "2026-03-09T03:43:24.361Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.49" diff --git a/package.json b/package.json index 10808e9c..266bedca 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "markdownlint-cli2": "0.22.0" }, "overrides": { - "smol-toml": ">=1.6.1" + "smol-toml": "1.6.1" }, "repository": { "type": "git",