diff --git a/.cspell.json b/.cspell.json index 5194bfc3..043415eb 100644 --- a/.cspell.json +++ b/.cspell.json @@ -28,7 +28,8 @@ "*megalinter_file_names_cspell.txt", "**/.terraform/**", "**/.terraform.lock.hcl", - "**/shared/ci/tests/Fixtures/**" + "**/shared/ci/tests/Fixtures/**", + "**/TERRAFORM.md" ], "dictionaryDefinitions": [ { diff --git a/.cspell/general-technical.txt b/.cspell/general-technical.txt index 51b0c3a1..cb2bfaa2 100644 --- a/.cspell/general-technical.txt +++ b/.cspell/general-technical.txt @@ -676,6 +676,7 @@ lakehouses lalogs lan lanczos +lastexitcode lavfi ldap leaderboard diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index 3ae1ffe0..f0c6dca2 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -7,7 +7,8 @@ "**/.venv/**", "external/**", "shared/ci/tests/Fixtures/**", - "logs/**" + "logs/**", + "**/TERRAFORM.md" ], "config": { "default": true, diff --git a/README.md b/README.md index 0f1b7046..47c61d56 100644 --- a/README.md +++ b/README.md @@ -82,14 +82,14 @@ The setup script installs Python 3.11 via [uv](https://docs.astral.sh/uv/), crea Full documentation is available in the [docs/](docs/README.md) directory. 
-| Guide | Description | -|---------------------------------------------------|-----------------------------------------------------------------| -| [Getting Started](docs/getting-started/README.md) | Prerequisites, quickstart, and first training job | -| [Deployment](docs/infrastructure/README.md) | Infrastructure provisioning and setup | -| [Training](docs/training/README.md) | RL and IL training workflows, MLflow, and checkpointing | -| [Security](docs/security/README.md) | Threat model, security guide, deployment responsibilities | -| [Recipes](docs/recipes/README.md) | Guides that take you from a standing start to a working result | -| [Contributing](docs/contributing/README.md) | Architecture, style guides, contribution workflow | +| Guide | Description | +|---------------------------------------------------|----------------------------------------------------------------| +| [Getting Started](docs/getting-started/README.md) | Prerequisites, quickstart, and first training job | +| [Deployment](docs/infrastructure/README.md) | Infrastructure provisioning and setup | +| [Training](docs/training/README.md) | RL and IL training workflows, MLflow, and checkpointing | +| [Security](docs/security/README.md) | Threat model, security guide, deployment responsibilities | +| [Recipes](docs/recipes/README.md) | Guides that take you from a standing start to a working result | +| [Contributing](docs/contributing/README.md) | Architecture, style guides, contribution workflow | ## Architecture diff --git a/docs/contributing/infrastructure-style.md b/docs/contributing/infrastructure-style.md index 2c5657a7..20132e3b 100644 --- a/docs/contributing/infrastructure-style.md +++ b/docs/contributing/infrastructure-style.md @@ -3,7 +3,7 @@ sidebar_position: 7 title: Infrastructure as Code Style Guide description: Terraform conventions, shell script standards, and copyright headers for contributions author: Microsoft Robotics-AI Team -ms.date: 2026-03-18 +ms.date: 2026-03-26 
ms.topic: reference --- @@ -343,6 +343,48 @@ kind: ConfigMap * Place at the top of the file for other file types * Include blank line between copyright header and code +## Documentation Generation + +Terraform module documentation is generated from source using [terraform-docs](https://terraform-docs.io/) v0.21.0. Each module and deployment directory contains a `TERRAFORM.md` file that terraform-docs produces automatically. + +### Configuration + +The repository-wide configuration lives in `.terraform-docs.yml` at the workspace root. This file controls output format, section ordering, and content templates. + +### Generated Files + +Generated `TERRAFORM.md` files exist in every Terraform module and deployment directory. These files are excluded from cspell and markdownlint because their content derives from HCL source code. + +| Directory | File | +|----------------------------------------------|----------------| +| `infrastructure/terraform/` | `TERRAFORM.md` | +| `infrastructure/terraform/vpn/` | `TERRAFORM.md` | +| `infrastructure/terraform/modules/platform/` | `TERRAFORM.md` | +| `infrastructure/terraform/modules/sil/` | `TERRAFORM.md` | +| `infrastructure/terraform/modules/vpn/` | `TERRAFORM.md` | + +### Regenerating Documentation + +Run terraform-docs against a specific directory: + +```bash +terraform-docs markdown table --output-file TERRAFORM.md infrastructure/terraform/modules/platform/ +``` + +Or regenerate all modules using the PowerShell helper: + +```powershell +./scripts/Update-TerraformDocs.ps1 +``` + +### Quality Standards + +Variable descriptions serve as the primary documentation source. 
Write descriptions that: + +* Use sentence case without trailing periods +* Explain purpose and expected values, not just the variable name restated +* Include examples for complex types (e.g., `object`, `map`) + ## Related Documentation * [Contributing Guide](README.md) - Prerequisites, workflow, commit messages diff --git a/docs/contributing/prerequisites.md b/docs/contributing/prerequisites.md index e1752c76..ff76063d 100644 --- a/docs/contributing/prerequisites.md +++ b/docs/contributing/prerequisites.md @@ -22,22 +22,23 @@ Tools, Azure access, and build validation requirements for contributing to the P Install these tools before contributing: -| Tool | Minimum Version | Installation | -|-------------|-----------------|-----------------------------------------------------------------------| -| Terraform | 1.9.8 | | -| TFLint | 0.61.0 | | -| Azure CLI | 2.65.0 | | -| kubectl | 1.31 | | -| Helm | 3.16 | | -| Node.js/npm | 20+ LTS | | -| Python | 3.11+ | | -| shellcheck | 0.10+ | | -| uv | latest | | -| Go | 1.24+ | | -| golangci-lint | 2.11+ | | -| Docker | latest | (with NVIDIA Container Toolkit) | -| OSMO CLI | latest | | -| hve-core | latest | | +| Tool | Minimum Version | Installation | +|----------------|-----------------|-----------------------------------------------------------------------| +| Terraform | 1.9.8 | | +| TFLint | 0.61.0 | | +| Azure CLI | 2.65.0 | | +| kubectl | 1.31 | | +| Helm | 3.16 | | +| Node.js/npm | 20+ LTS | | +| Python | 3.11+ | | +| shellcheck | 0.10+ | | +| uv | latest | | +| Go | 1.24+ | | +| golangci-lint | 2.11+ | | +| Docker | latest | (with NVIDIA Container Toolkit) | +| OSMO CLI | latest | | +| terraform-docs | 0.21.0 | | +| hve-core | latest | | ## Azure Access Requirements @@ -145,6 +146,9 @@ nvidia-ctk --version # OSMO CLI osmo --version +# terraform-docs +terraform-docs --version # >= 0.21.0 + # hve-core (VS Code extension — verify via extensions list) code --list-extensions | grep -i hve-core ``` diff --git 
a/docs/operations/README.md b/docs/operations/README.md index 6d76abc1..e910eefb 100644 --- a/docs/operations/README.md +++ b/docs/operations/README.md @@ -31,16 +31,16 @@ Centralized hub for operational documentation covering monitoring, troubleshooti The reference architecture deploys configurable monitoring components through Terraform feature flags. -| Component | Purpose | Feature Flag | -|----------------------------------|------------------------------------|-----------------------------------| -| Log Analytics workspace | Central log aggregation | Always deployed | -| Application Insights | Application performance monitoring | Always deployed | -| Azure Monitor workspace | Prometheus metrics backend | `should_deploy_monitor_workspace` | -| Managed Grafana | Visualization dashboards | `should_deploy_grafana` | -| Container Insights | AKS container telemetry | `should_deploy_dce` | +| Component | Purpose | Feature Flag | +|----------------------------------|------------------------------------|-----------------------------------------------------------| +| Log Analytics workspace | Central log aggregation | Always deployed | +| Application Insights | Application performance monitoring | Always deployed | +| Azure Monitor workspace | Prometheus metrics backend | `should_deploy_monitor_workspace` | +| Managed Grafana | Visualization dashboards | `should_deploy_grafana` | +| Container Insights | AKS container telemetry | `should_deploy_dce` | | Prometheus data collection rules | Metric scraping configuration | `should_deploy_dce` and `should_deploy_monitor_workspace` | -| Azure Monitor Private Link Scope | Private network monitoring | `should_deploy_ampls` | -| Data collection endpoint | Private ingestion endpoint | `should_deploy_dce` | +| Azure Monitor Private Link Scope | Private network monitoring | `should_deploy_ampls` | +| Data collection endpoint | Private ingestion endpoint | `should_deploy_dce` | > [!IMPORTANT] > The default configuration deploys a 
**private AKS cluster**. Connect through the VPN Gateway before running any `kubectl` or Helm commands. See [VPN Gateway](../infrastructure/vpn.md) for setup instructions. diff --git a/docs/recipes/README.md b/docs/recipes/README.md index 9e6526a6..6c0ed60d 100644 --- a/docs/recipes/README.md +++ b/docs/recipes/README.md @@ -7,30 +7,30 @@ Step-by-step guides that take you from a standing start to a working result. Eac ## 🚀 Pick a Recipe -| Goal | Recipe | Time | -|------|--------|------| -| Train an RL policy | [Your First RL Training Job](training/your-first-rl-training-job.md) | 30 min | -| Train a LeRobot policy | [Your First LeRobot Training Job](training/your-first-lerobot-training-job.md) | 30 min | -| Run the full train → eval → register pipeline | [End-to-End LeRobot Pipeline](training/end-to-end-lerobot-pipeline.md) | 60 min | -| Configure edge recording | [Configuring Edge Data Recording](data-collection/configuring-edge-data-recording.md) | 20 min | -| Prepare a dataset for training | [Preparing Datasets for Training](data-collection/preparing-datasets-for-training.md) | 30 min | +| Goal | Recipe | Time | +|-----------------------------------------------|---------------------------------------------------------------------------------------|--------| +| Train an RL policy | [Your First RL Training Job](training/your-first-rl-training-job.md) | 30 min | +| Train a LeRobot policy | [Your First LeRobot Training Job](training/your-first-lerobot-training-job.md) | 30 min | +| Run the full train → eval → register pipeline | [End-to-End LeRobot Pipeline](training/end-to-end-lerobot-pipeline.md) | 60 min | +| Configure edge recording | [Configuring Edge Data Recording](data-collection/configuring-edge-data-recording.md) | 20 min | +| Prepare a dataset for training | [Preparing Datasets for Training](data-collection/preparing-datasets-for-training.md) | 30 min | ## 📖 Recipe Catalog ### Training -| Recipe | Description | Prerequisites | 
-|--------|-------------|---------------| -| [Your First RL Training Job](training/your-first-rl-training-job.md) | Submit an Isaac Lab RL training job on OSMO with SKRL | Deployed infrastructure, OSMO running | -| [Your First LeRobot Training Job](training/your-first-lerobot-training-job.md) | Submit a LeRobot behavioral cloning job on OSMO | Deployed infrastructure, HuggingFace dataset | -| [End-to-End LeRobot Pipeline](training/end-to-end-lerobot-pipeline.md) | Orchestrate train → evaluate → register in one command | Completed basic LeRobot recipe | +| Recipe | Description | Prerequisites | +|--------------------------------------------------------------------------------|--------------------------------------------------------|----------------------------------------------| +| [Your First RL Training Job](training/your-first-rl-training-job.md) | Submit an Isaac Lab RL training job on OSMO with SKRL | Deployed infrastructure, OSMO running | +| [Your First LeRobot Training Job](training/your-first-lerobot-training-job.md) | Submit a LeRobot behavioral cloning job on OSMO | Deployed infrastructure, HuggingFace dataset | +| [End-to-End LeRobot Pipeline](training/end-to-end-lerobot-pipeline.md) | Orchestrate train → evaluate → register in one command | Completed basic LeRobot recipe | ### Data Collection -| Recipe | Description | Prerequisites | -|--------|-------------|---------------| -| [Configuring Edge Data Recording](data-collection/configuring-edge-data-recording.md) | Set up ROS 2 edge recording on Jetson with chunking and compression | Jetson device, ROS 2 | -| [Preparing Datasets for Training](data-collection/preparing-datasets-for-training.md) | Download, inspect, and validate datasets for LeRobot training | Python 3.11+, Azure CLI | +| Recipe | Description | Prerequisites | 
+|---------------------------------------------------------------------------------------|---------------------------------------------------------------------|-------------------------| +| [Configuring Edge Data Recording](data-collection/configuring-edge-data-recording.md) | Set up ROS 2 edge recording on Jetson with chunking and compression | Jetson device, ROS 2 | +| [Preparing Datasets for Training](data-collection/preparing-datasets-for-training.md) | Download, inspect, and validate datasets for LeRobot training | Python 3.11+, Azure CLI | ## 🔗 Related Documentation diff --git a/docs/recipes/data-collection/README.md b/docs/recipes/data-collection/README.md index fbb56bfb..796efbc4 100644 --- a/docs/recipes/data-collection/README.md +++ b/docs/recipes/data-collection/README.md @@ -4,10 +4,10 @@ Guides for capturing, processing, and managing robotic training datasets. ## 📖 Recipes -| Recipe | Description | Time | -|--------|-------------|------| +| Recipe | Description | Time | +|-----------------------------------------------------------------------|---------------------------------------------------------------------|--------| | [Configuring Edge Data Recording](configuring-edge-data-recording.md) | Set up ROS 2 edge recording on Jetson with chunking and compression | 20 min | -| [Preparing Datasets for Training](preparing-datasets-for-training.md) | Download, inspect, and validate datasets for LeRobot training | 30 min | +| [Preparing Datasets for Training](preparing-datasets-for-training.md) | Download, inspect, and validate datasets for LeRobot training | 30 min | ## 🔗 Related diff --git a/docs/recipes/data-collection/configuring-edge-data-recording.md b/docs/recipes/data-collection/configuring-edge-data-recording.md index 44e37ca8..e58c31ba 100644 --- a/docs/recipes/data-collection/configuring-edge-data-recording.md +++ b/docs/recipes/data-collection/configuring-edge-data-recording.md @@ -7,12 +7,12 @@ Create a recording configuration for ROS 2 edge data 
capture on NVIDIA Jetson de ## 📋 Prerequisites -| Requirement | Details | -|-------------|---------| -| NVIDIA Jetson | JetPack 6.0+ installed | -| ROS 2 | Humble or later with `rosbag2` packages | -| Storage | Sufficient disk space for recording sessions (SSD recommended) | -| IDE | VS Code or any editor with YAML support (optional: JSON Schema validation) | +| Requirement | Details | +|---------------|----------------------------------------------------------------------------| +| NVIDIA Jetson | JetPack 6.0+ installed | +| ROS 2 | Humble or later with `rosbag2` packages | +| Storage | Sufficient disk space for recording sessions (SSD recommended) | +| IDE | VS Code or any editor with YAML support (optional: JSON Schema validation) | ## 🚀 Steps @@ -49,11 +49,11 @@ topics: Choose compression based on data characteristics: -| Algorithm | Ratio | CPU Overhead | Best For | -|-----------|-------|--------------|----------| -| `none` | 1x | 0% | Debugging, maximum write speed | -| `lz4` | 2-3x | <10% | High-frequency numeric data (joints, IMU) | -| `zstd` | 3-5x | 20-30% | Images and low-frequency data | +| Algorithm | Ratio | CPU Overhead | Best For | +|-----------|-------|--------------|-------------------------------------------| +| `none` | 1x | 0% | Debugging, maximum write speed | +| `lz4` | 2-3x | <10% | High-frequency numeric data (joints, IMU) | +| `zstd` | 3-5x | 20-30% | Images and low-frequency data | ### Step 3: Configure episode triggers @@ -144,16 +144,16 @@ A successful validation prints the topic count and trigger type without errors. 
## ⚙️ Configuration Reference -| Section | Field | Type | Required | Description | -|---------|-------|------|----------|-------------| -| `topics[]` | `name` | string | yes | ROS 2 topic path starting with `/` | -| `topics[]` | `frequency_hz` | float | yes | Target recording frequency (0, 1000] | -| `topics[]` | `compression` | string | no | `none`, `lz4`, or `zstd` (default: `none`) | -| `trigger` | `type` | string | yes | `gpio`, `service`, or `timer` | -| `disk_thresholds` | `warning_percent` | int | no | Disk usage warning threshold | -| `disk_thresholds` | `critical_percent` | int | no | Disk usage stop threshold | -| `gap_detection` | `threshold_ms` | float | no | Missing message detection threshold | -| `output_dir` | — | string | no | Recording output directory | +| Section | Field | Type | Required | Description | +|-------------------|--------------------|--------|----------|--------------------------------------------| +| `topics[]` | `name` | string | yes | ROS 2 topic path starting with `/` | +| `topics[]` | `frequency_hz` | float | yes | Target recording frequency (0, 1000] | +| `topics[]` | `compression` | string | no | `none`, `lz4`, or `zstd` (default: `none`) | +| `trigger` | `type` | string | yes | `gpio`, `service`, or `timer` | +| `disk_thresholds` | `warning_percent` | int | no | Disk usage warning threshold | +| `disk_thresholds` | `critical_percent` | int | no | Disk usage stop threshold | +| `gap_detection` | `threshold_ms` | float | no | Missing message detection threshold | +| `output_dir` | — | string | no | Recording output directory | See [Chunking and Compression Configuration](../../data-pipeline/chunking-compression-config.md) for advanced bag splitting options. 
diff --git a/docs/recipes/data-collection/preparing-datasets-for-training.md b/docs/recipes/data-collection/preparing-datasets-for-training.md index b0ed79df..99764bf3 100644 --- a/docs/recipes/data-collection/preparing-datasets-for-training.md +++ b/docs/recipes/data-collection/preparing-datasets-for-training.md @@ -7,12 +7,12 @@ Download a dataset from Azure Blob Storage or HuggingFace, inspect its structure ## 📋 Prerequisites -| Requirement | Details | -|-------------|---------| -| Python | 3.11+ with `uv` or `pip` | -| Azure CLI | Authenticated (`az login`) — for Azure Blob datasets | -| Azure Storage | Storage account with dataset container — for Azure Blob datasets | -| HuggingFace CLI | `pip install huggingface-hub` — for HuggingFace datasets | +| Requirement | Details | +|-----------------|------------------------------------------------------------------| +| Python | 3.11+ with `uv` or `pip` | +| Azure CLI | Authenticated (`az login`) — for Azure Blob datasets | +| Azure Storage | Storage account with dataset container — for Azure Blob datasets | +| HuggingFace CLI | `pip install huggingface-hub` — for HuggingFace datasets | ## 🚀 Steps @@ -20,10 +20,10 @@ Download a dataset from Azure Blob Storage or HuggingFace, inspect its structure LeRobot datasets come from two sources: -| Source | When to use | Example | -|--------|-------------|---------| -| HuggingFace Hub | Public community datasets, quick experimentation | `lerobot/aloha_sim_insertion_human` | -| Azure Blob Storage | Private datasets, recorded edge data uploaded to Azure | Custom organization datasets | +| Source | When to use | Example | +|--------------------|--------------------------------------------------------|-------------------------------------| +| HuggingFace Hub | Public community datasets, quick experimentation | `lerobot/aloha_sim_insertion_human` | +| Azure Blob Storage | Private datasets, recorded edge data uploaded to Azure | Custom organization datasets | ### Step 2a: Download 
from HuggingFace @@ -161,13 +161,13 @@ The recipe succeeded when: `download_dataset.py` environment variables: -| Variable | Required | Default | Description | -|----------|----------|---------|-------------| -| `STORAGE_ACCOUNT` | yes | — | Azure Storage account name | -| `STORAGE_CONTAINER` | no | `datasets` | Blob container name | -| `BLOB_PREFIX` | yes | — | Blob path prefix for dataset files | -| `DATASET_ROOT` | no | `/workspace/data` | Local root directory for datasets | -| `DATASET_REPO_ID` | yes | — | Dataset identifier (e.g., `user/dataset`) | +| Variable | Required | Default | Description | +|---------------------|----------|-------------------|-------------------------------------------| +| `STORAGE_ACCOUNT` | yes | — | Azure Storage account name | +| `STORAGE_CONTAINER` | no | `datasets` | Blob container name | +| `BLOB_PREFIX` | yes | — | Blob path prefix for dataset files | +| `DATASET_ROOT` | no | `/workspace/data` | Local root directory for datasets | +| `DATASET_REPO_ID` | yes | — | Dataset identifier (e.g., `user/dataset`) | ## 🔗 Related Recipes diff --git a/docs/recipes/training/README.md b/docs/recipes/training/README.md index fb97f183..69fae565 100644 --- a/docs/recipes/training/README.md +++ b/docs/recipes/training/README.md @@ -4,11 +4,11 @@ Guides for training reinforcement learning and imitation learning policies using ## 📖 Recipes -| Recipe | Description | Time | -|--------|-------------|------| -| [Your First RL Training Job](your-first-rl-training-job.md) | Submit an Isaac Lab RL training job on OSMO with SKRL | 30 min | -| [Your First LeRobot Training Job](your-first-lerobot-training-job.md) | Submit a LeRobot behavioral cloning job on OSMO | 30 min | -| [End-to-End LeRobot Pipeline](end-to-end-lerobot-pipeline.md) | Orchestrate train, evaluate, and register in one command | 60 min | +| Recipe | Description | Time | 
+|-----------------------------------------------------------------------|----------------------------------------------------------|--------| +| [Your First RL Training Job](your-first-rl-training-job.md) | Submit an Isaac Lab RL training job on OSMO with SKRL | 30 min | +| [Your First LeRobot Training Job](your-first-lerobot-training-job.md) | Submit a LeRobot behavioral cloning job on OSMO | 30 min | +| [End-to-End LeRobot Pipeline](end-to-end-lerobot-pipeline.md) | Orchestrate train, evaluate, and register in one command | 60 min | ## 🔗 Related diff --git a/docs/recipes/training/end-to-end-lerobot-pipeline.md b/docs/recipes/training/end-to-end-lerobot-pipeline.md index 27ab1baf..657b3930 100644 --- a/docs/recipes/training/end-to-end-lerobot-pipeline.md +++ b/docs/recipes/training/end-to-end-lerobot-pipeline.md @@ -7,12 +7,12 @@ Run the full LeRobot pipeline — train a policy, evaluate it against simulation ## 📋 Prerequisites -| Requirement | Details | -|-------------|---------| -| Infrastructure | Azure resources deployed via Terraform | -| OSMO | Control plane and backend running | -| Basic LeRobot recipe | Single-stage training verified successfully | -| HuggingFace account | Write access to a policy repo for pushing trained weights | +| Requirement | Details | +|----------------------|-----------------------------------------------------------| +| Infrastructure | Azure resources deployed via Terraform | +| OSMO | Control plane and backend running | +| Basic LeRobot recipe | Single-stage training verified successfully | +| HuggingFace account | Write access to a policy repo for pushing trained weights | ## 🚀 Steps @@ -116,18 +116,18 @@ az ml model show \ ## ⚙️ Configuration Reference -| Parameter | Default | Description | -|-----------|---------|-------------| -| `-d, --dataset-repo-id` | (required) | HuggingFace dataset repository | -| `--policy-repo-id` | (required) | HuggingFace repo for trained policy | -| `--policy-type` | `act` | Policy architecture 
(`act` or `diffusion`) | -| `--training-steps` | (task default) | Total training iterations | -| `--eval-episodes` | `10` | Evaluation episodes | -| `--poll-interval` | `60` | Status check interval in seconds | -| `--timeout` | `720` | Training timeout in minutes | -| `--skip-inference` | (disabled) | Skip evaluation stage | -| `--skip-wait` | (disabled) | Async mode — submit without waiting | -| `-r, --register-model` | (none) | Model name for Azure ML registration | +| Parameter | Default | Description | +|-------------------------|----------------|--------------------------------------------| +| `-d, --dataset-repo-id` | (required) | HuggingFace dataset repository | +| `--policy-repo-id` | (required) | HuggingFace repo for trained policy | +| `--policy-type` | `act` | Policy architecture (`act` or `diffusion`) | +| `--training-steps` | (task default) | Total training iterations | +| `--eval-episodes` | `10` | Evaluation episodes | +| `--poll-interval` | `60` | Status check interval in seconds | +| `--timeout` | `720` | Training timeout in minutes | +| `--skip-inference` | (disabled) | Skip evaluation stage | +| `--skip-wait` | (disabled) | Async mode — submit without waiting | +| `-r, --register-model` | (none) | Model name for Azure ML registration | See [Scripts Reference](../../reference/scripts.md) for the full parameter table. 
diff --git a/docs/recipes/training/your-first-lerobot-training-job.md b/docs/recipes/training/your-first-lerobot-training-job.md index 2228c777..0dd65790 100644 --- a/docs/recipes/training/your-first-lerobot-training-job.md +++ b/docs/recipes/training/your-first-lerobot-training-job.md @@ -7,12 +7,12 @@ Submit a LeRobot behavioral cloning training job to OSMO using a HuggingFace dat ## 📋 Prerequisites -| Requirement | Details | -|-------------|---------| -| Infrastructure | Azure resources deployed via Terraform | -| OSMO | Control plane and backend running | -| VPN | Connected to private cluster (if using private AKS) | -| Azure CLI | Authenticated (`az login`) | +| Requirement | Details | +|----------------|-----------------------------------------------------| +| Infrastructure | Azure resources deployed via Terraform | +| OSMO | Control plane and backend running | +| VPN | Connected to private cluster (if using private AKS) | +| Azure CLI | Authenticated (`az login`) | ## 🚀 Steps @@ -110,17 +110,17 @@ The recipe succeeded when: ## ⚙️ Configuration Reference -| Parameter | Default | Description | -|-----------|---------|-------------| -| `-d, --dataset-repo-id` | (required) | HuggingFace dataset repository | -| `--policy-type` | `act` | Policy architecture (`act` or `diffusion`) | -| `--training-steps` | `100000` | Total training iterations | -| `--batch-size` | `32` | Training batch size | -| `--learning-rate` | `1e-4` | Optimizer learning rate | -| `--save-freq` | `5000` | Checkpoint save frequency | -| `--val-split` | `0.1` | Validation split ratio | -| `--from-blob` | (disabled) | Use Azure Blob Storage as data source | -| `--register-checkpoint` | (none) | Model name for Azure ML registration | +| Parameter | Default | Description | +|-------------------------|------------|--------------------------------------------| +| `-d, --dataset-repo-id` | (required) | HuggingFace dataset repository | +| `--policy-type` | `act` | Policy architecture (`act` or 
`diffusion`) | +| `--training-steps` | `100000` | Total training iterations | +| `--batch-size` | `32` | Training batch size | +| `--learning-rate` | `1e-4` | Optimizer learning rate | +| `--save-freq` | `5000` | Checkpoint save frequency | +| `--val-split` | `0.1` | Validation split ratio | +| `--from-blob` | (disabled) | Use Azure Blob Storage as data source | +| `--register-checkpoint` | (none) | Model name for Azure ML registration | See [Scripts Reference](../../reference/scripts.md) for the full parameter table. diff --git a/docs/recipes/training/your-first-rl-training-job.md b/docs/recipes/training/your-first-rl-training-job.md index 36c32421..7341faf7 100644 --- a/docs/recipes/training/your-first-rl-training-job.md +++ b/docs/recipes/training/your-first-rl-training-job.md @@ -7,13 +7,13 @@ Submit an Isaac Lab RL training job to OSMO and verify that training metrics app ## 📋 Prerequisites -| Requirement | Details | -|-------------|---------| -| Infrastructure | Azure resources deployed via Terraform | -| OSMO | Control plane and backend running (`kubectl get pods -n osmo-control-plane`) | -| VPN | Connected to private cluster (if using private AKS) | -| Azure CLI | Authenticated (`az login`) | -| kubectl | Connected to AKS cluster | +| Requirement | Details | +|----------------|------------------------------------------------------------------------------| +| Infrastructure | Azure resources deployed via Terraform | +| OSMO | Control plane and backend running (`kubectl get pods -n osmo-control-plane`) | +| VPN | Connected to private cluster (if using private AKS) | +| Azure CLI | Authenticated (`az login`) | +| kubectl | Connected to AKS cluster | ## 🚀 Steps @@ -105,15 +105,15 @@ The recipe succeeded when: ## ⚙️ Configuration Reference -| Parameter | Default | Description | -|-----------|---------|-------------| -| `--task` | `Isaac-Velocity-Rough-Anymal-C-v0` | Isaac Lab task environment | -| `--num-envs` | `2048` | Parallel simulation environments | -| 
`--max-iterations` | (unset) | Training iterations; omit for task default | -| `--backend` | `skrl` | Training backend (`skrl` or `rsl_rl`) | -| `--gpu` | `1` | GPU count | -| `--checkpoint-mode` | `from-scratch` | `from-scratch`, `warm-start`, or `resume` | -| `--register-checkpoint` | (none) | Model name for Azure ML registration | +| Parameter | Default | Description | +|-------------------------|------------------------------------|--------------------------------------------| +| `--task` | `Isaac-Velocity-Rough-Anymal-C-v0` | Isaac Lab task environment | +| `--num-envs` | `2048` | Parallel simulation environments | +| `--max-iterations` | (unset) | Training iterations; omit for task default | +| `--backend` | `skrl` | Training backend (`skrl` or `rsl_rl`) | +| `--gpu` | `1` | GPU count | +| `--checkpoint-mode` | `from-scratch` | `from-scratch`, `warm-start`, or `resume` | +| `--register-checkpoint` | (none) | Model name for Azure ML registration | See [Scripts Reference](../../reference/scripts.md) for the full parameter table. 
diff --git a/infrastructure/setup/optional/isaac-sim-vm/README.md b/infrastructure/setup/optional/isaac-sim-vm/README.md index 436b9b7b..d1c1aaf6 100644 --- a/infrastructure/setup/optional/isaac-sim-vm/README.md +++ b/infrastructure/setup/optional/isaac-sim-vm/README.md @@ -114,12 +114,12 @@ bash infrastructure/setup/optional/deploy-isaac-sim-vm.sh \ The script reads these values from Terraform outputs by default: -| Value | Terraform output | -| ----- | ---------------- | -| Resource group | `resource_group.value.name` | -| Location | `resource_group.value.location` | -| Dedicated VM subnet | `vm_subnet.value.id` | -| Shared NSG | `network_security_group.value.id` | +| Value | Terraform output | +|---------------------|-----------------------------------| +| Resource group | `resource_group.value.name` | +| Location | `resource_group.value.location` | +| Dedicated VM subnet | `vm_subnet.value.id` | +| Shared NSG | `network_security_group.value.id` | If `terraform.tfstate` is unavailable, pass `--tfvars-file` with a Terraform variables file that includes the same top-level fields used in `terraform.tfvars.example`. The script derives the standard resource names from that file, then resolves the subnet and NSG IDs from Azure. @@ -280,29 +280,29 @@ az network public-ip delete \ The deployment template in `main.bicep` accepts the following parameters. -| Name | Type | Required | Declared default | Description | -| ------------------------------ | -------------- | -------- | -------------------------- | ----------- | -| `vmName` | `string` | Yes | None | Name of the virtual machine to deploy. | -| `location` | `string` | No | `resourceGroup().location` | Azure region for deployed resources. | -| `vmResourceGroup` | `string` | No | `resourceGroup().name` | Resource group that receives the VM resources. | -| `tags` | `CommonTags?` | No | `null` | Tags applied to created resources. When `null`, `defaultCommonTags` is used as the effective value. 
| -| `subnetId` | `string` | Yes | None | Resource ID of the existing subnet used by the VM NIC. | -| `nsgId` | `string` | Yes | None | Resource ID of the existing network security group associated with the VM NIC. | -| `enableSubnetNatGatewayEgress` | `bool` | No | `false` | Deploy a NAT gateway and attach it to the target subnet for outbound internet egress without a VM public IP. | -| `natGatewayName` | `string` | No | `''` | NAT gateway name override when `enableSubnetNatGatewayEgress` is `true`. | -| `natGatewayPublicIpName` | `string` | No | `''` | Public IP name override for the NAT gateway when `enableSubnetNatGatewayEgress` is `true`. | -| `adminUsername` | `string` | Yes | None | Admin username for the Linux VM. | -| `adminPassword` | `securestring` | Yes | None | Admin password for the Linux VM. | -| `vmSize` | `string` | No | `Standard_NV36ads_A10_v5` | Virtual machine size. | -| `shouldEnableEncryptionAtHost` | `bool` | No | `true` | Enables EncryptionAtHost for the VM so host caches and temp disk data are encrypted. | -| `vmPriority` | `string` | No | `Regular` | VM priority. Use `Spot` only for test workloads that can be interrupted. | -| `spotEvictionPolicy` | `string` | No | `Deallocate` | Eviction policy used when `vmPriority` is `Spot`. | -| `image` | `ImageConfig?` | No | `null` | Marketplace image configuration. When `null`, `defaultImageConfig` is used as the effective value. | -| `plan` | `PlanConfig?` | No | `null` | Marketplace plan configuration. When `null`, `defaultPlanConfig` is used as the effective value. | -| `osDisk` | `DiskConfig?` | No | `null` | OS disk configuration. When `null`, `defaultOsDiskConfig` is used as the effective value. | -| `dataDisk` | `DiskConfig?` | No | `null` | Data disk configuration. When `null`, `defaultDataDiskConfig` is used as the effective value. | -| `shutdownSchedule` | `ShutdownSchedule?` | No | `null` | Daily auto-shutdown schedule. 
When `null`, `defaultShutdownSchedule` is used as the effective value. | -| `mdeLinux` | `object?` | No | `null` | Defender for Endpoint extension settings. Set `{}` to enable with defaults. Set `null` to skip deployment. | +| Name | Type | Required | Declared default | Description | +|--------------------------------|---------------------|----------|----------------------------|--------------------------------------------------------------------------------------------------------------| +| `vmName` | `string` | Yes | None | Name of the virtual machine to deploy. | +| `location` | `string` | No | `resourceGroup().location` | Azure region for deployed resources. | +| `vmResourceGroup` | `string` | No | `resourceGroup().name` | Resource group that receives the VM resources. | +| `tags` | `CommonTags?` | No | `null` | Tags applied to created resources. When `null`, `defaultCommonTags` is used as the effective value. | +| `subnetId` | `string` | Yes | None | Resource ID of the existing subnet used by the VM NIC. | +| `nsgId` | `string` | Yes | None | Resource ID of the existing network security group associated with the VM NIC. | +| `enableSubnetNatGatewayEgress` | `bool` | No | `false` | Deploy a NAT gateway and attach it to the target subnet for outbound internet egress without a VM public IP. | +| `natGatewayName` | `string` | No | `''` | NAT gateway name override when `enableSubnetNatGatewayEgress` is `true`. | +| `natGatewayPublicIpName` | `string` | No | `''` | Public IP name override for the NAT gateway when `enableSubnetNatGatewayEgress` is `true`. | +| `adminUsername` | `string` | Yes | None | Admin username for the Linux VM. | +| `adminPassword` | `securestring` | Yes | None | Admin password for the Linux VM. | +| `vmSize` | `string` | No | `Standard_NV36ads_A10_v5` | Virtual machine size. | +| `shouldEnableEncryptionAtHost` | `bool` | No | `true` | Enables EncryptionAtHost for the VM so host caches and temp disk data are encrypted. 
| +| `vmPriority` | `string` | No | `Regular` | VM priority. Use `Spot` only for test workloads that can be interrupted. | +| `spotEvictionPolicy` | `string` | No | `Deallocate` | Eviction policy used when `vmPriority` is `Spot`. | +| `image` | `ImageConfig?` | No | `null` | Marketplace image configuration. When `null`, `defaultImageConfig` is used as the effective value. | +| `plan` | `PlanConfig?` | No | `null` | Marketplace plan configuration. When `null`, `defaultPlanConfig` is used as the effective value. | +| `osDisk` | `DiskConfig?` | No | `null` | OS disk configuration. When `null`, `defaultOsDiskConfig` is used as the effective value. | +| `dataDisk` | `DiskConfig?` | No | `null` | Data disk configuration. When `null`, `defaultDataDiskConfig` is used as the effective value. | +| `shutdownSchedule` | `ShutdownSchedule?` | No | `null` | Daily auto-shutdown schedule. When `null`, `defaultShutdownSchedule` is used as the effective value. | +| `mdeLinux` | `object?` | No | `null` | Defender for Endpoint extension settings. Set `{}` to enable with defaults. Set `null` to skip deployment. | ### Structured parameter types diff --git a/infrastructure/terraform/modules/automation/README.md b/infrastructure/terraform/modules/automation/README.md index f2ead54e..779d1224 100644 --- a/infrastructure/terraform/modules/automation/README.md +++ b/infrastructure/terraform/modules/automation/README.md @@ -11,8 +11,8 @@ for automated startup of AKS clusters and PostgreSQL servers. 
## 📖 Documentation -| Reference | Description | -|------------------------------------|-----------------------------------------------| +| Reference | Description | +|-------------------------------------|-----------------------------------------------| | [Terraform Reference](TERRAFORM.md) | Auto-generated inputs, outputs, and resources | diff --git a/infrastructure/terraform/modules/automation/variables.core.tf b/infrastructure/terraform/modules/automation/variables.core.tf index 6bb04628..1b3b58b4 100644 --- a/infrastructure/terraform/modules/automation/variables.core.tf +++ b/infrastructure/terraform/modules/automation/variables.core.tf @@ -12,7 +12,6 @@ variable "environment" { type = string description = "Environment for all resources in this module: dev, test, or prod" - default = "dev" } variable "instance" { @@ -24,7 +23,6 @@ variable "instance" { variable "location" { type = string description = "Location for all resources in this module" - default = null } variable "resource_group" { @@ -40,9 +38,3 @@ variable "resource_prefix" { type = string description = "Prefix for all resources in this module" } - -variable "tags" { - type = map(string) - description = "Tags to apply to all resources" - default = {} -} diff --git a/infrastructure/terraform/modules/automation/variables.tf b/infrastructure/terraform/modules/automation/variables.tf index 41823794..7e2efd88 100644 --- a/infrastructure/terraform/modules/automation/variables.tf +++ b/infrastructure/terraform/modules/automation/variables.tf @@ -51,3 +51,11 @@ variable "schedule_config" { timezone = "UTC" } } + +// === Tags === + +variable "tags" { + description = "Tags to apply to all resources created by this module" + type = map(string) + default = {} +} diff --git a/infrastructure/terraform/modules/dataviewer/README.md b/infrastructure/terraform/modules/dataviewer/README.md index 89598b96..6c58677e 100644 --- a/infrastructure/terraform/modules/dataviewer/README.md +++ 
b/infrastructure/terraform/modules/dataviewer/README.md @@ -15,8 +15,8 @@ Supports internal (VNet/VPN) and external (public) deployment modes. ## 📖 Documentation -| Reference | Description | -|------------------------------------|-----------------------------------------------| +| Reference | Description | +|-------------------------------------|-----------------------------------------------| | [Terraform Reference](TERRAFORM.md) | Auto-generated inputs, outputs, and resources | diff --git a/infrastructure/terraform/modules/platform/README.md b/infrastructure/terraform/modules/platform/README.md index 46ffe070..2e8a3052 100644 --- a/infrastructure/terraform/modules/platform/README.md +++ b/infrastructure/terraform/modules/platform/README.md @@ -12,8 +12,8 @@ Optional: PostgreSQL and Redis for OSMO workloads. ## 📖 Documentation -| Reference | Description | -|------------------------------------|-----------------------------------------------| +| Reference | Description | +|-------------------------------------|-----------------------------------------------| | [Terraform Reference](TERRAFORM.md) | Auto-generated inputs, outputs, and resources | diff --git a/infrastructure/terraform/modules/sil/README.md b/infrastructure/terraform/modules/sil/README.md index d0b5322e..0fac753d 100644 --- a/infrastructure/terraform/modules/sil/README.md +++ b/infrastructure/terraform/modules/sil/README.md @@ -12,8 +12,8 @@ and Data Collection Rule associations for observability. 
## 📖 Documentation -| Reference | Description | -|------------------------------------|-----------------------------------------------| +| Reference | Description | +|-------------------------------------|-----------------------------------------------| | [Terraform Reference](TERRAFORM.md) | Auto-generated inputs, outputs, and resources | diff --git a/infrastructure/terraform/modules/vpn/README.md b/infrastructure/terraform/modules/vpn/README.md index fc65911a..8932dbf1 100644 --- a/infrastructure/terraform/modules/vpn/README.md +++ b/infrastructure/terraform/modules/vpn/README.md @@ -11,8 +11,8 @@ Creates GatewaySubnet within the platform's virtual network. ## 📖 Documentation -| Reference | Description | -|------------------------------------|-----------------------------------------------| +| Reference | Description | +|-------------------------------------|-----------------------------------------------| | [Terraform Reference](TERRAFORM.md) | Auto-generated inputs, outputs, and resources | diff --git a/infrastructure/terraform/modules/vpn/variables.core.tf b/infrastructure/terraform/modules/vpn/variables.core.tf index 64e4955c..1b3b58b4 100644 --- a/infrastructure/terraform/modules/vpn/variables.core.tf +++ b/infrastructure/terraform/modules/vpn/variables.core.tf @@ -38,9 +38,3 @@ variable "resource_prefix" { type = string description = "Prefix for all resources in this module" } - -variable "tags" { - type = map(string) - description = "Tags to apply to all resources" - default = {} -} diff --git a/infrastructure/terraform/modules/vpn/variables.tf b/infrastructure/terraform/modules/vpn/variables.tf index f5dba571..6adf5e0a 100644 --- a/infrastructure/terraform/modules/vpn/variables.tf +++ b/infrastructure/terraform/modules/vpn/variables.tf @@ -132,3 +132,11 @@ variable "vpn_site_default_ipsec_policy" { description = "Default IPsec policy for all S2S connections" default = null } + +// === Tags === + +variable "tags" { + description = "Tags to apply to all 
resources created by this module"
+  type        = map(string)
+  default     = {}
+}
diff --git a/setup-dev.ps1 b/setup-dev.ps1
index 753a18ea..4c2e1052 100644
--- a/setup-dev.ps1
+++ b/setup-dev.ps1
@@ -118,6 +118,39 @@ if (-not (Get-Command uv -ErrorAction SilentlyContinue)) {
 
 Write-Info "Using uv: $(uv --version)"
 
+# ===================================================================
+# Terraform-Docs
+# ===================================================================
+Write-Section 'Terraform-Docs Setup'
+
+$TerraformDocsVersion = '0.21.0'
+
+if (Get-Command terraform-docs -ErrorAction SilentlyContinue) {
+    Write-Info "terraform-docs: $(terraform-docs --version)"
+} else {
+    Write-Info "Installing terraform-docs v$TerraformDocsVersion..."
+    # PROCESSOR_ARCHITECTURE is only set on Windows; ask .NET elsewhere so ARM64 Linux/macOS is detected.
+    $archName = if ($env:PROCESSOR_ARCHITECTURE) { $env:PROCESSOR_ARCHITECTURE } else { [System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture.ToString() }
+    $arch = if ($archName -eq 'ARM64') { 'arm64' } else { 'amd64' }
+    $os = if ($IsLinux) { 'linux' } elseif ($IsMacOS) { 'darwin' } else { 'windows' }
+    $ext = if ($os -eq 'windows') { 'zip' } else { 'tar.gz' }
+    $url = "https://github.com/terraform-docs/terraform-docs/releases/download/v$TerraformDocsVersion/terraform-docs-v$TerraformDocsVersion-$os-$arch.$ext"
+    # $env:TEMP is only set on Windows; GetTempPath() resolves the temp directory on every platform.
+    $tmpDir = [System.IO.Path]::GetTempPath()
+    $dest = Join-Path $tmpDir "terraform-docs.$ext"
+    Invoke-WebRequest -Uri $url -OutFile $dest
+    if ($os -eq 'windows') {
+        Expand-Archive -Path $dest -DestinationPath $tmpDir -Force
+        Move-Item (Join-Path $tmpDir 'terraform-docs.exe') (Join-Path $env:LOCALAPPDATA 'Microsoft\WindowsApps\terraform-docs.exe') -Force
+    } else {
+        tar -xzf $dest -C $tmpDir terraform-docs
+        sudo mv (Join-Path $tmpDir 'terraform-docs') /usr/local/bin/terraform-docs
+        sudo chmod +x /usr/local/bin/terraform-docs
+    }
+    Remove-Item $dest -ErrorAction SilentlyContinue
+    Write-Info "terraform-docs: v$TerraformDocsVersion (installed)"
+}
+
 Write-Section 'Python Environment Setup'
 
 $PythonVersion = Get-Content (Join-Path $ScriptDir '.python-version') -Raw
diff --git a/setup-dev.sh b/setup-dev.sh
index d1950ef4..2929e24d 100755
--- a/setup-dev.sh
+++ b/setup-dev.sh
@@ -52,6 +52,33 @@ fi
 
 info "Using uv: $(uv
--version)"
 
+# ===================================================================
+# Terraform-Docs
+# ===================================================================
+section "Terraform-Docs Setup"
+
+TERRAFORM_DOCS_VERSION="0.21.0"
+
+if command -v terraform-docs &>/dev/null; then
+  info "terraform-docs: $(terraform-docs --version)"
+else
+  # Map the architecture only when a download is needed; curl -f aborts on HTTP errors.
+  ARCH=$(uname -m)
+  case "${ARCH}" in
+    x86_64) ARCH="amd64" ;;
+    aarch64|arm64) ARCH="arm64" ;;
+    *) error "Unsupported architecture: ${ARCH}"; exit 1 ;;
+  esac
+  info "Installing terraform-docs v${TERRAFORM_DOCS_VERSION}..."
+  curl -fsSLo /tmp/terraform-docs.tar.gz \
+    "https://github.com/terraform-docs/terraform-docs/releases/download/v${TERRAFORM_DOCS_VERSION}/terraform-docs-v${TERRAFORM_DOCS_VERSION}-$(uname -s | tr '[:upper:]' '[:lower:]')-${ARCH}.tar.gz"
+  tar -xzf /tmp/terraform-docs.tar.gz -C /tmp terraform-docs
+  sudo mv /tmp/terraform-docs /usr/local/bin/terraform-docs
+  sudo chmod +x /usr/local/bin/terraform-docs
+  rm -f /tmp/terraform-docs.tar.gz
+  info "terraform-docs: v${TERRAFORM_DOCS_VERSION} (installed)"
+fi
+
 section "Python Environment Setup"
 
 PYTHON_VERSION="$(cat "${SCRIPT_DIR}/.python-version")"
diff --git a/workflows/azureml/README.md b/workflows/azureml/README.md
index 6bf97531..7eefcb5b 100644
--- a/workflows/azureml/README.md
+++ b/workflows/azureml/README.md
@@ -10,11 +10,11 @@ Azure Machine Learning job templates for Isaac Lab training and validation workl
 
 ## 📜 Available Templates
 
-| Template | Purpose | Submission Script |
-|---------------------------------------------------------------------------------|---------------------------------------|----------------------------------------------------------|
-| [train.yaml](../../training/rl/workflows/azureml/train.yaml) | Training jobs with checkpoint support | `training/rl/scripts/submit-azureml-training.sh` |
-| [validate.yaml](../../evaluation/sil/workflows/azureml/validate.yaml) | Policy validation and inference |
`evaluation/sil/scripts/submit-azureml-validation.sh` | -| [lerobot-train.yaml](../../training/il/workflows/azureml/lerobot-train.yaml) | LeRobot behavioral cloning training | `training/il/scripts/submit-azureml-lerobot-training.sh` | +| Template | Purpose | Submission Script | +|------------------------------------------------------------------------------|---------------------------------------|----------------------------------------------------------| +| [train.yaml](../../training/rl/workflows/azureml/train.yaml) | Training jobs with checkpoint support | `training/rl/scripts/submit-azureml-training.sh` | +| [validate.yaml](../../evaluation/sil/workflows/azureml/validate.yaml) | Policy validation and inference | `evaluation/sil/scripts/submit-azureml-validation.sh` | +| [lerobot-train.yaml](../../training/il/workflows/azureml/lerobot-train.yaml) | LeRobot behavioral cloning training | `training/il/scripts/submit-azureml-lerobot-training.sh` | ## 🏋️ Training Job (`train.yaml`)