diff --git a/.github/workflows/pr-validation.yml b/.github/workflows/pr-validation.yml index 4e20e0fe..9787c068 100644 --- a/.github/workflows/pr-validation.yml +++ b/.github/workflows/pr-validation.yml @@ -302,6 +302,17 @@ jobs: permissions: contents: read + # Terraform security scanning via Checkov (soft-fail until matrix is fully addressed) + terraform-security: + name: Terraform Security + uses: ./.github/workflows/terraform-security.yml + with: + soft-fail: true + working-directory: infrastructure/terraform + permissions: + contents: read + security-events: write + # Terraform test execution with Codecov Test Analytics terraform-tests: name: Terraform Tests diff --git a/.github/workflows/terraform-security.yml b/.github/workflows/terraform-security.yml new file mode 100644 index 00000000..35556434 --- /dev/null +++ b/.github/workflows/terraform-security.yml @@ -0,0 +1,57 @@ +name: Terraform Security Scan + +on: + workflow_call: + inputs: + working-directory: + description: Directory passed to checkov via -d + required: false + type: string + default: infrastructure/terraform + soft-fail: + description: Whether to continue on Checkov violations + required: false + type: boolean + default: true + +permissions: + contents: read + +jobs: + checkov: + name: Checkov + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run Checkov + id: checkov + uses: bridgecrewio/checkov-action@99bb2caf247dfd9f03cf984373bc6043d4e32ebf # v12.1347.0 + continue-on-error: ${{ inputs.soft-fail }} + with: + directory: ${{ inputs.working-directory }} + framework: terraform + output_format: sarif + soft_fail: ${{ inputs.soft-fail }} + download_external_modules: false + + - name: Upload SARIF to GitHub code scanning + if: ${{ always() && hashFiles('results.sarif') != '' }} # skip when Checkov produced no SARIF, so a scanner crash does not fail the upload step too + uses: 
github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + with: + sarif_file: results.sarif + category: checkov + + - name: Upload Checkov SARIF artifact + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: checkov-sarif + path: results.sarif + retention-days: 30 diff --git a/infrastructure/README.md b/infrastructure/README.md index 7c6a37d4..c1719f70 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -59,6 +59,11 @@ terraform init && terraform plan -var-file=terraform.tfvars terraform apply -var-file=terraform.tfvars ``` +> [!IMPORTANT] +> The conversion-pipeline module (`should_deploy_conversion_pipeline = true`) uses the `microsoft/fabric` provider, which authenticates via Azure CLI. Run `az login` before `terraform plan` / `apply`. The signed-in identity must be in a security group allow-listed under the Fabric tenant admin setting "Service principals can use Fabric APIs" (or the equivalent user/CLI-context allow-list). +> +> The conversion pipeline writes to the platform-owned data-lake account (`stdl...`); set `should_create_data_lake_storage = true` whenever `should_deploy_conversion_pipeline = true`. + ### 3. Connect to the cluster ```bash diff --git a/infrastructure/examples/terraform.tfvars.dev b/infrastructure/examples/terraform.tfvars.dev index b6679019..51d8b529 100644 --- a/infrastructure/examples/terraform.tfvars.dev +++ b/infrastructure/examples/terraform.tfvars.dev @@ -54,3 +54,21 @@ should_enable_microsoft_defender = false // should_deploy_monitor_workspace = true // should_deploy_ampls = false // should_deploy_dce = false + +// ============================================================================= +// Conversion Pipeline (Optional) +// ============================================================================= +// Cost-optimized for dev: F2 capacity. The conversion pipeline reuses the +// platform stdl... 
data-lake account, so should_create_data_lake_storage must +// be true whenever should_deploy_conversion_pipeline is true (enforced by a +// root-level precondition). +// ============================================================================= + +should_deploy_conversion_pipeline = false +should_create_data_lake_storage = true +conversion_pipeline_config = { + should_create_fabric_capacity = true + should_create_fabric_workspace = true + fabric_capacity_sku = "F2" + should_enable_event_grid_dead_letter = true +} diff --git a/infrastructure/examples/terraform.tfvars.prod b/infrastructure/examples/terraform.tfvars.prod index 1c1d5e46..ebd7b0fa 100644 --- a/infrastructure/examples/terraform.tfvars.prod +++ b/infrastructure/examples/terraform.tfvars.prod @@ -82,3 +82,21 @@ should_enable_purge_protection = true // should_deploy_monitor_workspace = true // should_deploy_ampls = true // should_deploy_dce = true + +// ============================================================================= +// Conversion Pipeline (Optional) +// ============================================================================= +// Production posture: F32 capacity. The conversion pipeline reuses the +// platform stdl... data-lake account, so should_create_data_lake_storage must +// be true whenever should_deploy_conversion_pipeline is true (enforced by a +// root-level precondition). 
+// ============================================================================= + +should_deploy_conversion_pipeline = false +should_create_data_lake_storage = true +conversion_pipeline_config = { + should_create_fabric_capacity = true + should_create_fabric_workspace = true + fabric_capacity_sku = "F32" + should_enable_event_grid_dead_letter = true +} diff --git a/infrastructure/examples/terraform.tfvars.staging b/infrastructure/examples/terraform.tfvars.staging new file mode 100644 index 00000000..2a97d66f --- /dev/null +++ b/infrastructure/examples/terraform.tfvars.staging @@ -0,0 +1,78 @@ +// ============================================================================= +// Staging Environment +// ============================================================================= +// Production-shaped (private networking, HA backend) but right-sized for +// pre-production validation. Smaller GPU footprint and F8 Fabric capacity. +// ============================================================================= + +// Core Configuration +environment = "staging" +location = "westus3" +resource_prefix = "roboticsstg" +instance = "001" + +// Resource Group +should_create_resource_group = true + +// AKS System Node Pool +system_node_pool_vm_size = "Standard_D8ds_v5" +system_node_pool_node_count = 2 + +// Single GPU pool with dedicated subnet +node_pools = { + rtxprogpu = { + vm_size = "Standard_NC128ds_xl_RTXPRO6000BSE_v6" + subnet_address_prefixes = ["10.0.7.0/24"] + node_taints = ["nvidia.com/gpu:NoSchedule"] + gpu_driver = "None" + node_labels = { + "nvidia.com/gpu.deploy.driver" = "false" + } + priority = "Regular" + should_enable_auto_scaling = true + min_count = 1 + max_count = 2 + zones = [] + } +} + +// OSMO Backend Services with HA +should_deploy_postgresql = true +should_deploy_redis = true + +// PostgreSQL HA +postgresql_sku_name = "GP_Standard_D2s_v3" +postgresql_high_availability = { + should_enable = true + standby_availability_zone = "2" +} + +// Redis HA 
+should_enable_redis_high_availability = true + +// Network Security — Full Private +should_enable_private_endpoint = true +should_enable_private_aks_cluster = true + +should_enable_public_network_access = false +should_add_current_user_key_vault_admin = true +should_enable_microsoft_defender = true +should_enable_purge_protection = false + +// ============================================================================= +// Conversion Pipeline (Optional) +// ============================================================================= +// Pre-production posture: F8 capacity. The conversion pipeline reuses the +// platform stdl... data-lake account, so should_create_data_lake_storage must +// be true whenever should_deploy_conversion_pipeline is true (enforced by a +// root-level precondition). +// ============================================================================= + +should_deploy_conversion_pipeline = false +should_create_data_lake_storage = true +conversion_pipeline_config = { + should_create_fabric_capacity = true + should_create_fabric_workspace = true + fabric_capacity_sku = "F8" + should_enable_event_grid_dead_letter = true +} diff --git a/infrastructure/terraform/TERRAFORM.md b/infrastructure/terraform/TERRAFORM.md index ad5b2f75..58eef722 100644 --- a/infrastructure/terraform/TERRAFORM.md +++ b/infrastructure/terraform/TERRAFORM.md @@ -2,7 +2,7 @@ title: Robotics Blueprint description: Deploys robotics infrastructure with NVIDIA GPU support, KAI Scheduler, and optional Azure Machine Learning integration. 
author: Microsoft Robotics-AI Team -ms.date: 2026-04-17 +ms.date: 2026-04-28 ms.topic: reference --- @@ -22,6 +22,7 @@ Architecture: | azapi | >= 2.3.0 | | azuread | >= 3.0.2 | | azurerm | >= 4.51.0 | +| fabric | 1.3.0 | | msgraph | >= 0.2.0 | | tls | >= 4.0.6 | @@ -35,19 +36,21 @@ Architecture: ## Resources -| Name | Type | -|-----------------------------------------------------------------------------------------------------------------------------------------|-------------| -| [azurerm_resource_group.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/resource_group) | resource | -| [msgraph_resource_action.current_user](https://registry.terraform.io/providers/microsoft/msgraph/latest/docs/resources/resource_action) | resource | -| [terraform_data.defer_resource_group](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | -| [azurerm_resource_group.existing](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source | +| Name | Type | +|-------------------------------------------------------------------------------------------------------------------------------------------|-------------| +| [azurerm_resource_group.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/resource_group) | resource | +| [msgraph_resource_action.current_user](https://registry.terraform.io/providers/microsoft/msgraph/latest/docs/resources/resource_action) | resource | +| [terraform_data.conversion_pipeline_precondition](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | +| [terraform_data.defer_resource_group](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | +| [azurerm_resource_group.existing](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source | ## Modules -| 
Name | Source | Version | -|----------|--------------------|---------| -| platform | ./modules/platform | n/a | -| sil | ./modules/sil | n/a | +| Name | Source | Version | +|----------------------|-------------------------------|---------| +| conversion\_pipeline | ./modules/conversion-pipeline | n/a | +| platform | ./modules/platform | n/a | +| sil | ./modules/sil | n/a | ## Inputs @@ -57,6 +60,7 @@ Architecture: | location | Location for all resources in this module | `string` | n/a | yes | | resource\_prefix | Prefix for all resources in this module | `string` | n/a | yes | | aml\_compute\_config | AzureML managed compute cluster configuration including VM size, priority, scaling, and optional subnet placement | ```object({ vm_size = string vm_priority = string min_node_count = number max_node_count = number scale_down_after_idle = optional(string, "PT5M") cluster_name = optional(string, "gpu-cluster") subnet_id = optional(string) })``` | ```{ "cluster_name": "gpu-cluster", "max_node_count": 1, "min_node_count": 0, "scale_down_after_idle": "PT5M", "subnet_id": null, "vm_priority": "LowPriority", "vm_size": "Standard_NC4as_T4_v3" }``` | no | +| conversion\_pipeline\_config | Conversion pipeline module configuration. Only consumed when should\_deploy\_conversion\_pipeline is true | ```object({ should_enable_event_grid_dead_letter = optional(bool, true) raw_blob_suffix_filters = optional(list(string), [".bag", ".bag.zst", ".mcap"]) conversion_subscriber_url = optional(string, null) should_create_fabric_capacity = optional(bool, true) should_create_fabric_workspace = optional(bool, true) fabric_capacity_sku = optional(string, "F2") fabric_admin_members = optional(list(string), []) fabric_workspace_sp_object_id = optional(string, null) })``` | `{}` | no | | converted\_datasets\_cool\_tier\_days | Number of days before tiering converted datasets to cool storage. 
Set to -1 to disable tiering | `number` | `90` | no | | instance | Instance identifier for naming resources: 001, 002, etc | `string` | `"001"` | no | | nat\_gateway\_zones | Availability zones for NAT Gateway and its public IP. Set to ["1"] in regions with AZ support. Leave empty for regions without AZ support (e.g. westus) | `list(string)` | ```[ "1" ]``` | no | @@ -82,6 +86,7 @@ Architecture: | should\_create\_vm\_subnet | Whether to create a dedicated subnet for virtual machines in the platform virtual network | `bool` | `false` | no | | should\_deploy\_aml\_compute | Whether to deploy an AzureML managed compute cluster for GPU workloads | `bool` | `false` | no | | should\_deploy\_ampls | Whether to deploy Azure Monitor Private Link Scope and its private endpoint | `bool` | `true` | no | +| should\_deploy\_conversion\_pipeline | Whether to deploy the conversion-pipeline module (raw -> converted ingest with Event Grid + Fabric) | `bool` | `false` | no | | should\_deploy\_dce | Whether to deploy Data Collection Endpoint for observability | `bool` | `true` | no | | should\_deploy\_grafana | Whether to deploy Azure Managed Grafana dashboard | `bool` | `true` | no | | should\_deploy\_monitor\_workspace | Whether to deploy Azure Monitor Workspace for Prometheus metrics | `bool` | `true` | no | @@ -112,35 +117,40 @@ Architecture: ## Outputs -| Name | Description | -|----------------------------------|------------------------------------------------------------------------------------------------| -| aks\_cluster | AKS cluster for robotics workloads. | -| aks\_oidc\_issuer\_url | OIDC issuer URL for workload identity. | -| aml\_compute\_cluster | AzureML managed compute cluster. Null when compute deployment is disabled. | -| application\_insights | Application Insights for application telemetry. | -| azureml\_workspace | Azure ML workspace for ML workloads. | -| container\_registry | Azure Container Registry for container images. 
| -| data\_lake\_storage\_account | Data lake storage account for domain data. Null when data lake is disabled. | -| dns\_server\_ip | The IP address to use as DNS server for VPN clients or on-premises DNS forwarding. | -| gpu\_node\_pool\_subnets | GPU node pool subnets created by SiL module. | -| grafana | Azure Managed Grafana for dashboards. | -| key\_vault | Key Vault storing robotics secrets. | -| key\_vault\_name | Key Vault name for script consumption. | -| log\_analytics\_workspace | Log Analytics Workspace for centralized logging. | -| managed\_redis\_connection\_info | Redis connection information for OSMO control plane. | -| ml\_workload\_identity | ML workload identity for federated credentials. | -| network\_security\_group | Shared network security group for robotics infrastructure. | -| node\_pools | GPU node pool configurations for OSMO pool and pod template generation | -| osmo\_workload\_identity | OSMO workload identity for deployment scripts | -| postgresql | PostgreSQL Flexible Server object. | -| postgresql\_connection\_info | PostgreSQL connection information for OSMO control plane. | -| private\_dns\_resolver | Private DNS Resolver for resolving private DNS zones from VPN clients or on-premises networks. | -| redis | Azure Redis Cache object. | -| resource\_group | Resource group for robotics infrastructure. | -| storage\_account | Storage account for ML workspace and general storage. | -| subnets | Subnet details from platform module. | -| virtual\_network | Virtual network for robotics infrastructure. | -| vm\_subnet | Dedicated VM subnet. Null when should\_create\_vm\_subnet is false. | +| Name | Description | +|---------------------------------------------------|----------------------------------------------------------------------------------------------------------| +| aks\_cluster | AKS cluster for robotics workloads. | +| aks\_oidc\_issuer\_url | OIDC issuer URL for workload identity. 
| +| aml\_compute\_cluster | AzureML managed compute cluster. Null when compute deployment is disabled. | +| application\_insights | Application Insights for application telemetry. | +| azureml\_workspace | Azure ML workspace for ML workloads. | +| container\_registry | Azure Container Registry for container images. | +| conversion\_pipeline\_event\_grid\_dlq\_container | Conversion pipeline Event Grid dead-letter container. Null when DLQ is disabled or pipeline is disabled. | +| conversion\_pipeline\_event\_grid\_subscription | Conversion pipeline Event Grid subscription. Null when conversion pipeline is disabled. | +| conversion\_pipeline\_event\_grid\_topic | Conversion pipeline Event Grid system topic. Null when conversion pipeline is disabled. | +| conversion\_pipeline\_fabric\_capacity | Conversion pipeline Microsoft Fabric capacity. Null when capacity is reused or pipeline is disabled. | +| conversion\_pipeline\_fabric\_workspace | Conversion pipeline Microsoft Fabric workspace. Null when conversion pipeline is disabled. | +| data\_lake\_storage\_account | Data lake storage account for domain data. Null when data lake is disabled. | +| dns\_server\_ip | The IP address to use as DNS server for VPN clients or on-premises DNS forwarding. | +| gpu\_node\_pool\_subnets | GPU node pool subnets created by SiL module. | +| grafana | Azure Managed Grafana for dashboards. | +| key\_vault | Key Vault storing robotics secrets. | +| key\_vault\_name | Key Vault name for script consumption. | +| log\_analytics\_workspace | Log Analytics Workspace for centralized logging. | +| managed\_redis\_connection\_info | Redis connection information for OSMO control plane. | +| ml\_workload\_identity | ML workload identity for federated credentials. | +| network\_security\_group | Shared network security group for robotics infrastructure. 
| +| node\_pools | GPU node pool configurations for OSMO pool and pod template generation | +| osmo\_workload\_identity | OSMO workload identity for deployment scripts | +| postgresql | PostgreSQL Flexible Server object. | +| postgresql\_connection\_info | PostgreSQL connection information for OSMO control plane. | +| private\_dns\_resolver | Private DNS Resolver for resolving private DNS zones from VPN clients or on-premises networks. | +| redis | Azure Redis Cache object. | +| resource\_group | Resource group for robotics infrastructure. | +| storage\_account | Storage account for ML workspace and general storage. | +| subnets | Subnet details from platform module. | +| virtual\_network | Virtual network for robotics infrastructure. | +| vm\_subnet | Dedicated VM subnet. Null when should\_create\_vm\_subnet is false. | diff --git a/infrastructure/terraform/automation/TERRAFORM.md b/infrastructure/terraform/automation/TERRAFORM.md index 93611a4f..0b9104e6 100644 --- a/infrastructure/terraform/automation/TERRAFORM.md +++ b/infrastructure/terraform/automation/TERRAFORM.md @@ -2,7 +2,7 @@ title: Azure Automation Standalone Configuration description: Deploys Azure Automation Account with scheduled runbook to start AKS cluster and PostgreSQL server every morning. Uses data sources to reference existing platform infrastructure. author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/dns/TERRAFORM.md b/infrastructure/terraform/dns/TERRAFORM.md index 03317f8e..1c250c4d 100644 --- a/infrastructure/terraform/dns/TERRAFORM.md +++ b/infrastructure/terraform/dns/TERRAFORM.md @@ -2,7 +2,7 @@ title: Private DNS Zone for OSMO UI Service description: Creates a private DNS zone for internal resolution of the OSMO UI service running on an internal LoadBalancer within the AKS cluster. 
author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index bc205da8..3d7c5ebd 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -209,3 +209,55 @@ module "sil" { // Feature flags should_enable_private_endpoint = var.should_enable_private_endpoint } + +// ============================================================ +// Conversion Pipeline Module - Raw -> Converted Ingest +// ============================================================ + +// Precondition guard: the conversion pipeline reuses the platform-owned +// data-lake account, so the platform must provision it. Module call blocks +// do not support `lifecycle.precondition` directly, so the check lives on a +// terraform_data resource that the module depends on. +resource "terraform_data" "conversion_pipeline_precondition" { + count = var.should_deploy_conversion_pipeline ? 1 : 0 + + lifecycle { + precondition { + condition = var.should_create_data_lake_storage + error_message = "should_deploy_conversion_pipeline = true requires should_create_data_lake_storage = true (the conversion pipeline reuses the platform stdl... account)." + } + } +} + +module "conversion_pipeline" { + source = "./modules/conversion-pipeline" + count = var.should_deploy_conversion_pipeline ? 
1 : 0 + + depends_on = [ + module.platform, + terraform_data.conversion_pipeline_precondition, + ] + + // Core variables + environment = var.environment + resource_prefix = var.resource_prefix + instance = var.instance + location = var.location + resource_group = local.resource_group + + // Dependencies from platform module (typed objects) + data_lake_storage_account = module.platform.data_lake_storage_account + datasets_container = module.platform.datasets_container + + // Event Grid + should_enable_event_grid_dead_letter = var.conversion_pipeline_config.should_enable_event_grid_dead_letter + raw_blob_suffix_filters = var.conversion_pipeline_config.raw_blob_suffix_filters + conversion_subscriber_url = var.conversion_pipeline_config.conversion_subscriber_url + + // Fabric + should_create_fabric_capacity = var.conversion_pipeline_config.should_create_fabric_capacity + should_create_fabric_workspace = var.conversion_pipeline_config.should_create_fabric_workspace + fabric_capacity_sku = var.conversion_pipeline_config.fabric_capacity_sku + fabric_admin_members = var.conversion_pipeline_config.fabric_admin_members + fabric_workspace_sp_object_id = var.conversion_pipeline_config.fabric_workspace_sp_object_id +} diff --git a/infrastructure/terraform/modules/automation/TERRAFORM.md b/infrastructure/terraform/modules/automation/TERRAFORM.md index 7520c5c1..07049525 100644 --- a/infrastructure/terraform/modules/automation/TERRAFORM.md +++ b/infrastructure/terraform/modules/automation/TERRAFORM.md @@ -2,7 +2,7 @@ title: Azure Automation Module description: Creates an Azure Automation Account with a scheduled PowerShell runbook for automated startup of AKS clusters and PostgreSQL servers. 
author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/modules/conversion-pipeline/README.md b/infrastructure/terraform/modules/conversion-pipeline/README.md new file mode 100644 index 00000000..1056a879 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/README.md @@ -0,0 +1,111 @@ +--- +title: Conversion Pipeline Module +description: Terraform module that provisions Event Grid and Microsoft Fabric on the platform-owned data-lake account for the raw -> converted ingest pipeline +author: Microsoft Robotics-AI Team +ms.date: 2026-04-28 +ms.topic: reference +--- + +This module provisions an Event Grid system topic + subscription, an in-account dead-letter container, and a Microsoft Fabric capacity + workspace wired against the platform-owned ADLS Gen2 data-lake account (`stdl...`) for the data-conversion pipeline. Raw blobs land under `datasets/raw/` and converted artifacts under `datasets/converted/` — both prefixes are owned by the platform module. + +The module is opt-in. The root composition gates it behind the `should_deploy_conversion_pipeline` flag (default `false`), so existing deployments remain unaffected until the conversion compute (issues #32, #34, #72) is wired in. + +## 📋 Prerequisites + +| Requirement | Notes | +|------------------------------|------------------------------------------------------------------------------------------------------------------------| +| Terraform | `>= 1.9.8, < 2.0` | +| `azurerm` provider | `>= 4.51.0` | +| `microsoft/fabric` provider | `1.3.0` | +| Operator identity | Member of a security group allow-listed under the Fabric tenant admin setting "Service principals can use Fabric APIs" (or the equivalent user/CLI-context allow-list) | +| Authentication | `az login` against the target tenant. 
The Fabric provider falls back to Azure CLI auth when no provider block is declared | +| Platform module outputs | `data_lake_storage_account` (typed object: `{ id, name }`), `datasets_container` (typed object: `{ id, name }`) | + +> [!IMPORTANT] +> The conversion pipeline reuses the platform `stdl...` data-lake account. The root composition enforces this with a precondition: `should_deploy_conversion_pipeline = true` requires `should_create_data_lake_storage = true`. The check lives on a `terraform_data.conversion_pipeline_precondition` resource at root because Terraform does not support `lifecycle.precondition` inside `module` call blocks. + +## 🚀 Usage + +The module is composed by the root `infrastructure/terraform/main.tf`. To enable in any environment, set both flags in the corresponding tfvars file under `infrastructure/examples/`: + +```hcl +should_deploy_conversion_pipeline = true +should_create_data_lake_storage = true +conversion_pipeline_config = { + should_create_fabric_capacity = true + should_create_fabric_workspace = true + fabric_capacity_sku = "F8" + should_enable_event_grid_dead_letter = true + fabric_admin_members = ["[email protected]"] +} +``` + +> [!NOTE] +> Use `F2` in dev for cost. Staging and prod should size the Fabric capacity for expected workload concurrency (`F8`+). + +## 🏗️ Architecture + +```mermaid +flowchart LR + Edge[Edge ROS Recorder] -->|.bag.zst PUT| Raw[("stdl... datasets/raw/")] + Raw --> EGT[Event Grid System Topic] + EGT -->|advanced filter .bag/.bag.zst/.mcap| EGS[Event Subscription] + EGS -->|webhook| Func[Conversion Subscriber] + EGS -.->|delivery failure| DLQ[("stdl... event-grid-dlq")] + Func --> Conv[("stdl... 
datasets/converted/")] + Conv --> Fabric[Fabric Workspace] + Fabric --> Lakehouse["OneLake Shortcut -> datasets/converted/"] +``` + +## ⚙️ Configuration + +| Variable | Default | Purpose | +|----------------------------------------|------------------------------------|------------------------------------------------------------------------------------------| +| `should_enable_event_grid_dead_letter` | `true` | Provision an in-account `event-grid-dlq` container on the platform data-lake account | +| `raw_blob_suffix_filters` | `[".bag", ".bag.zst", ".mcap"]` | Suffix list for the Event Grid `string_ends_with` advanced filter | +| `conversion_subscriber_url` | `null` | Webhook URL for the conversion subscriber. Subscription is DLQ-only when `null` | +| `should_create_fabric_capacity` | `true` | Provision a new Fabric capacity | +| `should_create_fabric_workspace` | `true` | Provision a Fabric workspace. `capacity_id` resolves at apply time via a deferred lookup | +| `fabric_capacity_sku` | `F2` | Fabric capacity SKU (`F2` through `F2048`) | +| `fabric_admin_members` | `[]` | Entra UPNs/object IDs granted Fabric capacity administration | +| `fabric_workspace_sp_object_id` | `null` | Object ID of the Fabric workspace SP. Grants `Storage Blob Data Reader` on the datasets container plus an ADLS Gen2 ACL granting `rwx` on `converted/` | + +## 📥 Inputs + +| Input | Type | Description | +|-----------------------------|---------------------------------------|----------------------------------------------------------------------------------------| +| `environment` | `string` | `dev`, `staging`, or `prod` | +| `resource_prefix` | `string` | Prefix used in resource naming | +| `instance` | `string` | Instance identifier (`001`, `002`, ...) 
| +| `location` | `string` | Optional location override; defaults to `resource_group.location` | +| `resource_group` | `object({ id, name, location })` | Resource group object | +| `data_lake_storage_account` | `object({ id, name })` | Platform-owned ADLS Gen2 data-lake account (`stdl...`) used as the durable raw -> converted store | +| `datasets_container` | `object({ id, name })` | Datasets container on the platform-owned data-lake account. Used to scope Fabric SP folder ACLs | + +## 📤 Outputs + +| Output | Shape | +|----------------------------|--------------------------------------------------------------------| +| `event_grid_topic` | `{ id, name, identity_principal_id }` | +| `event_grid_subscription` | `{ id, name }` | +| `event_grid_dlq_container` | `{ id, name }` or `null` when DLQ is disabled | +| `fabric_workspace` | `{ id, display_name }` or `null` when workspace creation is off | +| `fabric_capacity` | `{ id, name, sku }` or `null` when reusing an existing capacity | + +## 🔍 Validation + +| Check | Command | +|---------------------|--------------------------------------------------------------------------------------------------------------| +| Format | `terraform fmt -check -recursive infrastructure/terraform/modules/conversion-pipeline` | +| Lint | `npm run lint:tf` | +| Validate | `npm run lint:tf:validate` | +| Module unit tests | `cd infrastructure/terraform/modules/conversion-pipeline && terraform init -backend=false && terraform test` | +| Security scan | `checkov -d infrastructure/terraform/modules/conversion-pipeline --framework terraform` | + +## 🔧 Optional Components + +| Component | Toggle | Notes | +|------------------|----------------------------------------|----------------------------------------------------------------------| +| Dead-letter queue| `should_enable_event_grid_dead_letter` | Backed by an in-account `event-grid-dlq` container on the data-lake | +| Fabric capacity | `should_create_fabric_capacity` | Disable to reuse an 
existing capacity by display name | +| Fabric workspace | `should_create_fabric_workspace` | `capacity_id` is resolved at apply time via a deferred data lookup | diff --git a/infrastructure/terraform/modules/conversion-pipeline/TERRAFORM.md b/infrastructure/terraform/modules/conversion-pipeline/TERRAFORM.md new file mode 100644 index 00000000..73a750ba --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/TERRAFORM.md @@ -0,0 +1,80 @@ +--- +title: Conversion Pipeline Module +description: The conversion pipeline reuses the platform-owned ADLS Gen2 data-lake account (stdl...) for raw -> converted storage. This module owns only the Event Grid system topic + subscription that route BlobCreated events to the conversion subscriber, an in-account dead-letter container, and the Fabric capacity + workspace. +author: Microsoft Robotics-AI Team +ms.date: 2026-04-28 +ms.topic: reference +--- + + +The conversion pipeline reuses the platform-owned ADLS Gen2 data-lake +account (stdl...) for raw -> converted storage. This module owns only the +Event Grid system topic + subscription that route BlobCreated events to the +conversion subscriber, an in-account dead-letter container, and the Fabric +capacity + workspace. 
+ +## Requirements + +| Name | Version | +|-----------|-----------------| +| terraform | >= 1.9.8, < 2.0 | +| azurerm | >= 4.51.0 | +| fabric | 1.3.0 | + +## Providers + +| Name | Version | +|-----------|-----------| +| azurerm | >= 4.51.0 | +| fabric | 1.3.0 | +| terraform | n/a | + +## Resources + +| Name | Type | +|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------| +| [azurerm_eventgrid_system_topic.blob](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/eventgrid_system_topic) | resource | +| [azurerm_eventgrid_system_topic_event_subscription.raw_blob_created](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/eventgrid_system_topic_event_subscription) | resource | +| [azurerm_fabric_capacity.this](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/fabric_capacity) | resource | +| [azurerm_role_assignment.eventgrid_dlq_writer](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/role_assignment) | resource | +| [azurerm_role_assignment.fabric_sp_data_lake_contributor](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/role_assignment) | resource | +| [azurerm_storage_container.event_grid_dlq](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/storage_container) | resource | +| [fabric_workspace.this](https://registry.terraform.io/providers/microsoft/fabric/1.3.0/docs/resources/workspace) | resource | +| [terraform_data.defer_fabric_capacity_created](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | +| [terraform_data.defer_fabric_capacity_existing](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource | +| 
[fabric_capacity.created](https://registry.terraform.io/providers/microsoft/fabric/1.3.0/docs/data-sources/capacity) | data source | +| [fabric_capacity.existing](https://registry.terraform.io/providers/microsoft/fabric/1.3.0/docs/data-sources/capacity) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------|---------------------------------------|:--------:| +| data\_lake\_storage\_account | Platform-owned ADLS Gen2 data-lake account (stdl...) used as the durable raw -> converted store | ```object({ id = string name = string })``` | n/a | yes | +| environment | Environment for all resources in this module: dev, staging, or prod | `string` | n/a | yes | +| resource\_group | Resource group object containing name, id, and location | ```object({ id = string name = string location = string })``` | n/a | yes | +| resource\_prefix | Prefix for all resources in this module | `string` | n/a | yes | +| conversion\_subscriber\_url | Optional webhook URL for the downstream conversion subscriber. When null, the subscription is created without a webhook destination (DLQ-only) until the conversion compute lands | `string` | `null` | no | +| fabric\_admin\_members | Entra UPNs or object IDs that should be granted Fabric capacity administration | `list(string)` | `[]` | no | +| fabric\_capacity\_sku | SKU for the Fabric capacity (F2 through F2048). Only used when should\_create\_fabric\_capacity is true | `string` | `"F2"` | no | +| fabric\_workspace\_sp\_object\_id | Object ID of the Fabric workspace service principal. 
When provided, the SP is granted Storage Blob Data Reader on the datasets container plus an ADLS Gen2 ACL granting rwx on the converted/ folder | `string` | `null` | no | +| instance | Instance identifier for naming resources: 001, 002, etc | `string` | `"001"` | no | +| location | Override location for module resources. Defaults to var.resource\_group.location when null | `string` | `null` | no | +| raw\_blob\_suffix\_filters | Suffix filters used by the Event Grid subscription's advanced\_filter.string\_ends\_with on the raw container | `list(string)` | ```[ ".bag", ".bag.zst", ".mcap" ]``` | no | +| should\_create\_fabric\_capacity | Whether to provision a new Fabric capacity | `bool` | `true` | no | +| should\_create\_fabric\_workspace | Whether to provision a Fabric workspace bound to the Fabric capacity. The workspace's capacity\_id is resolved at apply time from a deferred data "fabric\_capacity" lookup keyed on the capacity display name | `bool` | `true` | no | +| should\_enable\_event\_grid\_dead\_letter | Whether to enable an Event Grid dead-letter destination backed by an in-account container | `bool` | `true` | no | + +## Outputs + +| Name | Description | +|-----------------------------|-----------------------------------------------------------------------------------------------| +| event\_grid\_dlq\_container | Event Grid dead-letter container on the platform data-lake account. Null when DLQ is disabled | +| event\_grid\_subscription | Event Grid subscription for raw BlobCreated events | +| event\_grid\_topic | Event Grid system topic on the platform data-lake account | +| fabric\_capacity | Microsoft Fabric capacity. 
Null when an existing capacity is reused | +| fabric\_workspace | Microsoft Fabric workspace bound to the conversion capacity | + + + +*🤖 Auto-generated by [terraform-docs](https://terraform-docs.io/) — do not edit manually.* + diff --git a/infrastructure/terraform/modules/conversion-pipeline/event-grid.tf b/infrastructure/terraform/modules/conversion-pipeline/event-grid.tf new file mode 100644 index 00000000..3472ee30 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/event-grid.tf @@ -0,0 +1,71 @@ +/** + * # Event Grid System Topic and Subscription + * + * System topic on the platform-owned data-lake account (stdl...) that fires + * BlobCreated events on the `datasets/raw/` ADLS Gen2 path to the conversion + * subscriber (Function App or Fabric pipeline). Multi-suffix filtering uses + * advanced_filter.string_ends_with because subject_filter only supports a + * single suffix value. + */ + +resource "azurerm_eventgrid_system_topic" "blob" { + name = "evgt-${local.resource_name_suffix}" + location = local.location + resource_group_name = var.resource_group.name + source_resource_id = var.data_lake_storage_account.id + topic_type = "Microsoft.Storage.StorageAccounts" + + identity { + type = "SystemAssigned" + } +} + +resource "azurerm_eventgrid_system_topic_event_subscription" "raw_blob_created" { + name = "evgs-raw-${local.resource_name_suffix}" + system_topic = azurerm_eventgrid_system_topic.blob.name + resource_group_name = var.resource_group.name + + included_event_types = ["Microsoft.Storage.BlobCreated"] + + subject_filter { + subject_begins_with = "/blobServices/default/containers/datasets/blobs/raw/" + } + + advanced_filter { + string_ends_with { + key = "subject" + values = var.raw_blob_suffix_filters + } + } + + retry_policy { + max_delivery_attempts = 5 + event_time_to_live = 1440 + } + + dynamic "webhook_endpoint" { + for_each = var.conversion_subscriber_url == null ? 
[] : [1] + content { + url = var.conversion_subscriber_url + } + } + + dynamic "storage_blob_dead_letter_destination" { + for_each = var.should_enable_event_grid_dead_letter ? [1] : [] + content { + storage_account_id = var.data_lake_storage_account.id + storage_blob_container_name = azurerm_storage_container.event_grid_dlq[0].name + } + } + + dynamic "dead_letter_identity" { + for_each = var.should_enable_event_grid_dead_letter ? [1] : [] + content { + type = "SystemAssigned" + } + } + + delivery_identity { + type = "SystemAssigned" + } +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/fabric.tf b/infrastructure/terraform/modules/conversion-pipeline/fabric.tf new file mode 100644 index 00000000..59e5300d --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/fabric.tf @@ -0,0 +1,90 @@ +/** + * # Microsoft Fabric Capacity and Workspace + * + * Provisions an azurerm_fabric_capacity (gated by should_create_fabric_capacity) + * and a fabric_workspace via the microsoft/fabric provider. + * + * The fabric_workspace resource's capacity_id is the Fabric capacity GUID (not + * the ARM resource ID). The azurerm_fabric_capacity resource does not expose + * the GUID directly, so the GUID is discovered at apply time via a deferred + * data "fabric_capacity" lookup keyed on the capacity display name. The + * terraform_data shim defers data-source evaluation past plan so a single + * `terraform apply` provisions both the capacity and the workspace. + * + * Authentication for the microsoft/fabric provider falls back to Azure CLI + * (`az login`) when no provider block is declared. The signed-in operator + * identity must be in a security group allow-listed under the Fabric tenant + * admin setting "Service principals can use Fabric APIs" (or the equivalent + * user-context setting). + */ + +locals { + // Fabric Capacity name must match ^[a-z][a-z0-9]{2,62}$ (lowercase letters and digits only — + // no hyphens). 
The hyphenated `{abbreviation}-{prefix}-{environment}-{instance}` convention + // cannot apply here, so `fc` joins `kv`, `st`, `acr` as a no-hyphen exception. + fabric_capacity_name = "fc${var.resource_prefix}${var.environment}${var.instance}" + + fabric_capacity_id = try( + data.fabric_capacity.created[0].id, + data.fabric_capacity.existing[0].id, + null, + ) +} + +resource "azurerm_fabric_capacity" "this" { + count = var.should_create_fabric_capacity ? 1 : 0 + + name = local.fabric_capacity_name + resource_group_name = var.resource_group.name + location = local.location + + administration_members = var.fabric_admin_members + + sku { + name = var.fabric_capacity_sku + tier = "Fabric" + } +} + +// Defer data-source evaluation past `terraform plan` so the GUID is read only after +// the capacity has been created. terraform_data wraps the display_name input and +// declares the dependency on the azurerm_fabric_capacity resource. +resource "terraform_data" "defer_fabric_capacity_created" { + count = var.should_create_fabric_capacity ? 1 : 0 + + input = { + display_name = local.fabric_capacity_name + } + + depends_on = [azurerm_fabric_capacity.this] +} + +data "fabric_capacity" "created" { + count = length(terraform_data.defer_fabric_capacity_created) + + display_name = terraform_data.defer_fabric_capacity_created[0].output.display_name +} + +// When operating against a pre-existing capacity, defer the lookup the same way +// so the workspace can resolve capacity_id without a two-pass apply. +resource "terraform_data" "defer_fabric_capacity_existing" { + count = var.should_create_fabric_capacity ? 0 : (var.should_create_fabric_workspace ? 
1 : 0) + + input = { + display_name = local.fabric_capacity_name + } +} + +data "fabric_capacity" "existing" { + count = length(terraform_data.defer_fabric_capacity_existing) + + display_name = terraform_data.defer_fabric_capacity_existing[0].output.display_name +} + +resource "fabric_workspace" "this" { + count = var.should_create_fabric_workspace ? 1 : 0 + + display_name = "fws-${local.resource_name_suffix}" + description = "Conversion pipeline workspace (${var.environment})" + capacity_id = local.fabric_capacity_id +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/main.tf b/infrastructure/terraform/modules/conversion-pipeline/main.tf new file mode 100644 index 00000000..2d4b2f34 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/main.tf @@ -0,0 +1,30 @@ +/** + * # Conversion Pipeline Module + * + * The conversion pipeline reuses the platform-owned ADLS Gen2 data-lake + * account (stdl...) for raw -> converted storage. This module owns only the + * Event Grid system topic + subscription that route BlobCreated events to the + * conversion subscriber, an in-account dead-letter container, and the Fabric + * capacity + workspace. + */ + +locals { + resource_name_suffix = "${var.resource_prefix}-${var.environment}-${var.instance}" + location = coalesce(var.location, var.resource_group.location) +} + +// ============================================================ +// Event Grid Dead-Letter Container +// ============================================================ + +resource "azurerm_storage_container" "event_grid_dlq" { + count = var.should_enable_event_grid_dead_letter ? 
1 : 0 + + name = "event-grid-dlq" + storage_account_id = var.data_lake_storage_account.id + container_access_type = "private" + + lifecycle { + prevent_destroy = true + } +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/outputs.tf b/infrastructure/terraform/modules/conversion-pipeline/outputs.tf new file mode 100644 index 00000000..914770f0 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/outputs.tf @@ -0,0 +1,48 @@ +/** + * # Module Outputs + * + * Typed object outputs consumed by downstream modules (Function App / Fabric + * pipeline from issues #32, #34, #72) via the variables.deps.tf pattern. + */ + +output "event_grid_topic" { + description = "Event Grid system topic on the platform data-lake account" + value = { + id = azurerm_eventgrid_system_topic.blob.id + name = azurerm_eventgrid_system_topic.blob.name + identity_principal_id = azurerm_eventgrid_system_topic.blob.identity[0].principal_id + } +} + +output "event_grid_subscription" { + description = "Event Grid subscription for raw BlobCreated events" + value = { + id = azurerm_eventgrid_system_topic_event_subscription.raw_blob_created.id + name = azurerm_eventgrid_system_topic_event_subscription.raw_blob_created.name + } +} + +output "event_grid_dlq_container" { + description = "Event Grid dead-letter container on the platform data-lake account. Null when DLQ is disabled" + value = try({ + id = azurerm_storage_container.event_grid_dlq[0].id + name = azurerm_storage_container.event_grid_dlq[0].name + }, null) +} + +output "fabric_workspace" { + description = "Microsoft Fabric workspace bound to the conversion capacity" + value = try({ + id = fabric_workspace.this[0].id + display_name = fabric_workspace.this[0].display_name + }, null) +} + +output "fabric_capacity" { + description = "Microsoft Fabric capacity. 
Null when an existing capacity is reused" + value = try({ + id = azurerm_fabric_capacity.this[0].id + name = azurerm_fabric_capacity.this[0].name + sku = azurerm_fabric_capacity.this[0].sku[0].name + }, null) +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/role-assignments.tf b/infrastructure/terraform/modules/conversion-pipeline/role-assignments.tf new file mode 100644 index 00000000..7760a4cf --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/role-assignments.tf @@ -0,0 +1,66 @@ +/** + * # Role Assignments and ACLs + * + * - Event Grid system topic SystemAssigned MI gets Storage Blob Data Contributor + * on the DLQ container so it can write dead-letter blobs. + * - Fabric workspace service principal gets Storage Blob Data Reader at the + * datasets container scope (covers read+list across raw/ and converted/ and + * provides RBAC traverse, bypassing the POSIX --x requirement) plus an + * ADLS Gen2 ACL granting rwx on converted/ for write access. + */ + +resource "azurerm_role_assignment" "eventgrid_dlq_writer" { + count = var.should_enable_event_grid_dead_letter ? 1 : 0 + + scope = azurerm_storage_container.event_grid_dlq[0].id + role_definition_name = "Storage Blob Data Contributor" + principal_id = azurerm_eventgrid_system_topic.blob.identity[0].principal_id +} + +// Container-scoped read for the Fabric workspace SP across the entire datasets +// container. RBAC data permissions satisfy ADLS Gen2 traverse checks without +// requiring POSIX --x on intermediate directories. +resource "azurerm_role_assignment" "fabric_sp_datasets_reader" { + count = var.fabric_workspace_sp_object_id == null ? 0 : 1 + + scope = var.datasets_container.id + role_definition_name = "Storage Blob Data Reader" + principal_id = var.fabric_workspace_sp_object_id +} + +// Write access on datasets/converted/ for the Fabric workspace SP via folder +// ACL (default ACEs propagate the grant to new children). 
+resource "azurerm_storage_data_lake_gen2_path" "fabric_converted" { + count = var.fabric_workspace_sp_object_id == null ? 0 : 1 + + storage_account_id = var.data_lake_storage_account.id + filesystem_name = var.datasets_container.name + path = "converted" + resource = "directory" + + ace { + type = "user" + id = var.fabric_workspace_sp_object_id + scope = "access" + permissions = "rwx" + } + + ace { + type = "user" + id = var.fabric_workspace_sp_object_id + scope = "default" + permissions = "rwx" + } + + ace { + type = "mask" + scope = "access" + permissions = "rwx" + } + + ace { + type = "mask" + scope = "default" + permissions = "rwx" + } +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/tests/conversion_pipeline.tftest.hcl b/infrastructure/terraform/modules/conversion-pipeline/tests/conversion_pipeline.tftest.hcl new file mode 100644 index 00000000..80f66952 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/tests/conversion_pipeline.tftest.hcl @@ -0,0 +1,284 @@ +// Conversion pipeline module tests +// All runs use command = plan against mock providers; no Azure credentials required. +// +// The root-level precondition coupling should_deploy_conversion_pipeline to +// should_create_data_lake_storage lives on a terraform_data resource at root +// (DD-03) and cannot be exercised from within this module-scoped test file. 
+ +mock_provider "azurerm" {} +mock_provider "fabric" {} + +run "setup" { + module { + source = "./tests/setup" + } +} + +// ============================================================ +// Default Naming +// ============================================================ + +run "default_naming" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + } + + assert { + condition = azurerm_eventgrid_system_topic.blob.name == "evgt-${run.setup.resource_prefix}-${run.setup.environment}-${run.setup.instance}" + error_message = "Event Grid system topic name must follow evgt-{suffix} convention." + } + + assert { + condition = azurerm_eventgrid_system_topic.blob.source_resource_id == run.setup.data_lake_storage_account.id + error_message = "Event Grid system topic must be parented to the platform data-lake account." + } +} + +// ============================================================ +// DLQ Container +// ============================================================ + +run "dlq_container_created" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + should_enable_event_grid_dead_letter = true + } + + assert { + condition = length(azurerm_storage_container.event_grid_dlq) == 1 + error_message = "Dead-letter container must be created when should_enable_event_grid_dead_letter is true." 
+ } + + assert { + condition = azurerm_storage_container.event_grid_dlq[0].storage_account_id == run.setup.data_lake_storage_account.id + error_message = "Dead-letter container must be parented to the platform data-lake account." + } +} + +run "dlq_disabled_skips_container" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + should_enable_event_grid_dead_letter = false + } + + assert { + condition = length(azurerm_storage_container.event_grid_dlq) == 0 + error_message = "Dead-letter container must not be created when DLQ is disabled." + } +} + +// ============================================================ +// Event Grid Filters +// ============================================================ + +run "event_grid_filters" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + raw_blob_suffix_filters = [".bag", ".bag.zst", ".mcap"] + } + + assert { + condition = contains(azurerm_eventgrid_system_topic_event_subscription.raw_blob_created.advanced_filter[0].string_ends_with[0].values, ".bag.zst") + error_message = "Event Grid subscription must filter on .bag.zst suffix." + } + + assert { + condition = azurerm_eventgrid_system_topic_event_subscription.raw_blob_created.subject_filter[0].subject_begins_with == "/blobServices/default/containers/datasets/blobs/raw/" + error_message = "Event Grid subscription must use the ADLS Gen2 HNS subject prefix for the platform datasets/raw/ path." 
+ } +} + +// ============================================================ +// Fabric Capacity / Workspace +// ============================================================ + +run "fabric_capacity_created" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + should_create_fabric_capacity = true + should_create_fabric_workspace = true + fabric_capacity_sku = "F2" + } + + assert { + condition = length(azurerm_fabric_capacity.this) == 1 + error_message = "Fabric capacity must be created when should_create_fabric_capacity is true." + } + + assert { + condition = length(fabric_workspace.this) == 1 + error_message = "Fabric workspace must be created when should_create_fabric_workspace is true." + } + + assert { + condition = length(data.fabric_capacity.created) == 1 + error_message = "Deferred data.fabric_capacity.created lookup must resolve when capacity creation is enabled." + } +} + +run "fabric_capacity_reused" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + should_create_fabric_capacity = false + should_create_fabric_workspace = true + } + + assert { + condition = length(azurerm_fabric_capacity.this) == 0 + error_message = "Fabric capacity must not be created when reusing an existing capacity." 
+ } +} + +// ============================================================ +// Fabric SP Permissions +// ============================================================ + +run "fabric_sp_permissions_created" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + fabric_workspace_sp_object_id = "00000000-0000-0000-0000-000000000099" + } + + assert { + condition = length(azurerm_role_assignment.fabric_sp_datasets_reader) == 1 + error_message = "Fabric SP datasets Reader role assignment must be created when fabric_workspace_sp_object_id is set." + } + + assert { + condition = azurerm_role_assignment.fabric_sp_datasets_reader[0].role_definition_name == "Storage Blob Data Reader" + error_message = "Fabric SP must be granted Storage Blob Data Reader at container scope." + } + + assert { + condition = azurerm_role_assignment.fabric_sp_datasets_reader[0].scope == run.setup.datasets_container.id + error_message = "Fabric SP Reader role must be scoped to the datasets container." + } + + assert { + condition = length(azurerm_storage_data_lake_gen2_path.fabric_converted) == 1 + error_message = "Fabric SP converted/ ACL path must be created when fabric_workspace_sp_object_id is set." + } + + assert { + condition = azurerm_storage_data_lake_gen2_path.fabric_converted[0].path == "converted" + error_message = "Fabric SP converted/ ACL must target the converted directory." 
+ } +} + +run "fabric_sp_permissions_skipped_when_unset" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + } + + assert { + condition = length(azurerm_role_assignment.fabric_sp_datasets_reader) == 0 + error_message = "Fabric SP Reader role must not be created when fabric_workspace_sp_object_id is null." + } + + assert { + condition = length(azurerm_storage_data_lake_gen2_path.fabric_converted) == 0 + error_message = "Fabric SP converted/ ACL must not be created when fabric_workspace_sp_object_id is null." + } +} + +// ============================================================ +// Variable Validation +// ============================================================ + +run "invalid_sku_rejected" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + fabric_capacity_sku = "F1" + } + + expect_failures = [var.fabric_capacity_sku] +} + +run "empty_suffix_filters_rejected" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + data_lake_storage_account = run.setup.data_lake_storage_account + datasets_container = run.setup.datasets_container + raw_blob_suffix_filters = [] + } + + expect_failures = [var.raw_blob_suffix_filters] +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/tests/setup/main.tf 
b/infrastructure/terraform/modules/conversion-pipeline/tests/setup/main.tf new file mode 100644 index 00000000..e38e5724 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/tests/setup/main.tf @@ -0,0 +1,79 @@ +// Setup module for conversion-pipeline tests +// Generates synthetic IDs/values matching the dependency object schemas +// expected by the module's variables.deps.tf. + +terraform { + required_version = ">= 1.9.8, < 2.0" + + required_providers { + random = { + source = "hashicorp/random" + version = ">= 3.6.0" + } + } +} + +resource "random_string" "prefix" { + length = 4 + special = false + upper = false + numeric = false +} + +locals { + subscription_id_part = "/subscriptions/00000000-0000-0000-0000-000000000000" + resource_prefix = "t${random_string.prefix.id}" + environment = "dev" + instance = "001" + location = "westus3" + resource_group_name = "rg-${local.resource_prefix}-${local.environment}-${local.instance}" + resource_group_id = "${local.subscription_id_part}/resourceGroups/${local.resource_group_name}" + data_lake_account_name = "stdl${local.resource_prefix}${local.environment}${local.instance}" + data_lake_account_id = "${local.resource_group_id}/providers/Microsoft.Storage/storageAccounts/${local.data_lake_account_name}" + datasets_container_id = "${local.data_lake_account_id}/blobServices/default/containers/datasets" +} + +output "resource_prefix" { + description = "Generated resource naming prefix for test isolation." + value = local.resource_prefix +} + +output "environment" { + description = "Environment identifier for test configuration." + value = local.environment +} + +output "instance" { + description = "Instance identifier for test configuration." + value = local.instance +} + +output "location" { + description = "Azure region for test resources." + value = local.location +} + +output "resource_group" { + description = "Mock resource group object." 
+ value = { + id = local.resource_group_id + name = local.resource_group_name + location = local.location + } +} + +output "data_lake_storage_account" { + description = "Mock platform-owned data-lake (stdl...) account." + value = { + id = local.data_lake_account_id + name = local.data_lake_account_name + } +} + +output "datasets_container" { + description = "Mock datasets container on the data-lake account." + value = { + id = local.datasets_container_id + name = "datasets" + } +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/variables.core.tf b/infrastructure/terraform/modules/conversion-pipeline/variables.core.tf new file mode 100644 index 00000000..dab088c2 --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/variables.core.tf @@ -0,0 +1,37 @@ +/** + * # Core Variables + * + * Standard variables consistent across all modules: environment, resource_prefix, + * instance, resource_group, and an optional location override. + */ + +variable "environment" { + type = string + description = "Environment for all resources in this module: dev, staging, or prod" +} + +variable "resource_prefix" { + type = string + description = "Prefix for all resources in this module" +} + +variable "instance" { + type = string + description = "Instance identifier for naming resources: 001, 002, etc" + default = "001" +} + +variable "resource_group" { + type = object({ + id = string + name = string + location = string + }) + description = "Resource group object containing name, id, and location" +} + +variable "location" { + type = string + description = "Override location for module resources. 
Defaults to var.resource_group.location when null" + default = null +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/variables.deps.tf b/infrastructure/terraform/modules/conversion-pipeline/variables.deps.tf new file mode 100644 index 00000000..c88181ba --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/variables.deps.tf @@ -0,0 +1,21 @@ +/** + * # Dependency Variables + * + * Resources provided by the platform module as typed object dependencies. + */ + +variable "data_lake_storage_account" { + type = object({ + id = string + name = string + }) + description = "Platform-owned ADLS Gen2 data-lake account (stdl...) used as the durable raw -> converted store" +} + +variable "datasets_container" { + type = object({ + id = string + name = string + }) + description = "Datasets container on the platform-owned data-lake account. Its id scopes the Fabric SP Storage Blob Data Reader role assignment and its name is the filesystem that holds the converted/ folder ACL" +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/variables.tf b/infrastructure/terraform/modules/conversion-pipeline/variables.tf new file mode 100644 index 00000000..10bd4e0c --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/variables.tf @@ -0,0 +1,76 @@ +/** + * # Module Variables + * + * Conversion-pipeline-specific knobs: Event Grid filters, Fabric capacity + * sizing, and downstream subscriber wiring. Storage shape, lifecycle, and + * private-endpoint inputs are intentionally absent: durable storage is owned + * by the platform module's data-lake account. 
+ */ + +/* + * Event Grid + */ + +variable "should_enable_event_grid_dead_letter" { + type = bool + description = "Whether to enable an Event Grid dead-letter destination backed by an in-account container" + default = true +} + +variable "raw_blob_suffix_filters" { + type = list(string) + description = "Suffix filters used by the Event Grid subscription's advanced_filter.string_ends_with on the raw container" + default = [".bag", ".bag.zst", ".mcap"] + + validation { + condition = length(var.raw_blob_suffix_filters) > 0 + error_message = "raw_blob_suffix_filters must contain at least one suffix." + } +} + +variable "conversion_subscriber_url" { + type = string + description = "Optional webhook URL for the downstream conversion subscriber. When null, the subscription is created without a webhook destination (DLQ-only) until the conversion compute lands" + default = null +} + +/* + * Microsoft Fabric + */ + +variable "should_create_fabric_capacity" { + type = bool + description = "Whether to provision a new Fabric capacity" + default = true +} + +variable "should_create_fabric_workspace" { + type = bool + description = "Whether to provision a Fabric workspace bound to the Fabric capacity. The workspace's capacity_id is resolved at apply time from a deferred data \"fabric_capacity\" lookup keyed on the capacity display name" + default = true +} + +variable "fabric_capacity_sku" { + type = string + description = "SKU for the Fabric capacity (F2 through F2048). Only used when should_create_fabric_capacity is true" + default = "F2" + + validation { + condition = contains([ + "F2", "F4", "F8", "F16", "F32", "F64", "F128", "F256", "F512", "F1024", "F2048" + ], var.fabric_capacity_sku) + error_message = "fabric_capacity_sku must be one of F2, F4, F8, F16, F32, F64, F128, F256, F512, F1024, F2048." 
+ } +} + +variable "fabric_admin_members" { + type = list(string) + description = "Entra UPNs or object IDs that should be granted Fabric capacity administration" + default = [] +} + +variable "fabric_workspace_sp_object_id" { + type = string + description = "Object ID of the Fabric workspace service principal. When provided, the SP is granted Storage Blob Data Reader on the datasets container plus an ADLS Gen2 ACL granting rwx on the converted/ folder" + default = null +} diff --git a/infrastructure/terraform/modules/conversion-pipeline/versions.tf b/infrastructure/terraform/modules/conversion-pipeline/versions.tf new file mode 100644 index 00000000..8a4d569b --- /dev/null +++ b/infrastructure/terraform/modules/conversion-pipeline/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.9.8, < 2.0" + + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = ">= 4.51.0" + } + fabric = { + source = "microsoft/fabric" + version = ">= 1.3.0" + } + } +} diff --git a/infrastructure/terraform/modules/dataviewer/TERRAFORM.md b/infrastructure/terraform/modules/dataviewer/TERRAFORM.md index b42a0f65..32325c8f 100644 --- a/infrastructure/terraform/modules/dataviewer/TERRAFORM.md +++ b/infrastructure/terraform/modules/dataviewer/TERRAFORM.md @@ -2,7 +2,7 @@ title: Dataviewer Module description: Deploys the dataviewer application on Azure Container Apps with networking, identity, and app-level resources. author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/modules/platform/TERRAFORM.md b/infrastructure/terraform/modules/platform/TERRAFORM.md index 64e46018..1a84e9b5 100644 --- a/infrastructure/terraform/modules/platform/TERRAFORM.md +++ b/infrastructure/terraform/modules/platform/TERRAFORM.md @@ -2,7 +2,7 @@ title: Platform Module description: Deploys shared Azure infrastructure services for robotics ML workloads. 
Resources include: networking, DNS zones, security, observability, ACR, storage, ML workspace. Optional: PostgreSQL and Redis for OSMO workloads. author: Microsoft Robotics-AI Team -ms.date: 2026-04-17 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/modules/platform/outputs.tf b/infrastructure/terraform/modules/platform/outputs.tf index 222263e2..b0bb777f 100644 --- a/infrastructure/terraform/modules/platform/outputs.tf +++ b/infrastructure/terraform/modules/platform/outputs.tf @@ -165,6 +165,14 @@ output "data_lake_storage_account" { } : null } +output "datasets_container" { + description = "Datasets container on the data lake storage account. Null when data lake is disabled" + value = var.should_create_data_lake_storage ? { + id = azurerm_storage_container.datasets[0].id + name = azurerm_storage_container.datasets[0].name + } : null +} + output "data_lake_storage_account_access" { description = "Data lake storage account access credentials. Null when data lake is disabled" value = var.should_create_data_lake_storage ? { diff --git a/infrastructure/terraform/modules/sil/TERRAFORM.md b/infrastructure/terraform/modules/sil/TERRAFORM.md index 7dbd7351..b2605a96 100644 --- a/infrastructure/terraform/modules/sil/TERRAFORM.md +++ b/infrastructure/terraform/modules/sil/TERRAFORM.md @@ -2,7 +2,7 @@ title: SiL Module (Software-in-the-Loop) description: Deploys AKS-specific infrastructure for robotics ML workloads with GPU node pools, AzureML integration, and observability. 
author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/modules/vpn/TERRAFORM.md b/infrastructure/terraform/modules/vpn/TERRAFORM.md index 2cabd029..e42dce92 100644 --- a/infrastructure/terraform/modules/vpn/TERRAFORM.md +++ b/infrastructure/terraform/modules/vpn/TERRAFORM.md @@ -2,7 +2,7 @@ title: VPN Gateway Module description: Deploys Azure VPN Gateway for Point-to-Site and Site-to-Site connectivity. Creates GatewaySubnet within the platform's virtual network. author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference --- diff --git a/infrastructure/terraform/outputs.tf b/infrastructure/terraform/outputs.tf index 64408ec4..7cb57e83 100644 --- a/infrastructure/terraform/outputs.tf +++ b/infrastructure/terraform/outputs.tf @@ -196,3 +196,32 @@ output "osmo_workload_identity" { description = "OSMO workload identity for deployment scripts" value = module.platform.osmo_workload_identity } + +// ============================================================ +// Conversion Pipeline Outputs (Optional) +// ============================================================ + +output "conversion_pipeline_event_grid_topic" { + description = "Conversion pipeline Event Grid system topic. Null when conversion pipeline is disabled." + value = try(module.conversion_pipeline[0].event_grid_topic, null) +} + +output "conversion_pipeline_event_grid_subscription" { + description = "Conversion pipeline Event Grid subscription. Null when conversion pipeline is disabled." + value = try(module.conversion_pipeline[0].event_grid_subscription, null) +} + +output "conversion_pipeline_event_grid_dlq_container" { + description = "Conversion pipeline Event Grid dead-letter container. Null when DLQ is disabled or pipeline is disabled." 
+ value = try(module.conversion_pipeline[0].event_grid_dlq_container, null) +} + +output "conversion_pipeline_fabric_workspace" { + description = "Conversion pipeline Microsoft Fabric workspace. Null when conversion pipeline is disabled." + value = try(module.conversion_pipeline[0].fabric_workspace, null) +} + +output "conversion_pipeline_fabric_capacity" { + description = "Conversion pipeline Microsoft Fabric capacity. Null when capacity is reused or pipeline is disabled." + value = try(module.conversion_pipeline[0].fabric_capacity, null) +} diff --git a/infrastructure/terraform/tests/precondition.tftest.hcl b/infrastructure/terraform/tests/precondition.tftest.hcl new file mode 100644 index 00000000..6337de10 --- /dev/null +++ b/infrastructure/terraform/tests/precondition.tftest.hcl @@ -0,0 +1,82 @@ +// Root precondition tests +// Validates that should_deploy_conversion_pipeline = true requires +// should_create_data_lake_storage = true. The check lives on a sibling +// terraform_data resource because module call blocks do not support +// lifecycle.precondition directly. + +mock_provider "azurerm" { + override_during = plan +} +mock_provider "azuread" { + override_during = plan +} +mock_provider "azapi" { + override_during = plan +} +mock_provider "msgraph" { + override_during = plan +} +mock_provider "tls" { + override_during = plan +} +mock_provider "random" { + override_during = plan +} + +override_data { + target = module.platform.data.azurerm_client_config.current + values = { + tenant_id = "00000000-0000-0000-0000-000000000000" + } +} + +// Bypass sil module count expressions that depend on platform try() outputs. 
+override_module { + target = module.sil + outputs = { + aks_subnets = { + aks = { + id = "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg-test/providers/Microsoft.Network/virtualNetworks/vnet-test/subnets/snet-aks" + name = "snet-aks" + } + } + aks_cluster = { + id = "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/rg-test/providers/Microsoft.ContainerService/managedClusters/aks-test" + name = "aks-test" + fqdn = "aks-test-dns.hcp.westus3.azmk8s.io" + kubelet_identity = null + node_resource_group = "MC_rg-test_aks-test_westus3" + } + aks_oidc_issuer_url = "https://westus3.oic.prod-aks.azure.com/00000000-0000-0000-0000-000000000000/" + gpu_node_pool_subnets = {} + node_pools = {} + } +} + +run "setup" { + module { + source = "./tests/setup" + } +} + +// ============================================================ +// Precondition: conversion pipeline requires platform data lake +// ============================================================ + +run "precondition_requires_data_lake" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + should_create_resource_group = true + should_deploy_conversion_pipeline = true + should_create_data_lake_storage = false + } + + expect_failures = [ + terraform_data.conversion_pipeline_precondition, + ] +} diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 3a4f85a0..e6911ce1 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -529,3 +529,37 @@ variable "should_include_aks_dns_zone" { description = "Whether to include the AKS private DNS zone in core DNS zones" default = true } + +/* + * Conversion Pipeline Configuration - Optional + * + * The conversion pipeline module is opt-in. 
When should_deploy_conversion_pipeline + * is false (default), no conversion-pipeline resources are created and the + * conversion_pipeline_config object's fields go unused. When true, the module + * provisions an Event Grid system topic + subscription on the platform-owned + * data-lake account, an in-account dead-letter container, the Microsoft Fabric + * capacity + workspace, and Fabric SP RBAC. Durable storage (raw -> converted) + * lives on the platform module's data-lake account; should_create_data_lake_storage + * must be true (enforced by terraform_data.conversion_pipeline_precondition). + */ + +variable "should_deploy_conversion_pipeline" { + type = bool + description = "Whether to deploy the conversion-pipeline module (raw -> converted ingest with Event Grid + Fabric)" + default = false +} + +variable "conversion_pipeline_config" { + type = object({ + should_enable_event_grid_dead_letter = optional(bool, true) + raw_blob_suffix_filters = optional(list(string), [".bag", ".bag.zst", ".mcap"]) + conversion_subscriber_url = optional(string, null) + should_create_fabric_capacity = optional(bool, true) + should_create_fabric_workspace = optional(bool, true) + fabric_capacity_sku = optional(string, "F2") + fabric_admin_members = optional(list(string), []) + fabric_workspace_sp_object_id = optional(string, null) + }) + description = "Conversion pipeline module configuration.
Only consumed when should_deploy_conversion_pipeline is true" + default = {} +} diff --git a/infrastructure/terraform/versions.tf b/infrastructure/terraform/versions.tf index 54e518d4..9209d136 100644 --- a/infrastructure/terraform/versions.tf +++ b/infrastructure/terraform/versions.tf @@ -20,6 +20,10 @@ terraform { source = "hashicorp/tls" version = ">= 4.0.6" } + fabric = { + source = "microsoft/fabric" + version = ">= 1.3.0" + } } required_version = ">= 1.9.8, < 2.0" } diff --git a/infrastructure/terraform/vpn/TERRAFORM.md b/infrastructure/terraform/vpn/TERRAFORM.md index 79916ed3..f559a0cd 100644 --- a/infrastructure/terraform/vpn/TERRAFORM.md +++ b/infrastructure/terraform/vpn/TERRAFORM.md @@ -2,7 +2,7 @@ title: VPN Gateway Standalone Configuration description: Deploys VPN Gateway for Point-to-Site and Site-to-Site connectivity using data sources to reference existing platform infrastructure. author: Microsoft Robotics-AI Team -ms.date: 2026-04-08 +ms.date: 2026-04-28 ms.topic: reference ---