diff --git a/infrastructure/terraform/README.md b/infrastructure/terraform/README.md index fe1d148d..c29b6bdb 100644 --- a/infrastructure/terraform/README.md +++ b/infrastructure/terraform/README.md @@ -31,6 +31,14 @@ cp terraform.tfvars.example terraform.tfvars terraform init && terraform apply ``` +## ⚙️ Optional AML diagnostics + +Set `should_enable_aml_diagnostic_logs = true` in `terraform.tfvars` to create an AML workspace diagnostic setting that sends all AML resource logs to the platform Log Analytics workspace. The default is `false`. + +```hcl +should_enable_aml_diagnostic_logs = true +``` + ## 📖 Documentation | Guide | Description | diff --git a/infrastructure/terraform/TERRAFORM.md b/infrastructure/terraform/TERRAFORM.md index 281a819f..6a99e122 100644 --- a/infrastructure/terraform/TERRAFORM.md +++ b/infrastructure/terraform/TERRAFORM.md @@ -75,6 +75,7 @@ Architecture: | should\_add\_current\_user\_storage\_blob | Whether to add the current user as Storage Blob Data Contributor | `bool` | `true` | no | | should\_create\_resource\_group | Whether to create the resource group for the robotics infrastructure | `bool` | `true` | no | | should\_deploy\_aml\_compute | Whether to deploy an AzureML managed compute cluster for GPU workloads | `bool` | `false` | no | +| should\_enable\_aml\_diagnostic\_logs | Whether to enable AML workspace diagnostic logs in Log Analytics | `bool` | `false` | no | | should\_deploy\_ampls | Whether to deploy Azure Monitor Private Link Scope and its private endpoint | `bool` | `true` | no | | should\_deploy\_dce | Whether to deploy Data Collection Endpoint for observability | `bool` | `true` | no | | should\_deploy\_grafana | Whether to deploy Azure Managed Grafana dashboard | `bool` | `true` | no | diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 7d4f257f..415b820c 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -134,8 +134,9 @@ module "platform" { should_deploy_dce = var.should_deploy_dce // AzureML compute - should_deploy_aml_compute = var.should_deploy_aml_compute - aml_compute_config = var.aml_compute_config + should_enable_aml_diagnostic_logs = var.should_enable_aml_diagnostic_logs + should_deploy_aml_compute = var.should_deploy_aml_compute + aml_compute_config = var.aml_compute_config // DNS zone flags should_include_aks_dns_zone = var.should_include_aks_dns_zone diff --git a/infrastructure/terraform/modules/platform/TERRAFORM.md b/infrastructure/terraform/modules/platform/TERRAFORM.md index 568fc9ef..39166704 100644 --- a/infrastructure/terraform/modules/platform/TERRAFORM.md +++ b/infrastructure/terraform/modules/platform/TERRAFORM.md @@ -44,6 +44,7 @@ Optional: PostgreSQL and Redis for OSMO workloads. | [azurerm_machine_learning_compute_cluster.gpu](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/machine_learning_compute_cluster) | resource | | [azurerm_managed_redis.main](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/managed_redis) | resource | | [azurerm_monitor_data_collection_endpoint.main](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_data_collection_endpoint) | resource | +| [azurerm_monitor_diagnostic_setting.ml_workspace_logs](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_diagnostic_setting) | resource | | [azurerm_monitor_private_link_scope.main](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_private_link_scope) | resource | | [azurerm_monitor_private_link_scoped_service.ai](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_private_link_scoped_service) | resource | | [azurerm_monitor_private_link_scoped_service.dce](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_private_link_scoped_service) | resource | @@ -118,6 +119,7 @@ Optional: PostgreSQL and Redis for OSMO workloads. | should\_add\_current\_user\_key\_vault\_admin | Whether to add the current user as Key Vault Secrets Officer | `bool` | `true` | no | | should\_add\_current\_user\_storage\_blob | Whether to add the current user as Storage Blob Data Contributor | `bool` | `true` | no | | should\_deploy\_aml\_compute | Whether to deploy an AzureML managed compute cluster for GPU workloads | `bool` | `false` | no | +| should\_enable\_aml\_diagnostic\_logs | Whether to enable AML workspace diagnostic logs in Log Analytics | `bool` | `false` | no | | should\_deploy\_ampls | Whether to deploy Azure Monitor Private Link Scope and its private endpoint | `bool` | `true` | no | | should\_deploy\_dce | Whether to deploy Data Collection Endpoint for observability | `bool` | `true` | no | | should\_deploy\_grafana | Whether to deploy Azure Managed Grafana dashboard | `bool` | `true` | no | diff --git a/infrastructure/terraform/modules/platform/azureml.tf b/infrastructure/terraform/modules/platform/azureml.tf index 7efc6437..2bddb897 100644 --- a/infrastructure/terraform/modules/platform/azureml.tf +++ b/infrastructure/terraform/modules/platform/azureml.tf @@ -89,6 +89,23 @@ resource "azurerm_private_endpoint" "azureml_api" { } } +resource "azurerm_monitor_diagnostic_setting" "ml_workspace_logs" { + count = var.should_enable_aml_diagnostic_logs ? 1 : 0 + + name = "diag-mlw-${local.resource_name_suffix}" + target_resource_id = azapi_resource.ml_workspace.id + log_analytics_workspace_id = azurerm_log_analytics_workspace.main.id + + enabled_log { + category_group = "allLogs" + } + + metric { + category = "AllMetrics" + enabled = false + } +} + // ============================================================ // AzureML Managed Compute Cluster (Optional) // ============================================================ diff --git a/infrastructure/terraform/modules/platform/tests/conditionals.tftest.hcl b/infrastructure/terraform/modules/platform/tests/conditionals.tftest.hcl index 28259014..ed79b873 100644 --- a/infrastructure/terraform/modules/platform/tests/conditionals.tftest.hcl +++ b/infrastructure/terraform/modules/platform/tests/conditionals.tftest.hcl @@ -601,6 +601,58 @@ run "osmo_identity_disabled" { } } +// ============================================================ +// AML Diagnostic Logs Conditional +// ============================================================ + +run "aml_diagnostic_logs_enabled" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + current_user_oid = run.setup.current_user_oid + should_enable_aml_diagnostic_logs = true + } + + assert { + condition = length(azurerm_monitor_diagnostic_setting.ml_workspace_logs) == 1 + error_message = "AML diagnostic setting should be created when enabled" + } + + assert { + condition = azurerm_monitor_diagnostic_setting.ml_workspace_logs[0].name == "diag-mlw-${run.setup.resource_prefix}-${run.setup.environment}-${run.setup.instance}" + error_message = "AML diagnostic setting should use the standard diagnostic setting name" + } + + assert { + condition = one(azurerm_monitor_diagnostic_setting.ml_workspace_logs[0].enabled_log).category_group == "allLogs" + error_message = "AML diagnostic setting should enable all AML log categories" + } +} + +run "aml_diagnostic_logs_disabled" { + command = plan + + variables { + resource_prefix = run.setup.resource_prefix + environment = run.setup.environment + instance = run.setup.instance + location = run.setup.location + resource_group = run.setup.resource_group + current_user_oid = run.setup.current_user_oid + should_enable_aml_diagnostic_logs = false + } + + assert { + condition = length(azurerm_monitor_diagnostic_setting.ml_workspace_logs) == 0 + error_message = "AML diagnostic setting should not be created when disabled" + } +} + // ============================================================ // AML Compute Conditional // ============================================================ diff --git a/infrastructure/terraform/modules/platform/tests/defaults.tftest.hcl b/infrastructure/terraform/modules/platform/tests/defaults.tftest.hcl index fd518bb9..63597246 100644 --- a/infrastructure/terraform/modules/platform/tests/defaults.tftest.hcl +++ b/infrastructure/terraform/modules/platform/tests/defaults.tftest.hcl @@ -89,6 +89,12 @@ run "verify_defaults" { error_message = "AML compute cluster should NOT be created by default" } + // AML diagnostic logs NOT enabled by default + assert { + condition = length(azurerm_monitor_diagnostic_setting.ml_workspace_logs) == 0 + error_message = "AML diagnostic setting should NOT be created by default" + } + // OSMO identity enabled by default assert { condition = length(azurerm_user_assigned_identity.osmo) == 1 diff --git a/infrastructure/terraform/modules/platform/variables.tf b/infrastructure/terraform/modules/platform/variables.tf index 37460917..d30af25f 100644 --- a/infrastructure/terraform/modules/platform/variables.tf +++ b/infrastructure/terraform/modules/platform/variables.tf @@ -282,6 +282,12 @@ variable "should_deploy_dce" { * AzureML Compute Configuration */ +variable "should_enable_aml_diagnostic_logs" { + type = bool + description = "Whether to enable AML workspace diagnostic logs in Log Analytics" + default = false +} + variable "should_deploy_aml_compute" { type = bool description = "Whether to deploy an AzureML managed compute cluster for GPU workloads" diff --git a/infrastructure/terraform/terraform.tfvars.example b/infrastructure/terraform/terraform.tfvars.example index 235e80e8..302ed123 100644 --- a/infrastructure/terraform/terraform.tfvars.example +++ b/infrastructure/terraform/terraform.tfvars.example @@ -134,6 +134,7 @@ reports_archive_tier_days = 180 // should_deploy_monitor_workspace = true // should_deploy_ampls = true // should_deploy_dce = true +// should_enable_aml_diagnostic_logs = false // AzureML Managed Compute (disabled by default) // Enable to provision GPU compute directly in the AzureML workspace. diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 9a6caf1c..d13b6a0f 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -468,6 +468,12 @@ variable "should_deploy_dce" { * AzureML Compute Configuration - Optional */ +variable "should_enable_aml_diagnostic_logs" { + type = bool + description = "Whether to enable AML workspace diagnostic logs in Log Analytics" + default = false +} + variable "should_deploy_aml_compute" { type = bool description = "Whether to deploy an AzureML managed compute cluster for GPU workloads"