Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions evaluation/.amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Azure ML ignore file - uses gitignore syntax
# Place in code root to exclude files from job snapshots

# Python bytecode and cache
# (*.py[cod] already matches *.pyc, *.pyo, and *.pyd, so they are not listed separately)
__pycache__/
*.py[cod]
*$py.class

# Virtual environments (directories only; the trailing slash does not match plain files)
.venv/
venv/
env/
.env/

# Testing and coverage
.pytest_cache/
.mypy_cache/
.coverage
htmlcov/
*.cover

# Build artifacts
*.egg-info/
dist/
build/
*.egg

# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*~

# OS files
.DS_Store
Thumbs.db

# Temp folders used in OSMO jobs
.tmp/
14 changes: 8 additions & 6 deletions evaluation/sil/scripts/submit-azureml-validation.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Submit Azure ML validation job using training/rl/ as the code directory
# Submit Azure ML validation job using evaluation/ as the code directory
# The .amlignore file controls which files are excluded from the code snapshot
set -o errexit -o nounset

Expand Down Expand Up @@ -108,7 +108,7 @@ subscription_id="${AZURE_SUBSCRIPTION_ID:-$(get_subscription_id)}"
resource_group="${AZURE_RESOURCE_GROUP:-$(get_resource_group)}"
workspace_name="${AZUREML_WORKSPACE_NAME:-$(get_azureml_workspace)}"

job_file="$REPO_ROOT/workflows/azureml/validate.yaml"
job_file="$REPO_ROOT/evaluation/sil/workflows/azureml/validate.yaml"
compute="${AZUREML_COMPUTE:-$(get_compute_target)}"
instance_type="gpuspot"
experiment_name=""
Expand Down Expand Up @@ -165,9 +165,9 @@ if [[ -z "$model_name" ]]; then
info "Auto-derived model name: $model_name"
fi

code_path="$REPO_ROOT"
[[ -d "$code_path/training" ]] || fatal "Training source not found: $code_path/training"
[[ -f "$code_path/training/.amlignore" ]] || warn "No .amlignore found; __pycache__ may be included in snapshot"
code_path="$REPO_ROOT/evaluation"
[[ -d "$code_path/sil" ]] || fatal "SIL evaluation source not found: $code_path/sil"
[[ -f "$code_path/.amlignore" ]] || warn "No evaluation/.amlignore found; the AML snapshot may include unrelated files"

if [[ "$config_preview" == "true" ]]; then
section "Configuration Preview"
Expand Down Expand Up @@ -267,8 +267,10 @@ cmd="$cmd --success-threshold \${{inputs.success_threshold}}"

[[ "$headless" == "true" ]] && cmd="$cmd --headless"

# AML snapshots evaluation/ as the code root, so recreate the top-level evaluation path
# expected by the shell entrypoint and Python imports inside the job container.
az_args+=(
--set "command=bash training/scripts/validate.sh $cmd"
--set "command=if [ ! -e evaluation ]; then ln -s . evaluation; fi && bash evaluation/sil/validate.sh $cmd"
--set "inputs.task=${task:-auto}"
--set "inputs.framework=${framework:-auto}"
--set "inputs.success_threshold=${threshold:--1.0}"
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
* and optional Azure Machine Learning integration.
*
* Architecture:
*
* - Platform Module: Shared services (networking, security, observability, ACR, storage, ML workspace)
* - SiL Module: AKS cluster with GPU node pools and ML extension integration
*/
Expand Down Expand Up @@ -80,6 +79,7 @@ module "platform" {

// Networking configuration
should_enable_nat_gateway = var.should_enable_nat_gateway
nat_gateway_zones = var.nat_gateway_zones
should_create_vm_subnet = var.should_create_vm_subnet
virtual_network_config = {
address_space = var.virtual_network_config.address_space
Expand Down
4 changes: 2 additions & 2 deletions infrastructure/terraform/modules/platform/networking.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ resource "azurerm_public_ip" "nat_gateway" {
resource_group_name = var.resource_group.name
allocation_method = "Static"
sku = "Standard"
zones = ["1"]
zones = var.nat_gateway_zones

lifecycle {
ignore_changes = [ip_tags]
Expand All @@ -100,7 +100,7 @@ resource "azurerm_nat_gateway" "main" {
resource_group_name = var.resource_group.name
sku_name = "Standard"
idle_timeout_in_minutes = 10
zones = ["1"]
zones = var.nat_gateway_zones
}

// NAT Gateway Public IP Association
Expand Down
138 changes: 138 additions & 0 deletions infrastructure/terraform/modules/platform/tests/validation.tftest.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Platform module variable validation tests
// Validates that invalid nat_gateway_zones values are rejected by validation blocks
// and that valid values (single zone, multiple zones, empty list) plan successfully

// All providers are mocked so the runs in this file do not use real provider configurations.
mock_provider "azurerm" {}
mock_provider "azuread" {}
mock_provider "azapi" {}
mock_provider "random" {}

// Pin the client-config data source to a fixed placeholder tenant ID.
override_data {
  target = data.azurerm_client_config.current
  values = {
    tenant_id = "00000000-0000-0000-0000-000000000000"
  }
}

// File-level value for current_user_oid; the nat_gateway_zones runs below
// set it again from run.setup.current_user_oid.
variables {
  current_user_oid = "00000000-0000-0000-0000-000000000001"
}

// Loads the helper module in ./tests/setup. Later runs read its outputs
// (resource_prefix, environment, instance, location, resource_group,
// current_user_oid) through run.setup.*.
run "setup" {
  module {
    source = "./tests/setup"
  }
}

// ============================================================
// NAT Gateway Zones — Invalid Zone Value
// ============================================================

run "nat_gateway_zones_invalid_zone_rejected" {
  command = plan

  // "4" is not one of "1", "2", "3", so the variable's validation must fail.
  expect_failures = [var.nat_gateway_zones]

  variables {
    // Value under test
    nat_gateway_zones = ["4"]

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}

// ============================================================
// NAT Gateway Zones — Non-numeric Value
// ============================================================

run "nat_gateway_zones_non_numeric_rejected" {
  command = plan

  // "abc" is not one of "1", "2", "3", so the variable's validation must fail.
  expect_failures = [var.nat_gateway_zones]

  variables {
    // Value under test
    nat_gateway_zones = ["abc"]

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}

// ============================================================
// NAT Gateway Zones — Duplicate Zones
// ============================================================

run "nat_gateway_zones_duplicates_rejected" {
  command = plan

  // Each entry is an allowed zone, but the list repeats "1", so the
  // no-duplicates validation must fail.
  expect_failures = [var.nat_gateway_zones]

  variables {
    // Value under test
    nat_gateway_zones = ["1", "1"]

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}

// ============================================================
// NAT Gateway Zones — Valid Single Zone
// ============================================================

// No expect_failures here: the run passes only if the plan completes without
// a validation error for a single allowed zone.
run "nat_gateway_zones_single_zone_accepted" {
  command = plan

  variables {
    // Value under test
    nat_gateway_zones = ["2"]

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}

// ============================================================
// NAT Gateway Zones — Valid Multiple Zones
// ============================================================

// No expect_failures here: the run passes only if the plan completes without
// a validation error when all three allowed zones are listed once each.
run "nat_gateway_zones_multiple_zones_accepted" {
  command = plan

  variables {
    // Value under test
    nat_gateway_zones = ["1", "2", "3"]

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}

// ============================================================
// NAT Gateway Zones — Empty List (No AZ Support)
// ============================================================

// No expect_failures here: the run passes only if the plan completes without
// a validation error for an empty zone list (the no-AZ-support case).
run "nat_gateway_zones_empty_accepted" {
  command = plan

  variables {
    // Value under test
    nat_gateway_zones = []

    // Shared inputs supplied by the setup run
    current_user_oid = run.setup.current_user_oid
    resource_group   = run.setup.resource_group
    location         = run.setup.location
    instance         = run.setup.instance
    environment      = run.setup.environment
    resource_prefix  = run.setup.resource_prefix
  }
}
19 changes: 19 additions & 0 deletions infrastructure/terraform/modules/platform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,25 @@ variable "should_enable_nat_gateway" {
default = true
}

// WARNING: Changing zones on an existing deployment forces replacement of both the
// NAT Gateway and its Public IP. This causes a brief outbound connectivity interruption
// while Azure provisions new resources. Plan changes during a maintenance window.
variable "nat_gateway_zones" {
  description = "Availability zones for NAT Gateway and its public IP. Leave empty for regions without AZ support"
  type        = list(string)
  default     = ["1"]

  // Every entry must be one of the three zone identifiers; an empty list passes.
  validation {
    error_message = "Each zone must be \"1\", \"2\", or \"3\""
    condition = length([
      for zone in var.nat_gateway_zones : zone
      if !contains(["1", "2", "3"], zone)
    ]) == 0
  }

  // A zone may appear at most once in the list.
  validation {
    error_message = "nat_gateway_zones must not contain duplicates"
    condition     = length(distinct(var.nat_gateway_zones)) == length(var.nat_gateway_zones)
  }
}

variable "should_create_vm_subnet" {
type = bool
description = "Whether to create a dedicated subnet for virtual machines in the platform virtual network"
Expand Down
9 changes: 9 additions & 0 deletions infrastructure/terraform/terraform.tfvars.example
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ node_pools = {
// }
// }

// NAT Gateway Availability Zones
// Applies to both the NAT Gateway and its public IP.
// Set to ["1"] in AZ-supported regions (e.g. westus3, eastus2); each entry must be
// "1", "2", or "3" and may appear only once.
// Leave empty for regions without AZ support (e.g. westus)
// NOTE: The variable default is ["1"]; the empty list below overrides that default.
// NOTE: Changing zones on an existing deployment forces replacement of both the NAT Gateway
// and its Public IP. To preserve existing zonal behavior, set nat_gateway_zones to your current
// zone (e.g. ["1"]) explicitly before applying. Run `terraform plan` to confirm no unintended
// NAT/Public IP recreation.
nat_gateway_zones = []
Comment thread
kgmwang1 marked this conversation as resolved.
Outdated

// OSMO Backend Services
should_deploy_postgresql = true
should_deploy_redis = true
Expand Down
9 changes: 9 additions & 0 deletions infrastructure/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,15 @@ variable "should_enable_nat_gateway" {
default = true
}

// WARNING: Changing zones on an existing deployment forces replacement of both the
// NAT Gateway and its Public IP. This causes a brief outbound connectivity interruption
// while Azure provisions new resources. Plan changes during a maintenance window.
variable "nat_gateway_zones" {
  type        = list(string)
  description = "Availability zones for NAT Gateway and its public IP. Set to [\"1\"] in regions with AZ support. Leave empty for regions without AZ support (e.g. westus)"
  default     = ["1"]

  // Same rules as the platform module's nat_gateway_zones variable, repeated here
  // so an invalid value is reported against the root variable the user actually set.
  validation {
    condition     = alltrue([for z in var.nat_gateway_zones : contains(["1", "2", "3"], z)])
    error_message = "Each zone must be \"1\", \"2\", or \"3\""
  }

  // A zone may appear at most once in the list.
  validation {
    condition     = length(var.nat_gateway_zones) == length(distinct(var.nat_gateway_zones))
    error_message = "nat_gateway_zones must not contain duplicates"
  }
}
Comment thread
kgmwang1 marked this conversation as resolved.

variable "should_create_vm_subnet" {
type = bool
description = "Whether to create a dedicated subnet for virtual machines in the platform virtual network"
Expand Down
Loading