diff --git a/evaluation/.amlignore b/evaluation/.amlignore new file mode 100644 index 00000000..8508433e --- /dev/null +++ b/evaluation/.amlignore @@ -0,0 +1,42 @@ +# Azure ML ignore file - uses gitignore syntax +# Place in code root to exclude files from job snapshots + +# Python bytecode and cache +__pycache__/ +*.py[cod] +*$py.class +*.pyc +*.pyo + +# Virtual environments and dotenv files (a .env file may hold secrets) +.venv/ +venv/ +env/ +.env + +# Testing and coverage +.pytest_cache/ +.mypy_cache/ +.coverage +htmlcov/ +*.cover + +# Build artifacts +*.egg-info/ +dist/ +build/ +*.egg + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Temp folders used in OSMO jobs +.tmp/ diff --git a/evaluation/sil/scripts/submit-azureml-validation.sh b/evaluation/sil/scripts/submit-azureml-validation.sh index 4977df1b..d7dbe0a1 100755 --- a/evaluation/sil/scripts/submit-azureml-validation.sh +++ b/evaluation/sil/scripts/submit-azureml-validation.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Submit Azure ML validation job using training/rl/ as the code directory +# Submit Azure ML validation job using evaluation/ as the code directory # The .amlignore file controls which files are excluded from the code snapshot set -o errexit -o nounset @@ -40,7 +40,7 @@ VALIDATION OPTIONS: --gui Disable headless mode AZURE CONTEXT: - --job-file PATH Job YAML template (default: workflows/azureml/validate.yaml) + --job-file PATH Job YAML template (default: evaluation/sil/workflows/azureml/validate.yaml) --compute TARGET Compute target override --instance-type TYPE Instance type (default: gpuspot) --experiment-name NAME Experiment name override @@ -109,7 +109,7 @@ subscription_id="${AZURE_SUBSCRIPTION_ID:-$(get_subscription_id)}" resource_group="${AZURE_RESOURCE_GROUP:-$(get_resource_group)}" workspace_name="${AZUREML_WORKSPACE_NAME:-$(get_azureml_workspace)}" -job_file="$REPO_ROOT/workflows/azureml/validate.yaml" +job_file="$REPO_ROOT/evaluation/sil/workflows/azureml/validate.yaml" 
compute="${AZUREML_COMPUTE:-$(get_compute_target)}" instance_type="gpuspot" experiment_name="" @@ -166,9 +166,11 @@ if [[ -z "$model_name" ]]; then info "Auto-derived model name: $model_name" fi -code_path="$REPO_ROOT" -[[ -d "$code_path/training" ]] || fatal "Training source not found: $code_path/training" -[[ -f "$code_path/training/.amlignore" ]] || warn "No .amlignore found; __pycache__ may be included in snapshot" +code_path="$REPO_ROOT/evaluation" +[[ -d "$code_path/sil" ]] || fatal "SIL evaluation source not found: $code_path/sil" +# Fail fast locally if the entrypoint run by the AML job command (evaluation/sil/validate.sh) is missing. +[[ -f "$code_path/sil/validate.sh" ]] || fatal "SIL validation entrypoint not found: $code_path/sil/validate.sh" +[[ -f "$code_path/.amlignore" ]] || warn "No evaluation/.amlignore found; the AML snapshot may include unrelated files" if [[ "$config_preview" == "true" ]]; then section "Configuration Preview" @@ -268,8 +270,10 @@ cmd="$cmd --success-threshold \${{inputs.success_threshold}}" [[ "$headless" == "true" ]] && cmd="$cmd --headless" +# AML snapshots evaluation/ as the code root, so recreate the top-level evaluation path +# expected by the shell entrypoint and Python imports inside the job container. az_args+=( - --set "command=bash training/scripts/validate.sh $cmd" + --set "command=if [ ! -e evaluation ]; then ln -s . evaluation; fi && bash evaluation/sil/validate.sh $cmd" --set "inputs.task=${task:-auto}" --set "inputs.framework=${framework:-auto}" --set "inputs.success_threshold=${threshold:--1.0}" diff --git a/infrastructure/terraform/terraform.tfvars.example b/infrastructure/terraform/terraform.tfvars.example index 9e2ef0b6..fa5f8b4e 100644 --- a/infrastructure/terraform/terraform.tfvars.example +++ b/infrastructure/terraform/terraform.tfvars.example @@ -92,7 +92,7 @@ node_pools = { // and its Public IP. To preserve existing zonal behavior, set nat_gateway_zones to your current // zone (e.g. ["1"]) explicitly before applying. Run `terraform plan` to confirm no unintended // NAT/Public IP recreation. 
-nat_gateway_zones = [] +nat_gateway_zones = ["1"] // OSMO Backend Services should_deploy_postgresql = true diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 1e4c4977..ddb56526 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -288,6 +288,18 @@ variable "nat_gateway_zones" { type = list(string) description = "Availability zones for NAT Gateway and its public IP. Set to [\"1\"] in regions with AZ support. Leave empty for regions without AZ support (e.g. westus)" default = ["1"] + + # Only the zone identifiers "1", "2", and "3" are accepted; an empty list passes. + validation { + condition = alltrue([for z in var.nat_gateway_zones : contains(["1", "2", "3"], z)]) + error_message = "Each zone in nat_gateway_zones must be \"1\", \"2\", or \"3\"." + } + + # Reject lists that name the same zone more than once. + validation { + condition = length(var.nat_gateway_zones) == length(distinct(var.nat_gateway_zones)) + error_message = "The nat_gateway_zones list must not contain duplicate zones." + } } variable "should_create_vm_subnet" {