Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions evaluation/.amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Azure ML ignore file - uses gitignore syntax
# Place in code root to exclude files from job snapshots

# Python bytecode and cache
__pycache__/
*.py[cod]
*$py.class
*.pyc
*.pyo

# Virtual environments (directory-only patterns: `.env/` does not match a plain `.env` file)
.venv/
venv/
env/
.env/

# Testing and coverage
.pytest_cache/
.mypy_cache/
.coverage
htmlcov/
*.cover

# Build artifacts
*.egg-info/
dist/
build/
*.egg

# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*~

# OS files
.DS_Store
Thumbs.db

# Temp folders used in OSMO jobs
.tmp/
16 changes: 9 additions & 7 deletions evaluation/sil/scripts/submit-azureml-validation.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Submit Azure ML validation job using training/rl/ as the code directory
# Submit Azure ML validation job using evaluation/ as the code directory
# The .amlignore file controls which files are excluded from the code snapshot
set -o errexit -o nounset

Expand Down Expand Up @@ -40,7 +40,7 @@ VALIDATION OPTIONS:
--gui Disable headless mode

AZURE CONTEXT:
--job-file PATH Job YAML template (default: workflows/azureml/validate.yaml)
--job-file PATH Job YAML template (default: evaluation/sil/workflows/azureml/validate.yaml)
--compute TARGET Compute target override
--instance-type TYPE Instance type (default: gpuspot)
--experiment-name NAME Experiment name override
Expand Down Expand Up @@ -109,7 +109,7 @@ subscription_id="${AZURE_SUBSCRIPTION_ID:-$(get_subscription_id)}"
resource_group="${AZURE_RESOURCE_GROUP:-$(get_resource_group)}"
workspace_name="${AZUREML_WORKSPACE_NAME:-$(get_azureml_workspace)}"

job_file="$REPO_ROOT/workflows/azureml/validate.yaml"
job_file="$REPO_ROOT/evaluation/sil/workflows/azureml/validate.yaml"
compute="${AZUREML_COMPUTE:-$(get_compute_target)}"
instance_type="gpuspot"
experiment_name=""
Expand Down Expand Up @@ -166,9 +166,9 @@ if [[ -z "$model_name" ]]; then
info "Auto-derived model name: $model_name"
fi

code_path="$REPO_ROOT"
[[ -d "$code_path/training" ]] || fatal "Training source not found: $code_path/training"
[[ -f "$code_path/training/.amlignore" ]] || warn "No .amlignore found; __pycache__ may be included in snapshot"
code_path="$REPO_ROOT/evaluation"
[[ -d "$code_path/sil" ]] || fatal "SIL evaluation source not found: $code_path/sil"
[[ -f "$code_path/.amlignore" ]] || warn "No evaluation/.amlignore found; the AML snapshot may include unrelated files"

if [[ "$config_preview" == "true" ]]; then
section "Configuration Preview"
Expand Down Expand Up @@ -268,8 +268,10 @@ cmd="$cmd --success-threshold \${{inputs.success_threshold}}"

[[ "$headless" == "true" ]] && cmd="$cmd --headless"

# AML snapshots evaluation/ as the code root, so recreate the top-level evaluation path
# expected by the shell entrypoint and Python imports inside the job container.
az_args+=(
--set "command=bash training/scripts/validate.sh $cmd"
--set "command=if [ ! -e evaluation ]; then ln -s . evaluation; fi && bash evaluation/sil/validate.sh $cmd"
--set "inputs.task=${task:-auto}"
--set "inputs.framework=${framework:-auto}"
--set "inputs.success_threshold=${threshold:--1.0}"
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/terraform/terraform.tfvars.example
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ node_pools = {
// and its Public IP. To preserve existing zonal behavior, set nat_gateway_zones to your current
// zone (e.g. ["1"]) explicitly before applying. Run `terraform plan` to confirm no unintended
// NAT/Public IP recreation.
nat_gateway_zones = []
nat_gateway_zones = ["1"]

// OSMO Backend Services
should_deploy_postgresql = true
Expand Down
10 changes: 10 additions & 0 deletions infrastructure/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ variable "nat_gateway_zones" {
type = list(string)
description = "Availability zones for NAT Gateway and its public IP. Set to [\"1\"] in regions with AZ support. Leave empty for regions without AZ support (e.g. westus)"
default = ["1"]

// Reject any zone value other than "1", "2", or "3".
// An empty list passes, because alltrue() of an empty collection is true.
validation {
  condition     = alltrue([for z in var.nat_gateway_zones : contains(["1", "2", "3"], z)])
  error_message = "Each zone must be \"1\", \"2\", or \"3\"."
}

// Reject repeated zones: distinct() drops duplicates, so the two lengths
// differ whenever the list contains the same zone more than once.
validation {
  condition     = length(var.nat_gateway_zones) == length(distinct(var.nat_gateway_zones))
  error_message = "The nat_gateway_zones list must not contain duplicates."
}
}

variable "should_create_vm_subnet" {
Expand Down
Loading