Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions evaluation/sil/policy_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def get_cfg(env_cfg: Any, cfg: Any) -> None:

# Load checkpoint into the runner's agent
runner.agent.load(checkpoint_path)
runner.agent.set_running_mode("eval")
runner.agent.enable_training_mode(enabled=False, apply_to_models=True)

_LOGGER.info("SKRL agent loaded and set to eval mode")
return runner.agent
Expand Down Expand Up @@ -267,7 +267,10 @@ def evaluate(env: Any, agent: Any, num_episodes: int, framework: str) -> Metrics

while metrics.count < num_episodes:
with torch.inference_mode():
actions = agent.act(obs, timestep=step, timesteps=0)[0] if framework == "skrl" else agent.act_inference(obs)
if framework == "skrl":
Comment thread
katriendg marked this conversation as resolved.
actions = agent.act(obs, inference=None, timestep=step, timesteps=0)[0]
else:
actions = agent.act_inference(obs)

obs, rewards, terminated, truncated, info = env.step(actions)
ep_rewards += rewards.squeeze()
Expand Down
58 changes: 48 additions & 10 deletions infrastructure/setup/03-deploy-osmo-control-plane.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ OPTIONS:
--force-mek Replace existing MEK (data loss warning)
--mek-config-file PATH Use existing MEK config file
--skip-service-config Skip service_base_url configuration
--service-url URL OSMO control plane URL (default: auto-detect)
--skip-preflight Skip preflight version checks
--use-local-osmo Use local osmo-dev CLI instead of production osmo
--config-preview Print configuration and exit
Expand All @@ -57,6 +58,7 @@ skip_mek=false
force_mek=false
mek_config_file=""
skip_service_config=false
service_url=""
skip_preflight=false
use_local_osmo=false
config_preview=false
Expand All @@ -78,6 +80,7 @@ while [[ $# -gt 0 ]]; do
--force-mek) force_mek=true; shift ;;
--mek-config-file) mek_config_file="$2"; shift 2 ;;
--skip-service-config) skip_service_config=true; shift ;;
--service-url) service_url="$2"; shift 2 ;;
--skip-preflight) skip_preflight=true; shift ;;
--use-local-osmo) use_local_osmo=true; shift ;;
--config-preview) config_preview=true; shift ;;
Expand Down Expand Up @@ -129,6 +132,7 @@ if [[ "$config_preview" == "true" ]]; then
print_kv "Redis" "$([[ $use_incluster_redis == true ]] && echo 'in-cluster' || echo "$redis_hostname:$redis_port")"
print_kv "ACR" "$([[ $use_acr == true ]] && echo "$acr_login_server" || echo 'nvcr.io')"
print_kv "Auth Mode" "$([[ -n $osmo_identity_client_id ]] && echo 'workload-identity' || echo 'kubectl-secrets')"
print_kv "OSMO Auth" "defaultAdmin (token-based)"
exit 0
fi

Expand Down Expand Up @@ -246,6 +250,25 @@ fi
ingress_manifest="$SCRIPT_DIR/manifests/internal-lb-ingress.yaml"
[[ -f "$ingress_manifest" ]] && kubectl apply -f "$ingress_manifest"

#------------------------------------------------------------------------------
# Read Admin Password from Key Vault
#------------------------------------------------------------------------------
section "Read Admin Password"

admin_password=$(az keyvault secret show --vault-name "$keyvault" \
--name "osmo-admin-password" --query value -o tsv 2>/dev/null) || \
fatal "OSMO admin password not found in Key Vault '$keyvault'. Run 'terraform apply' first."
info "Admin password retrieved from Key Vault"

# In non-identity mode CSI is unavailable; create the Kubernetes secret directly
# from the Key Vault value retrieved above.
# Uses --from-file with process substitution to avoid exposing the password in process args.
if [[ -z "$osmo_identity_client_id" ]]; then
kubectl create secret generic osmo-default-admin \
--namespace="$NS_OSMO_CONTROL_PLANE" \
--from-file=password=<(printf '%s' "$admin_password") \
--dry-run=client -o yaml | kubectl apply -f -
fi

#------------------------------------------------------------------------------
# Configure NGC Authentication (pre-release images)
#------------------------------------------------------------------------------
Expand Down Expand Up @@ -338,21 +361,36 @@ if [[ "$skip_service_config" == "false" ]]; then
section "Configure OSMO Service"
kubectl wait --for=condition=available deployment/osmo-service -n "$NS_OSMO_CONTROL_PLANE" --timeout=120s

service_url=$(detect_service_url)
ingress_base_url=$(detect_ingress_base_url "$NS_AZUREML")
if [[ -n "$service_url" && -n "$ingress_base_url" ]]; then
validate_service_url_reachable "$service_url"
[[ -f "$service_config_template" ]] || fatal "Service config template not found: $service_config_template"
export SERVICE_BASE_URL="$ingress_base_url"
envsubst < "$service_config_template" > "$CONFIG_DIR/out/service-config.json"
osmo_login_and_setup "$service_url"
info "Applying service configuration (service_base_url: $ingress_base_url)..."
osmo config update SERVICE --file "$CONFIG_DIR/out/service-config.json" --description "Set service base URL to ingress controller FQDN"
cluster_service_url=$(detect_service_url)
if [[ -z "$service_url" ]]; then
service_url="${cluster_service_url}"
else
[[ -z "$cluster_service_url" ]] && cluster_service_url="$service_url"
fi
if [[ -n "$service_url" ]]; then
if ! curl -sf --connect-timeout 5 "${service_url}/api/version" >/dev/null 2>&1; then
warn "OSMO service URL $service_url is not reachable from this host"
warn "SERVICE config will be set by 04-deploy-osmo-backend.sh instead"
warn "When running script 04, use: --service-url http://localhost:8080"
warn "(after: kubectl port-forward svc/osmo-service -n osmo-control-plane 8080:80 &)"
else
[[ -f "$service_config_template" ]] || fatal "Service config template not found: $service_config_template"
# SERVICE config service_base_url must use the in-cluster URL (ingress LB IP)
# so workflow pod sidecars can reach the control plane via the ingress routes.
# The user-provided --service-url (e.g. localhost port-forward) is only for CLI access.
export SERVICE_BASE_URL="${cluster_service_url:-$service_url}"
envsubst < "$service_config_template" > "$CONFIG_DIR/out/service-config.json"
osmo_login_and_setup "$service_url" "$admin_password"
info "Applying service configuration (service_base_url: $SERVICE_BASE_URL)..."
osmo config update SERVICE --file "$CONFIG_DIR/out/service-config.json" --description "Set service base URL for UI"
fi
else
warn "Could not determine service base URL - OSMO UI may show errors"
fi
fi

unset admin_password

#------------------------------------------------------------------------------
# Summary
#------------------------------------------------------------------------------
Expand Down
66 changes: 50 additions & 16 deletions infrastructure/setup/04-deploy-osmo-backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,20 @@ if [[ -n "$custom_expiry" ]]; then
expiry_date=$(date -u -j -f "%Y-%m-%d" "$custom_expiry" +%F 2>/dev/null) || \
fatal "--expires-at must be YYYY-MM-DD format"
else
expiry_date=$(date -u -d "+1 year" +%F 2>/dev/null) || \
expiry_date=$(date -u -v+1y +%F 2>/dev/null) || \
expiry_date=$(date -u -d "+90 days" +%F 2>/dev/null) || \
expiry_date=$(date -u -v+90d +%F 2>/dev/null) || \
fatal "Unable to compute token expiry date"
fi

# Storage access key (only when using access-keys mode)
account_key=""
[[ "$use_access_keys" == "true" ]] && account_key=$(az storage account keys list -g "$rg" -n "$storage_name" --query '[0].value' -o tsv)
# Storage connection string (only when using access-keys mode)
# OSMO's StaticDataCredential passes access_key to BlobServiceClient.from_connection_string(),
# which requires a full connection string — not a raw access key.
storage_connection_string=""
if [[ "$use_access_keys" == "true" ]]; then
raw_key=$(az storage account keys list -g "$rg" -n "$storage_name" --query '[0].value' -o tsv)
Comment thread
katriendg marked this conversation as resolved.
storage_connection_string="DefaultEndpointsProtocol=https;AccountName=${storage_name};AccountKey=${raw_key};EndpointSuffix=core.windows.net"
unset raw_key
fi

auth_mode="workload-identity"
[[ "$use_access_keys" == "true" ]] && auth_mode="access-keys"
Expand All @@ -182,6 +188,7 @@ workflow_template="$CONFIG_DIR/${WORKFLOW_TEMPLATE}"
if [[ "$config_preview" == "true" ]]; then
section "Configuration Preview"
print_kv "Service URL" "$service_url"
print_kv "Service Base URL" "$cluster_service_url"
print_kv "Backend Name" "$backend_name"
print_kv "Chart Version" "$chart_version"
print_kv "Image Version" "$image_version"
Expand Down Expand Up @@ -224,7 +231,26 @@ mkdir -p "$CONFIG_DIR/out"
# OSMO Login
#------------------------------------------------------------------------------
section "OSMO Login"
osmo_login_and_setup "$service_url"

admin_secret_name="osmo-default-admin"
admin_password=$(kubectl get secret "$admin_secret_name" \
-n "$NS_OSMO_CONTROL_PLANE" \
-o jsonpath='{.data.password}' | base64 -d) || \
fatal "Admin secret $admin_secret_name not found in $NS_OSMO_CONTROL_PLANE. Run 03-deploy-osmo-control-plane.sh first."

osmo_login_and_setup "$service_url" "$admin_password"

# Verify SERVICE config service_base_url is set — may be empty if script 03
# ran from a devcontainer and the config update was skipped or failed.
# Workflow pod sidecars use this value as their -host argument; empty causes
# websocket connection failures (ws://:80/...).
current_base_url=$(osmo config show SERVICE 2>/dev/null | jq -r '.service_base_url // empty' || true)
if [[ -z "$current_base_url" ]]; then
warn "SERVICE config service_base_url is empty — workflow sidecars cannot reach the control plane"
info "Setting service_base_url to in-cluster URL: $cluster_service_url"
printf '{"service_base_url": "%s"}' "$cluster_service_url" | \
osmo config update SERVICE --file /dev/stdin --description "Set service base URL for workflow sidecar connectivity"
fi

#------------------------------------------------------------------------------
# Prepare Namespaces and Service Token
Expand All @@ -238,27 +264,34 @@ token_exists=false
kubectl get secret "$account_secret" -n "$NS_OSMO_OPERATOR" &>/dev/null && token_exists=true

if [[ "$regenerate_token" == "true" || "$token_exists" == "false" ]]; then
info "Ensuring backend-operator service account..."
osmo_login_and_setup "$service_url" "$admin_password" "backend-operator" "osmo-backend"

token_name="backend-token-$(date -u +%Y%m%d%H%M%S)"
info "Generating OSMO service token $token_name (expires $expiry_date)..."

# Create service access token via OSMO API (works with auth.enabled=false via x-osmo-user header)
OSMO_SERVICE_TOKEN=$(curl -sf -X POST \
"${service_url}/api/auth/access_token/service/${token_name}?expires_at=${expiry_date}&roles=osmo-backend" \
-H "x-osmo-user: admin")

# Strip surrounding quotes from JSON string response
OSMO_SERVICE_TOKEN="${OSMO_SERVICE_TOKEN//\"/}"
[[ -z "$OSMO_SERVICE_TOKEN" ]] && fatal "Failed to obtain service token from ${service_url}"
token_json=$(osmo token set "$token_name" \
--user backend-operator \
--expires-at "$expiry_date" \
--description "Backend Operator - $(date -u +%F)" \
--roles osmo-backend \
-t json 2>/dev/null) || fatal "Failed to create service token via OSMO CLI"
OSMO_SERVICE_TOKEN=$(printf '%s' "$token_json" | jq -r '.token // empty')
[[ -z "$OSMO_SERVICE_TOKEN" ]] && fatal "Service token response missing 'token' field"
export OSMO_SERVICE_TOKEN

kubectl create secret generic "$account_secret" \
--namespace="$NS_OSMO_OPERATOR" \
--from-literal=token="$OSMO_SERVICE_TOKEN" \
--from-file=token=<(printf '%s' "$OSMO_SERVICE_TOKEN") \
--dry-run=client -o yaml | kubectl apply -f - >/dev/null

unset OSMO_SERVICE_TOKEN
else
info "Token secret $account_secret already exists"
fi

unset admin_password

#------------------------------------------------------------------------------
# Configure Storage Container
#------------------------------------------------------------------------------
Expand Down Expand Up @@ -364,7 +397,7 @@ export BACKEND_DESCRIPTION="$backend_description"
export K8S_NAMESPACE="$NS_OSMO_WORKFLOWS"
export CONTROL_PLANE_NAMESPACE="$NS_OSMO_CONTROL_PLANE"
export STORAGE_ACCESS_KEY_ID="osmo-control-plane-storage"
export STORAGE_ACCESS_KEY="$account_key"
export STORAGE_ACCESS_KEY="$storage_connection_string"
export WORKFLOW_BASE_URL="$workflow_base_url"
export WORKFLOW_DATA_ENDPOINT="${azure_container}/workflows/data"
export WORKFLOW_LOG_ENDPOINT="${azure_container}/workflows/logs"
Expand Down Expand Up @@ -535,6 +568,7 @@ fi
section "Deployment Summary"
print_kv "Backend Name" "$backend_name"
print_kv "Service URL" "$service_url"
print_kv "Service Base URL" "$cluster_service_url"
print_kv "Chart Version" "$chart_version"
print_kv "Image Version" "$image_version"
print_kv "Storage Account" "$storage_name"
Expand Down
27 changes: 8 additions & 19 deletions infrastructure/setup/cleanup/uninstall-osmo-control-plane.sh
Original file line number Diff line number Diff line change
Expand Up @@ -186,30 +186,19 @@ if [[ "$purge_postgres" == "true" ]]; then
if [[ -z "$pg_password" ]]; then
warn "Could not retrieve PostgreSQL password, skipping..."
else
warn "Dropping all OSMO tables from database '$db_name'..."

# OSMO tables in dependency order (children first, then parents)
osmo_tables=(
"collection" "dataset_tag" "dataset_version" "dataset"
"credential" "access_token" "profile" "config_history" "backend_tests"
"resource_platforms" "resources" "app_versions" "apps"
"task_io" "tasks" "groups" "workflow_tags" "workflows"
"pools" "pod_templates" "resource_validations" "backends" "roles" "configs" "ueks"
)

drop_sql="SET client_min_messages TO WARNING;"
for t in "${osmo_tables[@]}"; do
drop_sql+="DROP TABLE IF EXISTS $t CASCADE;"
done
drop_sql+="DROP TYPE IF EXISTS credential_type CASCADE;"
drop_sql+="DROP FUNCTION IF EXISTS jsonb_recursive_merge(jsonb, jsonb) CASCADE;"
warn "Dropping all tables from database '$db_name' (public schema)..."

# Drop the entire public schema and recreate it. This removes every table,
# view, sequence, type, and function in the public schema in one shot (OSMO's
# objects and anything else stored there), avoiding a hand-maintained table
# list that drifts across OSMO releases.
drop_sql=$'SET client_min_messages TO WARNING;\nDROP SCHEMA IF EXISTS public CASCADE;\nCREATE SCHEMA public;\nGRANT ALL ON SCHEMA public TO PUBLIC;'

if PGPASSWORD="$pg_password" psql \
"host=$pg_fqdn port=5432 dbname=$db_name user=$pg_user sslmode=require" \
-c "$drop_sql" 2>/dev/null; then
info "PostgreSQL tables dropped"
info "PostgreSQL public schema dropped and recreated"
else
warn "Failed to drop PostgreSQL tables"
warn "Failed to drop PostgreSQL schema"
fi
fi
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"description": "Dataset storage using access keys",
"mode": "read-write",
"default_credential": {
"endpoint": "azure://${STORAGE_ACCOUNT_NAME}",
"access_key_id": "${STORAGE_ACCESS_KEY_ID}",
"access_key": "${STORAGE_ACCESS_KEY}"
}
Expand Down
11 changes: 6 additions & 5 deletions infrastructure/setup/defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# Helm Chart Versions
GPU_OPERATOR_VERSION="${GPU_OPERATOR_VERSION:-v25.3.4}"
KAI_SCHEDULER_VERSION="${KAI_SCHEDULER_VERSION:-v0.5.5}"
OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-1.0.1}"
OSMO_IMAGE_VERSION="${OSMO_IMAGE_VERSION:-6.0.0}"
OSMO_PRERELEASE_CHART_VERSION="${OSMO_PRERELEASE_CHART_VERSION:-1.1.0}"
OSMO_PRERELEASE_IMAGE_VERSION="${OSMO_PRERELEASE_IMAGE_VERSION:-v2026.02.22}"
OSMO_CLI_VERSION="${OSMO_CLI_VERSION:-6.2.10}"
OSMO_CHART_VERSION="${OSMO_CHART_VERSION:-1.2.1}"
OSMO_IMAGE_VERSION="${OSMO_IMAGE_VERSION:-6.2}"
OSMO_PRERELEASE_CHART_VERSION="${OSMO_PRERELEASE_CHART_VERSION:-1.2.1}"
OSMO_PRERELEASE_IMAGE_VERSION="${OSMO_PRERELEASE_IMAGE_VERSION:-6.2}"
OSMO_USE_PRERELEASE="${OSMO_USE_PRERELEASE:-false}"
NGC_API_KEY="${NGC_API_KEY:-}"
NVCR_PULL_SECRET="${NVCR_PULL_SECRET:-nvcr-pull-secret}"
Expand Down Expand Up @@ -41,7 +42,7 @@ HELM_REPO_OSMO="${HELM_REPO_OSMO:-https://helm.ngc.nvidia.com/nvidia/osmo}"
# Leave empty to skip verification for a given chart.
GPU_OPERATOR_CHART_SHA256="${GPU_OPERATOR_CHART_SHA256:-dc719b83ceb7f04306e12972e40830993a3372cfcc62f82e7dd9588beb38c950}"
KAI_SCHEDULER_CHART_SHA256="${KAI_SCHEDULER_CHART_SHA256:-a28ba9c6ec76baa3ca8b9ad6fb5a9fe3aa834f948c02c4f9056ced7cbd097e57}"
OSMO_CHART_SHA256="${OSMO_CHART_SHA256:-9d106b1d6e58fe9120d6541ea4098d4ffe905fd91f9e683188b454cc0e54d19d}"
OSMO_CHART_SHA256="${OSMO_CHART_SHA256:-b0a3b70d24f0749c15fa3bf775416334a150ad9eb41ad4ec8f8af89114bee2b1}"

# Default Terraform Directory (relative to infrastructure/setup)
DEFAULT_TF_DIR="${DEFAULT_TF_DIR:-../terraform}"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SecretProviderClass for Azure Key Vault secrets sync via CSI driver
# Syncs psql-admin-password and redis-primary-key from Key Vault to Kubernetes secrets.
# Syncs psql-admin-password, redis-primary-key, and osmo-admin-password from Key Vault to Kubernetes secrets.
#
# Deployed by: 03-deploy-osmo-control-plane.sh (via common.sh apply_secret_provider_class)
# Creates: db-secret (db-password key), redis-secret (redis-password key)
# Creates: db-secret (db-password key), redis-secret (redis-password key), osmo-default-admin (password key)
#
# Secrets are created when pods mount the CSI volume (configured in osmo-control-plane-identity.yaml).
#
Expand All @@ -29,6 +29,9 @@ spec:
- |
objectName: redis-primary-key
objectType: secret
- |
objectName: osmo-admin-password
objectType: secret
secretObjects:
- secretName: db-secret
type: Opaque
Expand All @@ -40,3 +43,8 @@ spec:
data:
- objectName: redis-primary-key
key: redis-password
- secretName: osmo-default-admin
type: Opaque
data:
- objectName: osmo-admin-password
key: password
12 changes: 10 additions & 2 deletions infrastructure/setup/manifests/aks-secret-provider-class.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SecretProviderClass for Azure Key Vault secrets sync via CSI driver
# Syncs psql-admin-password from Key Vault to a Kubernetes secret.
# Syncs psql-admin-password and osmo-admin-password from Key Vault to Kubernetes secrets.
#
# Deployed by: 03-deploy-osmo-control-plane.sh (via common.sh apply_secret_provider_class)
# Creates: db-secret (db-password key)
# Creates: db-secret (db-password key), osmo-default-admin (password key)
#
# Secrets are created when pods mount the CSI volume (configured in osmo-control-plane-identity.yaml).
#
Expand All @@ -26,9 +26,17 @@ spec:
- |
objectName: psql-admin-password
objectType: secret
- |
objectName: osmo-admin-password
objectType: secret
secretObjects:
- secretName: db-secret
type: Opaque
data:
- objectName: psql-admin-password
key: db-password
- secretName: osmo-default-admin
type: Opaque
data:
- objectName: osmo-admin-password
key: password
2 changes: 1 addition & 1 deletion infrastructure/setup/values/osmo-backend-operator.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
global:
osmoImageTag: "6.0.0"
osmoImageTag: "6.2"
serviceUrl: "http://osmo-agent.osmo-control-plane.svc.cluster.local"
agentNamespace: "osmo-operator"
backendNamespace: "osmo-workflows"
Expand Down
Loading
Loading