diff --git a/.github/workflows/launch_infrastructure.yaml b/.github/workflows/launch_infrastructure.yaml index 4e9a0705..dab4baee 100644 --- a/.github/workflows/launch_infrastructure.yaml +++ b/.github/workflows/launch_infrastructure.yaml @@ -29,7 +29,7 @@ jobs: stage: server paths: applications/ensemble_manager/** - application: tide - stage: model-runner + stage: runner paths: models/** steps: - name: Checkout code diff --git a/infrastructure/__main__.py b/infrastructure/__main__.py index 3134fb8e..802185f4 100644 --- a/infrastructure/__main__.py +++ b/infrastructure/__main__.py @@ -12,8 +12,8 @@ model_artifacts_bucket, portfolio_manager_image_uri, portfolio_manager_repository, - tide_model_runner_image_uri, - tide_model_runner_repository, + tide_runner_image_uri, + tide_runner_repository, ) from training import models_cluster, tide_trainer_task_definition @@ -44,15 +44,24 @@ pulumi.export("aws_alb_dns_name", alb.dns_name) pulumi.export("aws_alb_url", pulumi.Output.concat(protocol, alb.dns_name)) pulumi.export("aws_service_discovery_namespace", service_discovery_namespace.name) -pulumi.export("aws_ecr_data_manager_image", data_manager_image_uri) -pulumi.export("aws_ecr_portfolio_manager_image", portfolio_manager_image_uri) -pulumi.export("aws_ecr_ensemble_manager_image", ensemble_manager_image_uri) -pulumi.export("aws_ecr_data_manager_repository", data_manager_repository.repository_url) +pulumi.export("aws_ecr_applications_data_manager_server_image", data_manager_image_uri) pulumi.export( - "aws_ecr_portfolio_manager_repository", portfolio_manager_repository.repository_url + "aws_ecr_applications_portfolio_manager_server_image", portfolio_manager_image_uri ) pulumi.export( - "aws_ecr_ensemble_manager_repository", ensemble_manager_repository.repository_url + "aws_ecr_applications_ensemble_manager_server_image", ensemble_manager_image_uri +) +pulumi.export( + "aws_ecr_applications_data_manager_server_repository", + data_manager_repository.repository_url, +) +pulumi.export( + "aws_ecr_applications_portfolio_manager_server_repository", + portfolio_manager_repository.repository_url, +) +pulumi.export( + "aws_ecr_applications_ensemble_manager_server_repository", + ensemble_manager_repository.repository_url, ) pulumi.export("aws_s3_data_bucket_name", pulumi.Output.unsecret(data_bucket.bucket)) pulumi.export( @@ -60,10 +69,10 @@ pulumi.Output.unsecret(model_artifacts_bucket.bucket), ) pulumi.export( - "aws_ecr_tide_model_runner_repository", - tide_model_runner_repository.repository_url, + "aws_ecr_models_tide_runner_repository", + tide_runner_repository.repository_url, ) -pulumi.export("aws_ecr_tide_model_runner_image", tide_model_runner_image_uri) +pulumi.export("aws_ecr_models_tide_runner_image", tide_runner_image_uri) pulumi.export( "aws_iam_github_actions_infrastructure_role_arn", github_actions_infrastructure_role.arn, diff --git a/infrastructure/notifications.py b/infrastructure/notifications.py index 3d176a21..0d17f439 100644 --- a/infrastructure/notifications.py +++ b/infrastructure/notifications.py @@ -1,3 +1,6 @@ +import json + +import pulumi import pulumi_aws as aws from config import ( account_id, @@ -23,6 +26,48 @@ endpoint=notification_email_address, ) +cost_anomaly_monitor = aws.costexplorer.AnomalyMonitor( + "cost_anomaly_monitor", + name="fund-cost-anomaly-monitor", + monitor_type="CUSTOM", + monitor_specification=json.dumps( + { + "Dimensions": { + "Key": "LINKED_ACCOUNT", + "Values": [account_id], + "MatchOptions": ["EQUALS"], + } + } + ), + tags=tags, +) + +aws.costexplorer.AnomalySubscription( + "cost_anomaly_subscription", + name="fund-cost-anomaly-subscription", + monitor_arn_lists=[cost_anomaly_monitor.arn], + frequency="IMMEDIATE", + threshold_expression=json.dumps( + { + "Dimensions": { + "Key": "ANOMALY_TOTAL_IMPACT_ABSOLUTE", + "Values": ["25"], + "MatchOptions": ["GREATER_THAN_OR_EQUAL"], + } + } + ), + subscribers=pulumi.Output.from_input(budget_alert_email_addresses).apply( + lambda emails: [ + aws.costexplorer.AnomalySubscriptionSubscriberArgs( + address=email, + type="EMAIL", + ) + for email in emails + ] + ), + tags=tags, +) + # This can be updated by setting the monthlyBudgetLimitUsd Pulumi configuration # variable. aws.budgets.Budget( diff --git a/infrastructure/storage.py b/infrastructure/storage.py index da650822..49c3ed60 100644 --- a/infrastructure/storage.py +++ b/infrastructure/storage.py @@ -17,7 +17,18 @@ "countNumber": 1, }, "action": {"type": "expire"}, - } + }, + { + "rulePriority": 2, + "description": "Keep last 10 tagged images", + "selection": { + "tagStatus": "tagged", + "tagPatternList": ["git-*"], + "countType": "imageCountMoreThan", + "countNumber": 10, + }, + "action": {"type": "expire"}, + }, ] } ) @@ -108,7 +119,7 @@ # retain_on_delete=True and add pulumi import statements to the maskfile up command. data_manager_repository = aws.ecr.Repository( "data_manager_repository", - name="fund/data-manager-server", + name="fund/applications-data-manager-server", image_tag_mutability="MUTABLE", force_delete=True, image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( @@ -125,7 +136,7 @@ portfolio_manager_repository = aws.ecr.Repository( "portfolio_manager_repository", - name="fund/portfolio-manager-server", + name="fund/applications-portfolio-manager-server", image_tag_mutability="MUTABLE", force_delete=True, image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( @@ -142,7 +153,7 @@ ensemble_manager_repository = aws.ecr.Repository( "ensemble_manager_repository", - name="fund/ensemble-manager-server", + name="fund/applications-ensemble-manager-server", image_tag_mutability="MUTABLE", force_delete=True, image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( @@ -157,9 +168,9 @@ policy=_ecr_lifecycle_policy, ) -tide_model_runner_repository = aws.ecr.Repository( - "tide_model_runner_repository", - name="fund/tide-model-runner", +tide_runner_repository = aws.ecr.Repository( + "tide_runner_repository", + name="fund/models-tide-runner", image_tag_mutability="MUTABLE", force_delete=True, image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( @@ -169,8 +180,8 @@ ) aws.ecr.LifecyclePolicy( - "tide_model_runner_repository_lifecycle", - repository=tide_model_runner_repository.name, + "tide_runner_repository_lifecycle", + repository=tide_runner_repository.name, policy=_ecr_lifecycle_policy, ) @@ -185,6 +196,6 @@ ensemble_manager_image_uri = ensemble_manager_repository.repository_url.apply( lambda url: f"{url}:latest" ) -tide_model_runner_image_uri = tide_model_runner_repository.repository_url.apply( +tide_runner_image_uri = tide_runner_repository.repository_url.apply( lambda url: f"{url}:latest" ) diff --git a/infrastructure/training.py b/infrastructure/training.py index 11997cae..a96a360c 100644 --- a/infrastructure/training.py +++ b/infrastructure/training.py @@ -5,7 +5,7 @@ from config import tags from iam import execution_role, task_role from networking import ecs_security_group, private_subnet_1, private_subnet_2 -from storage import tide_model_runner_image_uri +from storage import tide_runner_image_uri models_cluster = aws.ecs.Cluster( "models_cluster", @@ -164,7 +164,7 @@ memory="14336", execution_role_arn=execution_role.arn, task_role_arn=task_role.arn, - container_definitions=tide_model_runner_image_uri.apply( + container_definitions=tide_runner_image_uri.apply( lambda image_uri: json.dumps( [ { diff --git a/libraries/python/tests/test_infrastructure_notifications.py b/libraries/python/tests/test_infrastructure_notifications.py new file mode 100644 index 00000000..ce508adc --- /dev/null +++ b/libraries/python/tests/test_infrastructure_notifications.py @@ -0,0 +1,34 @@ +from pathlib import Path + +REPOSITORY_ROOT = Path(__file__).resolve().parents[3] +INFRASTRUCTURE_NOTIFICATIONS_PATH = ( + REPOSITORY_ROOT / "infrastructure" / "notifications.py" +) + + +def load_infrastructure_notifications() -> str: + return INFRASTRUCTURE_NOTIFICATIONS_PATH.read_text(encoding="utf-8") + + +def test_notifications_contains_cost_anomaly_monitor_resource() -> None: + infrastructure_notifications = load_infrastructure_notifications() + + assert '"cost_anomaly_monitor"' in infrastructure_notifications + + +def test_notifications_contains_cost_anomaly_subscription_resource() -> None: + infrastructure_notifications = load_infrastructure_notifications() + + assert '"cost_anomaly_subscription"' in infrastructure_notifications + + +def test_notifications_anomaly_subscription_uses_plural_dimensions_key() -> None: + infrastructure_notifications = load_infrastructure_notifications() + + assert '"Dimensions"' in infrastructure_notifications + + +def test_notifications_contains_budget_resource() -> None: + infrastructure_notifications = load_infrastructure_notifications() + + assert '"production_cost_budget"' in infrastructure_notifications diff --git a/libraries/python/tests/test_infrastructure_storage.py b/libraries/python/tests/test_infrastructure_storage.py index cdd60f4f..afdf9565 100644 --- a/libraries/python/tests/test_infrastructure_storage.py +++ b/libraries/python/tests/test_infrastructure_storage.py @@ -28,4 +28,4 @@ def test_storage_contains_ecr_lifecycle_policy_resources() -> None: assert '"data_manager_repository_lifecycle"' in infrastructure_storage assert '"portfolio_manager_repository_lifecycle"' in infrastructure_storage assert '"ensemble_manager_repository_lifecycle"' in infrastructure_storage - assert '"tide_model_runner_repository_lifecycle"' in infrastructure_storage + assert '"tide_runner_repository_lifecycle"' in infrastructure_storage diff --git a/maskfile.md b/maskfile.md index a4c69cb0..d3b7d916 100644 --- a/maskfile.md +++ b/maskfile.md @@ -54,7 +54,7 @@ echo "Development environment setup completed successfully" #### build-and-push (package_name) (stage_name) -> Build and push Docker image directly to ECR (e.g. `portfolio-manager server`, `tide model-runner`) +> Build and push Docker image directly to ECR (e.g. `portfolio-manager server`, `tide runner`) ```bash set -euo pipefail @@ -69,7 +69,19 @@ if [ -z "$aws_region" ]; then fi commit_hash=$(git rev-parse --short HEAD) -repository_name="fund/${package_name}-${stage_name}" + +if [ -f "models/${package_name}/Dockerfile" ]; then + dockerfile="models/${package_name}/Dockerfile" + build_target="${stage_name}" + namespace="models" +else + resolved_name=$(echo "${package_name}" | tr '-' '_') + dockerfile="applications/${resolved_name}/Dockerfile" + build_target="${stage_name}" + namespace="applications" +fi + +repository_name="fund/${namespace}-${package_name}-${stage_name}" image_reference="${aws_account_id}.dkr.ecr.${aws_region}.amazonaws.com/${repository_name}" echo "Logging into ECR" @@ -88,19 +100,11 @@ if [ "$existing_image" != "NONE" ] && [ "$existing_image" != "None" ] && [ -n "$ exit 0 fi -if [ -f "models/${package_name}/Dockerfile" ]; then - dockerfile="models/${package_name}/Dockerfile" - build_target="${stage_name}" -else - resolved_name=$(echo "${package_name}" | tr '-' '_') - dockerfile="applications/${resolved_name}/Dockerfile" - build_target="${stage_name}" -fi cache_reference="${image_reference}:buildcache" # Use GHA backend for caching when running in GitHub Actions if [ -n "${GITHUB_ACTIONS:-}" ]; then - scope="${package_name}-${stage_name}" + scope="${namespace}-${package_name}-${stage_name}" echo "Running in GitHub Actions - using hybrid cache (gha + registry) with scope: ${scope}" cache_from_arguments="--cache-from type=gha,scope=${scope} --cache-from type=registry,ref=${cache_reference}" cache_to_arguments="--cache-to type=gha,scope=${scope},mode=max --cache-to type=registry,ref=${cache_reference},mode=max" @@ -836,7 +840,7 @@ if ! organization_name=$(pulumi org get-default 2>/dev/null) || [ -z "${organiza fi pulumi stack select "${organization_name}/fund/production" -tide_image_uri=$(pulumi stack output aws_ecr_tide_model_runner_image) +tide_image_uri=$(pulumi stack output aws_ecr_models_tide_runner_image) cd "${MASKFILE_DIR}" diff --git a/models/tide/Dockerfile b/models/tide/Dockerfile index 8fb2a53b..23dee93d 100644 --- a/models/tide/Dockerfile +++ b/models/tide/Dockerfile @@ -14,7 +14,7 @@ COPY tools/ tools/ RUN uv sync --no-dev --package tide -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS model-runner +FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS runner ENV DEBIAN_FRONTEND=noninteractive ENV TZ=UTC diff --git a/models/tide/tests/test_deploy.py b/models/tide/tests/test_deploy.py index 06155ff4..03e1f3c3 100644 --- a/models/tide/tests/test_deploy.py +++ b/models/tide/tests/test_deploy.py @@ -31,7 +31,7 @@ def test_deploy_training_flow_sets_build_options( mock_deploy = MagicMock() mock_pipeline.deploy = mock_deploy - image = "123456789.dkr.ecr.us-east-1.amazonaws.com/fund/tide-model-runner:latest" + image = "123456789.dkr.ecr.us-east-1.amazonaws.com/fund/models-tide-runner:latest" deploy_training_flow(image=image) call_kwargs = mock_deploy.call_args.kwargs