Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/launch_infrastructure.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
stage: server
paths: applications/ensemble_manager/**
- application: tide
stage: model-runner
stage: runner
paths: models/**
Comment thread
forstmeier marked this conversation as resolved.
steps:
- name: Checkout code
Expand Down
31 changes: 20 additions & 11 deletions infrastructure/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
model_artifacts_bucket,
portfolio_manager_image_uri,
portfolio_manager_repository,
tide_model_runner_image_uri,
tide_model_runner_repository,
tide_runner_image_uri,
tide_runner_repository,
)
from training import models_cluster, tide_trainer_task_definition

Expand Down Expand Up @@ -44,26 +44,35 @@
pulumi.export("aws_alb_dns_name", alb.dns_name)
pulumi.export("aws_alb_url", pulumi.Output.concat(protocol, alb.dns_name))
pulumi.export("aws_service_discovery_namespace", service_discovery_namespace.name)
pulumi.export("aws_ecr_data_manager_image", data_manager_image_uri)
pulumi.export("aws_ecr_portfolio_manager_image", portfolio_manager_image_uri)
pulumi.export("aws_ecr_ensemble_manager_image", ensemble_manager_image_uri)
pulumi.export("aws_ecr_data_manager_repository", data_manager_repository.repository_url)
pulumi.export("aws_ecr_applications_data_manager_server_image", data_manager_image_uri)
pulumi.export(
"aws_ecr_portfolio_manager_repository", portfolio_manager_repository.repository_url
"aws_ecr_applications_portfolio_manager_server_image", portfolio_manager_image_uri
)
pulumi.export(
"aws_ecr_ensemble_manager_repository", ensemble_manager_repository.repository_url
"aws_ecr_applications_ensemble_manager_server_image", ensemble_manager_image_uri
)
pulumi.export(
"aws_ecr_applications_data_manager_server_repository",
data_manager_repository.repository_url,
)
pulumi.export(
"aws_ecr_applications_portfolio_manager_server_repository",
portfolio_manager_repository.repository_url,
)
pulumi.export(
"aws_ecr_applications_ensemble_manager_server_repository",
ensemble_manager_repository.repository_url,
)
pulumi.export("aws_s3_data_bucket_name", pulumi.Output.unsecret(data_bucket.bucket))
pulumi.export(
"aws_s3_model_artifacts_bucket_name",
pulumi.Output.unsecret(model_artifacts_bucket.bucket),
)
pulumi.export(
"aws_ecr_tide_model_runner_repository",
tide_model_runner_repository.repository_url,
"aws_ecr_models_tide_runner_repository",
tide_runner_repository.repository_url,
)
pulumi.export("aws_ecr_tide_model_runner_image", tide_model_runner_image_uri)
pulumi.export("aws_ecr_models_tide_runner_image", tide_runner_image_uri)
pulumi.export(
"aws_iam_github_actions_infrastructure_role_arn",
github_actions_infrastructure_role.arn,
Expand Down
45 changes: 45 additions & 0 deletions infrastructure/notifications.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import json

import pulumi
import pulumi_aws as aws
from config import (
account_id,
Expand All @@ -23,6 +26,48 @@
endpoint=notification_email_address,
)

# Custom Cost Explorer anomaly monitor. The monitor specification is a JSON
# expression matching the LINKED_ACCOUNT dimension against account_id (imported
# from config), and the resource carries the shared tags.
cost_anomaly_monitor = aws.costexplorer.AnomalyMonitor(
"cost_anomaly_monitor",
name="fund-cost-anomaly-monitor",
monitor_type="CUSTOM",
monitor_specification=json.dumps(
{
"Dimensions": {
"Key": "LINKED_ACCOUNT",
"Values": [account_id],
"MatchOptions": ["EQUALS"],
}
}
),
tags=tags,
)

aws.costexplorer.AnomalySubscription(
"cost_anomaly_subscription",
Comment thread
forstmeier marked this conversation as resolved.
name="fund-cost-anomaly-subscription",
monitor_arn_lists=[cost_anomaly_monitor.arn],
frequency="IMMEDIATE",
threshold_expression=json.dumps(
{
"Dimensions": {
"Key": "ANOMALY_TOTAL_IMPACT_ABSOLUTE",
"Values": ["25"],
"MatchOptions": ["GREATER_THAN_OR_EQUAL"],
}
}
Comment thread
forstmeier marked this conversation as resolved.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
),
Comment thread
forstmeier marked this conversation as resolved.
subscribers=pulumi.Output.from_input(budget_alert_email_addresses).apply(
lambda emails: [
aws.costexplorer.AnomalySubscriptionSubscriberArgs(
address=email,
type="EMAIL",
)
for email in emails
]
),
tags=tags,
)
Comment thread
forstmeier marked this conversation as resolved.

# This can be updated by setting the monthlyBudgetLimitUsd Pulumi configuration
# variable.
aws.budgets.Budget(
Expand Down
31 changes: 21 additions & 10 deletions infrastructure/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,18 @@
"countNumber": 1,
},
"action": {"type": "expire"},
}
},
{
"rulePriority": 2,
"description": "Keep last 10 tagged images",
"selection": {
"tagStatus": "tagged",
"tagPatternList": ["git-*"],
"countType": "imageCountMoreThan",
"countNumber": 10,
},
"action": {"type": "expire"},
},
]
}
)
Expand Down Expand Up @@ -108,7 +119,7 @@
# retain_on_delete=True and add pulumi import statements to the maskfile up command.
data_manager_repository = aws.ecr.Repository(
"data_manager_repository",
name="fund/data-manager-server",
name="fund/applications-data-manager-server",
image_tag_mutability="MUTABLE",
Comment thread
forstmeier marked this conversation as resolved.
force_delete=True,
Comment thread
forstmeier marked this conversation as resolved.
image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
Expand All @@ -125,7 +136,7 @@

portfolio_manager_repository = aws.ecr.Repository(
"portfolio_manager_repository",
name="fund/portfolio-manager-server",
name="fund/applications-portfolio-manager-server",
image_tag_mutability="MUTABLE",
force_delete=True,
image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
Comment thread
forstmeier marked this conversation as resolved.
Expand All @@ -142,7 +153,7 @@

ensemble_manager_repository = aws.ecr.Repository(
"ensemble_manager_repository",
name="fund/ensemble-manager-server",
name="fund/applications-ensemble-manager-server",
image_tag_mutability="MUTABLE",
force_delete=True,
image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
Comment thread
forstmeier marked this conversation as resolved.
Expand All @@ -157,9 +168,9 @@
policy=_ecr_lifecycle_policy,
)

tide_model_runner_repository = aws.ecr.Repository(
"tide_model_runner_repository",
name="fund/tide-model-runner",
tide_runner_repository = aws.ecr.Repository(
"tide_runner_repository",
name="fund/models-tide-runner",
image_tag_mutability="MUTABLE",
force_delete=True,
Comment thread
forstmeier marked this conversation as resolved.
image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
Expand All @@ -169,8 +180,8 @@
)

aws.ecr.LifecyclePolicy(
"tide_model_runner_repository_lifecycle",
repository=tide_model_runner_repository.name,
"tide_runner_repository_lifecycle",
repository=tide_runner_repository.name,
policy=_ecr_lifecycle_policy,
)

Expand All @@ -185,6 +196,6 @@
ensemble_manager_image_uri = ensemble_manager_repository.repository_url.apply(
lambda url: f"{url}:latest"
)
tide_model_runner_image_uri = tide_model_runner_repository.repository_url.apply(
tide_runner_image_uri = tide_runner_repository.repository_url.apply(
lambda url: f"{url}:latest"
)
4 changes: 2 additions & 2 deletions infrastructure/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from config import tags
from iam import execution_role, task_role
from networking import ecs_security_group, private_subnet_1, private_subnet_2
from storage import tide_model_runner_image_uri
from storage import tide_runner_image_uri

models_cluster = aws.ecs.Cluster(
"models_cluster",
Expand Down Expand Up @@ -164,7 +164,7 @@
memory="14336",
execution_role_arn=execution_role.arn,
task_role_arn=task_role.arn,
container_definitions=tide_model_runner_image_uri.apply(
container_definitions=tide_runner_image_uri.apply(
lambda image_uri: json.dumps(
[
{
Expand Down
34 changes: 34 additions & 0 deletions libraries/python/tests/test_infrastructure_notifications.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from pathlib import Path

# This test module lives at libraries/python/tests/<file>, so parents[3] of the
# resolved path is the repository root (tests -> python -> libraries -> root).
REPOSITORY_ROOT = Path(__file__).resolve().parents[3]
# Location of the Pulumi notifications module whose source text the tests inspect.
INFRASTRUCTURE_NOTIFICATIONS_PATH = (
REPOSITORY_ROOT / "infrastructure" / "notifications.py"
)


def load_infrastructure_notifications() -> str:
    """Return the full UTF-8 text of infrastructure/notifications.py."""
    with INFRASTRUCTURE_NOTIFICATIONS_PATH.open(encoding="utf-8") as handle:
        return handle.read()


def test_notifications_contains_cost_anomaly_monitor_resource() -> None:
    """The notifications module declares the cost anomaly monitor resource."""
    resource_name = '"cost_anomaly_monitor"'
    source_text = load_infrastructure_notifications()

    assert resource_name in source_text


def test_notifications_contains_cost_anomaly_subscription_resource() -> None:
    """The notifications module declares the cost anomaly subscription resource."""
    resource_name = '"cost_anomaly_subscription"'
    source_text = load_infrastructure_notifications()

    assert resource_name in source_text


def test_notifications_anomaly_subscription_uses_plural_dimensions_key() -> None:
    """The subscription's threshold expression uses the plural "Dimensions" key.

    A whole-file substring check is not enough: the anomaly monitor's
    monitor_specification also contains "Dimensions", so it would pass even if
    the subscription used the singular key. The check is therefore limited to
    the text between the subscription's threshold_expression and its
    subscribers argument.
    """
    infrastructure_notifications = load_infrastructure_notifications()

    # Drop everything before the subscription resource so the monitor's own
    # "Dimensions" key cannot satisfy the assertion.
    _, subscription_marker, after_subscription = (
        infrastructure_notifications.partition('"cost_anomaly_subscription"')
    )
    assert subscription_marker, "cost_anomaly_subscription resource not found"

    _, threshold_marker, after_threshold = after_subscription.partition(
        "threshold_expression"
    )
    assert threshold_marker, "threshold_expression not found in subscription"

    # Stop at the subscribers argument so later resources are not inspected.
    threshold_section = after_threshold.partition("subscribers")[0]

    assert '"Dimensions"' in threshold_section
    assert '"Dimension"' not in threshold_section


def test_notifications_contains_budget_resource() -> None:
    """The notifications module declares the production cost budget resource."""
    resource_name = '"production_cost_budget"'
    source_text = load_infrastructure_notifications()

    assert resource_name in source_text
2 changes: 1 addition & 1 deletion libraries/python/tests/test_infrastructure_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ def test_storage_contains_ecr_lifecycle_policy_resources() -> None:
assert '"data_manager_repository_lifecycle"' in infrastructure_storage
assert '"portfolio_manager_repository_lifecycle"' in infrastructure_storage
assert '"ensemble_manager_repository_lifecycle"' in infrastructure_storage
assert '"tide_model_runner_repository_lifecycle"' in infrastructure_storage
assert '"tide_runner_repository_lifecycle"' in infrastructure_storage
28 changes: 16 additions & 12 deletions maskfile.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ echo "Development environment setup completed successfully"

#### build-and-push (package_name) (stage_name)

> Build and push Docker image directly to ECR (e.g. `portfolio-manager server`, `tide model-runner`)
> Build and push Docker image directly to ECR (e.g. `portfolio-manager server`, `tide runner`)

```bash
set -euo pipefail
Expand All @@ -69,7 +69,19 @@ if [ -z "$aws_region" ]; then
fi

commit_hash=$(git rev-parse --short HEAD)
repository_name="fund/${package_name}-${stage_name}"

if [ -f "models/${package_name}/Dockerfile" ]; then
dockerfile="models/${package_name}/Dockerfile"
build_target="${stage_name}"
namespace="models"
else
resolved_name=$(echo "${package_name}" | tr '-' '_')
dockerfile="applications/${resolved_name}/Dockerfile"
build_target="${stage_name}"
namespace="applications"
fi

repository_name="fund/${namespace}-${package_name}-${stage_name}"
image_reference="${aws_account_id}.dkr.ecr.${aws_region}.amazonaws.com/${repository_name}"

echo "Logging into ECR"
Expand All @@ -88,19 +100,11 @@ if [ "$existing_image" != "NONE" ] && [ "$existing_image" != "None" ] && [ -n "$
exit 0
fi

if [ -f "models/${package_name}/Dockerfile" ]; then
dockerfile="models/${package_name}/Dockerfile"
build_target="${stage_name}"
else
resolved_name=$(echo "${package_name}" | tr '-' '_')
dockerfile="applications/${resolved_name}/Dockerfile"
build_target="${stage_name}"
fi
cache_reference="${image_reference}:buildcache"

# Use GHA backend for caching when running in GitHub Actions
if [ -n "${GITHUB_ACTIONS:-}" ]; then
scope="${package_name}-${stage_name}"
scope="${namespace}-${package_name}-${stage_name}"
echo "Running in GitHub Actions - using hybrid cache (gha + registry) with scope: ${scope}"
cache_from_arguments="--cache-from type=gha,scope=${scope} --cache-from type=registry,ref=${cache_reference}"
cache_to_arguments="--cache-to type=gha,scope=${scope},mode=max --cache-to type=registry,ref=${cache_reference},mode=max"
Expand Down Expand Up @@ -836,7 +840,7 @@ if ! organization_name=$(pulumi org get-default 2>/dev/null) || [ -z "${organiza
fi

pulumi stack select "${organization_name}/fund/production"
tide_image_uri=$(pulumi stack output aws_ecr_tide_model_runner_image)
tide_image_uri=$(pulumi stack output aws_ecr_models_tide_runner_image)

cd "${MASKFILE_DIR}"

Expand Down
2 changes: 1 addition & 1 deletion models/tide/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ COPY tools/ tools/

RUN uv sync --no-dev --package tide

FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS model-runner
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS runner

ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC
Expand Down
2 changes: 1 addition & 1 deletion models/tide/tests/test_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_deploy_training_flow_sets_build_options(
mock_deploy = MagicMock()
mock_pipeline.deploy = mock_deploy

image = "123456789.dkr.ecr.us-east-1.amazonaws.com/fund/tide-model-runner:latest"
image = "123456789.dkr.ecr.us-east-1.amazonaws.com/fund/models-tide-runner:latest"
deploy_training_flow(image=image)

call_kwargs = mock_deploy.call_args.kwargs
Expand Down
Loading