Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions .github/workflows/launch_infrastructure.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ jobs:
- application: ensemble-manager
stage: server
paths: applications/ensemble_manager/**
- application: model-trainer
stage: server-worker
- application: tide
stage: model-runner
paths: models/**
Comment thread
forstmeier marked this conversation as resolved.
steps:
- name: Checkout code
Expand Down Expand Up @@ -112,12 +112,6 @@ jobs:
- application: ensemble-manager
stage: server
paths: applications/ensemble_manager/**
- application: model-trainer
stage: server
paths: models/**
- application: model-trainer
stage: worker
paths: models/**
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
38 changes: 15 additions & 23 deletions devenv.nix
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ in {
scripts.ecs-deploy.exec = ''
unset AWS_ENDPOINT_URL AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
SERVICE="$1"
CLUSTER="fund-application"
CLUSTER="fund-applications"

if [ -z "$SERVICE" ]; then
echo "Usage: ecs-deploy <${lib.concatStringsSep "|" deployableServices}|all>"
Expand Down Expand Up @@ -227,7 +227,7 @@ in {
# Show ECS service status
scripts.ecs-status.exec = ''
unset AWS_ENDPOINT_URL AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
CLUSTER="fund-application"
CLUSTER="fund-applications"
echo "=== ECS Services ==="
aws ecs list-services --cluster "$CLUSTER" --region ${awsRegion} --query 'serviceArns[*]' --output table 2>/dev/null || echo "Cluster not found"
echo ""
Expand Down Expand Up @@ -269,17 +269,11 @@ in {
'';

# Create ECS work pool and register training deployment on Prefect Cloud
scripts.training-init.exec = ''
if [ -z "$PREFECT_API_KEY" ]; then
echo "PREFECT_API_KEY not set. Add it to .envrc and run 'direnv allow'."
exit 1
fi

# Override the local dev PREFECT_API_URL so the CLI targets Prefect Cloud
scripts.initialize-remote-trainer.exec = ''
unset PREFECT_API_URL

echo "Creating fund-work-pool-ecs work pool on Prefect Cloud..."
uv run --package tools prefect work-pool create "fund-work-pool-ecs" --type ecs 2>/dev/null \
echo "Creating fund-models-remote work pool on Prefect Cloud..."
uv run --package tools prefect work-pool create "fund-models-remote" --type ecs 2>/dev/null \
|| echo " already exists"

echo "Registering training deployments..."
Expand All @@ -292,20 +286,18 @@ in {
# --- Local dev commands ---

# Create work pool and register training deployment locally
scripts.training-setup.exec = ''
scripts.initialize-local-trainer.exec = ''
echo "Waiting for orchestrator..."
while ! curl -sf http://localhost:4200/api/health > /dev/null 2>&1; do
sleep 2
done

echo "Creating fund-work-pool-local work pool..."
PREFECT_API_URL="http://localhost:4200/api" \
uv run --package tools prefect work-pool create "fund-work-pool-local" --type process 2>/dev/null \
echo "Creating fund-models-local work pool..."
uv run --package tools prefect work-pool create "fund-models-local" --type process 2>/dev/null \
|| echo " already exists"

echo "Registering daily-training deployment..."
PREFECT_API_URL="http://localhost:4200/api" \
uv run --package tide python -m tide.deploy
echo "Registering local training deployment..."
uv run prefect --no-prompt deploy --name tide-trainer-local

Comment thread
forstmeier marked this conversation as resolved.
echo ""
echo "Done. Visit http://localhost:4200 to see the orchestrator dashboard."
Expand Down Expand Up @@ -363,12 +355,12 @@ in {

# Create work pool and register deployment on first startup
PREFECT_API_URL="http://localhost:4200/api" \
uv run --package tools prefect work-pool create "fund-work-pool-local" --type process 2>/dev/null || true
uv run --package tools prefect work-pool create "fund-models-local" --type process 2>/dev/null || true
PREFECT_API_URL="http://localhost:4200/api" \
uv run --package tide python -m tide.deploy 2>/dev/null || true

cd tools
exec uv run prefect worker start --pool fund-work-pool-local --name worker-1
exec uv run prefect worker start --pool fund-models-local --name worker-1
'';

training-worker-2.exec = ''
Expand All @@ -377,7 +369,7 @@ in {
done
sleep 3
cd tools
exec uv run prefect worker start --pool fund-work-pool-local --name worker-2
exec uv run prefect worker start --pool fund-models-local --name worker-2
'';

data-manager.exec = ''
Expand Down Expand Up @@ -447,10 +439,10 @@ in {
echo " ecs-deploy <svc> Force ECS service redeployment"
echo " deploy <svc|all> Build, push, and redeploy (ecr-push + ecs-deploy)"
echo " ecs-status Show ECS service status"
echo " training-init Create work pool + register deployment (prod)"
echo " initialize-remote-trainer Create work pool + register deployment (prod)"
echo ""
echo " Local:"
echo " training-setup Create work pool + register deployment (local)"
echo " initialize-local-trainer Create work pool + register deployment (local)"
echo " cleanup-services Kill stale local processes"
'';

Expand Down
12 changes: 7 additions & 5 deletions infrastructure/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
model_artifacts_bucket,
portfolio_manager_image_uri,
portfolio_manager_repository,
tide_runner_image_uri,
tide_runner_repository,
tide_model_runner_image_uri,
tide_model_runner_repository,
)
from training import models_cluster

protocol = "https://" if acm_certificate_arn else "http://"

Expand All @@ -33,6 +34,7 @@
pulumi.export("aws_account_id", account_id)
pulumi.export("aws_vpc_id", vpc.id)
pulumi.export("aws_ecs_cluster_name", cluster.name)
pulumi.export("aws_ecs_models_cluster_name", models_cluster.name)
pulumi.export("aws_alb_dns_name", alb.dns_name)
pulumi.export("aws_alb_url", pulumi.Output.concat(protocol, alb.dns_name))
pulumi.export("aws_service_discovery_namespace", service_discovery_namespace.name)
Expand All @@ -52,10 +54,10 @@
pulumi.Output.unsecret(model_artifacts_bucket.bucket),
)
pulumi.export(
"aws_ecr_tide_runner_repository",
tide_runner_repository.repository_url,
"aws_ecr_tide_model_runner_repository",
tide_model_runner_repository.repository_url,
)
pulumi.export("aws_ecr_tide_runner_image", tide_runner_image_uri)
pulumi.export("aws_ecr_tide_model_runner_image", tide_model_runner_image_uri)
pulumi.export(
"aws_iam_github_actions_infrastructure_role_arn",
github_actions_infrastructure_role.arn,
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

cluster = aws.ecs.Cluster(
"ecs_cluster",
name="fund-application",
name="fund-applications",
Comment thread
forstmeier marked this conversation as resolved.
settings=[aws.ecs.ClusterSettingArgs(name="containerInsights", value="enabled")],
tags=tags,
Comment thread
forstmeier marked this conversation as resolved.
)
Expand Down
78 changes: 77 additions & 1 deletion infrastructure/iam.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"Sid": "ManageEC2ECSELBBudgetsAndServiceDiscovery",
"Effect": "Allow",
"Action": [
"autoscaling:*",
"ec2:*",
"ecs:*",
"elasticloadbalancing:*",
Expand Down Expand Up @@ -259,6 +260,7 @@
"Condition": {
"StringEquals": {
"iam:AWSServiceName": [
"autoscaling.amazonaws.com",
"ecs.amazonaws.com",
"elasticloadbalancing.amazonaws.com",
]
Expand Down Expand Up @@ -372,6 +374,77 @@
tags=tags,
)

# IAM policy granting the permissions needed to manage the
# "fund-models-instance-role" role and the "fund-models-instance-profile"
# instance profile. Every statement is scoped to those two resource ARNs.
github_actions_trainer_policy = aws.iam.Policy(
    "github_actions_trainer_policy",
    name="fund-github-actions-trainer-policy",
    description="Trainer infrastructure permissions for GitHub Actions deployments.",
    policy=json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    # Scope role creation by ARN. IAM does not define an
                    # "iam:RoleName" condition key, and a StringEquals
                    # condition on a key missing from the request context
                    # never matches, so a name condition on Resource "*"
                    # would grant nothing. iam:CreateRole supports
                    # resource-level permissions on the role ARN instead.
                    "Sid": "CreateTrainerRole",
                    "Effect": "Allow",
                    "Action": "iam:CreateRole",
                    "Resource": (
                        f"arn:aws:iam::{account_id}:role/fund-models-instance-role"
                    ),
                },
                {
                    "Sid": "ManageTrainerRole",
                    "Effect": "Allow",
                    "Action": [
                        "iam:AttachRolePolicy",
                        "iam:DeleteRole",
                        "iam:DetachRolePolicy",
                        "iam:PassRole",
                        "iam:TagRole",
                        "iam:UntagRole",
                        "iam:UpdateAssumeRolePolicy",
                    ],
                    "Resource": (
                        f"arn:aws:iam::{account_id}:role/fund-models-instance-role"
                    ),
                    # The "...IfExists" operators apply each restriction only
                    # to requests that carry the key: iam:PolicyARN limits
                    # which policies may be attached/detached, and
                    # iam:PassedToService limits PassRole to EC2. Actions that
                    # carry neither key (e.g. TagRole) are not blocked.
                    "Condition": {
                        "ArnLikeIfExists": {
                            "iam:PolicyARN": [
                                "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role",
                                f"arn:aws:iam::{account_id}:policy/fund-*",
                            ]
                        },
                        "StringLikeIfExists": {
                            "iam:PassedToService": "ec2.amazonaws.com",
                        },
                    },
                },
                {
                    "Sid": "ManageTrainingInstanceProfile",
                    "Effect": "Allow",
                    "Action": [
                        "iam:AddRoleToInstanceProfile",
                        "iam:CreateInstanceProfile",
                        "iam:DeleteInstanceProfile",
                        "iam:GetInstanceProfile",
                        "iam:RemoveRoleFromInstanceProfile",
                        "iam:TagInstanceProfile",
                        "iam:UntagInstanceProfile",
                    ],
                    "Resource": (
                        f"arn:aws:iam::{account_id}:instance-profile"
                        "/fund-models-instance-profile"
                    ),
                },
            ],
        },
        # Sorted keys keep the serialized document stable between runs.
        sort_keys=True,
    ),
    # retain_on_delete: leave the policy in AWS if this resource is removed
    # from the Pulumi program.
    opts=pulumi.ResourceOptions(retain_on_delete=True),
    tags=tags,
)

github_actions_infrastructure_role = aws.iam.Role(
"github_actions_infrastructure_role",
name=github_actions_role_name,
Expand Down Expand Up @@ -403,7 +476,10 @@
sort_keys=True,
)
),
managed_policy_arns=[github_actions_infrastructure_policy.arn],
managed_policy_arns=[
github_actions_infrastructure_policy.arn,
github_actions_trainer_policy.arn,
],
opts=pulumi.ResourceOptions(retain_on_delete=True),
tags=tags,
)
Expand Down
12 changes: 6 additions & 6 deletions infrastructure/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,9 @@
policy=_ecr_lifecycle_policy,
)

tide_runner_repository = aws.ecr.Repository(
"tide_runner_repository",
name="fund/tide-runner",
tide_model_runner_repository = aws.ecr.Repository(
"tide_model_runner_repository",
name="fund/tide-model-runner",
image_tag_mutability="MUTABLE",
force_delete=True,
Comment thread
forstmeier marked this conversation as resolved.
image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs(
Expand All @@ -169,8 +169,8 @@
)

aws.ecr.LifecyclePolicy(
"tide_runner_repository_lifecycle",
repository=tide_runner_repository.name,
"tide_model_runner_repository_lifecycle",
repository=tide_model_runner_repository.name,
policy=_ecr_lifecycle_policy,
)

Expand All @@ -185,6 +185,6 @@
ensemble_manager_image_uri = ensemble_manager_repository.repository_url.apply(
lambda url: f"{url}:latest"
)
tide_runner_image_uri = tide_runner_repository.repository_url.apply(
tide_model_runner_image_uri = tide_model_runner_repository.repository_url.apply(
lambda url: f"{url}:latest"
)
Loading
Loading