diff --git a/applications/data_manager/Dockerfile b/applications/data_manager/Dockerfile index bfd5510f..ed2ad064 100644 --- a/applications/data_manager/Dockerfile +++ b/applications/data_manager/Dockerfile @@ -37,7 +37,12 @@ RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder -ENV CARGO_BUILD_JOBS=4 +# Limit parallel jobs to avoid OOM on memory-constrained builder nodes +ENV CARGO_BUILD_JOBS=1 + +# opt-level=1 ensures cargo-chef cook and cargo build use the same profile, +# preventing cache invalidation; acceptable trade-off for a data manager service +ENV CARGO_PROFILE_RELEASE_OPT_LEVEL=1 COPY --from=planner /app/recipe.json recipe.json diff --git a/infrastructure/compute.py b/infrastructure/compute.py index 209bc989..9610566a 100644 --- a/infrastructure/compute.py +++ b/infrastructure/compute.py @@ -239,21 +239,21 @@ data_manager_log_group = aws.cloudwatch.LogGroup( "data_manager_logs", - name="/ecs/fund/data-manager-server", + name="/ecs/fund/applications-data-manager-server", retention_in_days=7, tags=tags, ) portfolio_manager_log_group = aws.cloudwatch.LogGroup( "portfolio_manager_logs", - name="/ecs/fund/portfolio-manager-server", + name="/ecs/fund/applications-portfolio-manager-server", retention_in_days=7, tags=tags, ) ensemble_manager_log_group = aws.cloudwatch.LogGroup( "ensemble_manager_logs", - name="/ecs/fund/ensemble-manager-server", + name="/ecs/fund/applications-ensemble-manager-server", retention_in_days=7, tags=tags, ) @@ -440,6 +440,10 @@ "name": "DISABLE_DISK_CACHE", "value": "1", }, + { + "name": "AWS_S3_MODEL_ARTIFACT_PATH", + "value": "artifacts/tide/", + }, ], "secrets": [ { @@ -559,6 +563,7 @@ task_definition=ensemble_manager_task_definition.arn, desired_count=1, launch_type="FARGATE", + health_check_grace_period_seconds=180, network_configuration=aws.ecs.ServiceNetworkConfigurationArgs( subnets=[private_subnet_1.id, private_subnet_2.id], security_groups=[ecs_security_group.id], diff --git a/infrastructure/notifications.py b/infrastructure/notifications.py index 0d17f439..3aba060d 100644 --- a/infrastructure/notifications.py +++ b/infrastructure/notifications.py @@ -26,6 +26,34 @@ endpoint=notification_email_address, ) +infrastructure_alerts_topic_policy = aws.sns.TopicPolicy( + "infrastructure_alerts_topic_policy", + arn=infrastructure_alerts_topic.arn, + policy=aws.iam.get_policy_document_output( + statements=[ + aws.iam.GetPolicyDocumentStatementArgs( + sid="AWSCostAnomalyDetectionSNSPublishingPermissions", + effect="Allow", + actions=["SNS:Publish"], + principals=[ + aws.iam.GetPolicyDocumentStatementPrincipalArgs( + type="Service", + identifiers=["costalerts.amazonaws.com"], + ) + ], + resources=[infrastructure_alerts_topic.arn], + conditions=[ + aws.iam.GetPolicyDocumentStatementConditionArgs( + test="StringEquals", + variable="aws:SourceAccount", + values=[account_id], + ) + ], + ) + ] + ).json, +) + cost_anomaly_monitor = aws.costexplorer.AnomalyMonitor( "cost_anomaly_monitor", name="fund-cost-anomaly-monitor", @@ -47,24 +75,20 @@ name="fund-cost-anomaly-subscription", monitor_arn_lists=[cost_anomaly_monitor.arn], frequency="IMMEDIATE", - threshold_expression=json.dumps( - { - "Dimensions": { - "Key": "ANOMALY_TOTAL_IMPACT_ABSOLUTE", - "Values": ["25"], - "MatchOptions": ["GREATER_THAN_OR_EQUAL"], - } - } - ), - subscribers=pulumi.Output.from_input(budget_alert_email_addresses).apply( - lambda emails: [ - aws.costexplorer.AnomalySubscriptionSubscriberArgs( - address=email, - type="EMAIL", - ) - for email in emails - ] + threshold_expression=aws.costexplorer.AnomalySubscriptionThresholdExpressionArgs( + dimension=aws.costexplorer.AnomalySubscriptionThresholdExpressionDimensionArgs( + key="ANOMALY_TOTAL_IMPACT_ABSOLUTE", + values=["25"], + match_options=["GREATER_THAN_OR_EQUAL"], + ) ), + subscribers=[ + aws.costexplorer.AnomalySubscriptionSubscriberArgs( + address=infrastructure_alerts_topic.arn, + type="SNS", + ) + ], + opts=pulumi.ResourceOptions(depends_on=[infrastructure_alerts_topic_policy]), tags=tags, ) diff --git a/infrastructure/pyproject.toml b/infrastructure/pyproject.toml index 090ec2b2..c549a2cb 100644 --- a/infrastructure/pyproject.toml +++ b/infrastructure/pyproject.toml @@ -5,7 +5,7 @@ description = "Infrastructure management with Pulumi" requires-python = "==3.12.10" dependencies = [ "pulumi>=3.189.0", - "pulumi-aws>=7.4.0", + "pulumi-aws>=7.7.0", "pulumi-command>=1.1.0", "pulumi-docker>=4.10.0", "pulumi-tls>=5.2.1", diff --git a/infrastructure/training.py b/infrastructure/training.py index a96a360c..041322fe 100644 --- a/infrastructure/training.py +++ b/infrastructure/training.py @@ -1,8 +1,9 @@ import base64 import json +import pulumi import pulumi_aws as aws -from config import tags +from config import region, tags from iam import execution_role, task_role from networking import ecs_security_group, private_subnet_1, private_subnet_2 from storage import tide_runner_image_uri @@ -157,31 +158,35 @@ tide_trainer_task_definition = aws.ecs.TaskDefinition( "tide_trainer_task_definition", - family="fund-tide-trainer", + family="tide-runner", requires_compatibilities=["EC2"], network_mode="awsvpc", cpu="4096", memory="14336", execution_role_arn=execution_role.arn, task_role_arn=task_role.arn, - container_definitions=tide_runner_image_uri.apply( - lambda image_uri: json.dumps( + container_definitions=pulumi.Output.all( + models_log_group.name, + tide_runner_image_uri, + ).apply( + lambda args: json.dumps( [ { "name": "prefect", - "image": image_uri, + "image": args[1], "essential": True, "resourceRequirements": [{"type": "GPU", "value": "1"}], "logConfiguration": { "logDriver": "awslogs", "options": { - "awslogs-group": "/ecs/fund/models", - "awslogs-region": "us-east-1", + "awslogs-group": args[0], + "awslogs-region": region, "awslogs-stream-prefix": "tide", }, }, } - ] + ], + sort_keys=True, ) ), tags=tags, diff --git a/uv.lock b/uv.lock index 252efd08..5821962d 100644 --- a/uv.lock +++ b/uv.lock @@ -1141,7 +1141,7 @@ requires-dist = [ { name = "pip", specifier = ">=25.3,<26.0" }, { name = "protobuf", specifier = ">=5.29.5,<6.0.0" }, { name = "pulumi", specifier = ">=3.189.0" }, - { name = "pulumi-aws", specifier = ">=7.4.0" }, + { name = "pulumi-aws", specifier = ">=7.7.0" }, { name = "pulumi-command", specifier = ">=1.1.0" }, { name = "pulumi-docker", specifier = ">=4.10.0" }, { name = "pulumi-tls", specifier = ">=5.2.1" },