diff --git a/.github/workflows/launch_infrastructure.yaml b/.github/workflows/launch_infrastructure.yaml index 0b1ff265..44a03762 100644 --- a/.github/workflows/launch_infrastructure.yaml +++ b/.github/workflows/launch_infrastructure.yaml @@ -28,8 +28,8 @@ jobs: - application: ensemble-manager stage: server paths: applications/ensemble_manager/** - - application: model-trainer - stage: server-worker + - application: tide + stage: model-runner paths: models/** steps: - name: Checkout code @@ -112,12 +112,6 @@ jobs: - application: ensemble-manager stage: server paths: applications/ensemble_manager/** - - application: model-trainer - stage: server - paths: models/** - - application: model-trainer - stage: worker - paths: models/** steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/devenv.nix b/devenv.nix index 48894469..84717832 100644 --- a/devenv.nix +++ b/devenv.nix @@ -180,7 +180,7 @@ in { scripts.ecs-deploy.exec = '' unset AWS_ENDPOINT_URL AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY SERVICE="$1" - CLUSTER="fund-application" + CLUSTER="fund-applications" if [ -z "$SERVICE" ]; then echo "Usage: ecs-deploy <${lib.concatStringsSep "|" deployableServices}|all>" @@ -227,7 +227,7 @@ in { # Show ECS service status scripts.ecs-status.exec = '' unset AWS_ENDPOINT_URL AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY - CLUSTER="fund-application" + CLUSTER="fund-applications" echo "=== ECS Services ===" aws ecs list-services --cluster "$CLUSTER" --region ${awsRegion} --query 'serviceArns[*]' --output table 2>/dev/null || echo "Cluster not found" echo "" @@ -269,17 +269,11 @@ in { ''; # Create ECS work pool and register training deployment on Prefect Cloud - scripts.training-init.exec = '' - if [ -z "$PREFECT_API_KEY" ]; then - echo "PREFECT_API_KEY not set. Add it to .envrc and run 'direnv allow'." - exit 1 - fi - - # Override the local dev PREFECT_API_URL so the CLI targets Prefect Cloud + scripts.initialize-remote-trainer.exec = '' unset PREFECT_API_URL - echo "Creating fund-work-pool-ecs work pool on Prefect Cloud..." - uv run --package tools prefect work-pool create "fund-work-pool-ecs" --type ecs 2>/dev/null \ + echo "Creating fund-models-remote work pool on Prefect Cloud..." + uv run --package tools prefect work-pool create "fund-models-remote" --type ecs 2>/dev/null \ || echo " already exists" echo "Registering training deployments..." @@ -292,20 +286,18 @@ in { # --- Local dev commands --- # Create work pool and register training deployment locally - scripts.training-setup.exec = '' + scripts.initialize-local-trainer.exec = '' echo "Waiting for orchestrator..." while ! curl -sf http://localhost:4200/api/health > /dev/null 2>&1; do sleep 2 done - echo "Creating fund-work-pool-local work pool..." - PREFECT_API_URL="http://localhost:4200/api" \ - uv run --package tools prefect work-pool create "fund-work-pool-local" --type process 2>/dev/null \ + echo "Creating fund-models-local work pool..." + uv run --package tools prefect work-pool create "fund-models-local" --type process 2>/dev/null \ || echo " already exists" - echo "Registering daily-training deployment..." - PREFECT_API_URL="http://localhost:4200/api" \ - uv run --package tide python -m tide.deploy + echo "Registering local training deployment..." + uv run prefect --no-prompt deploy --name tide-trainer-local echo "" echo "Done. Visit http://localhost:4200 to see the orchestrator dashboard." @@ -363,12 +355,12 @@ in { # Create work pool and register deployment on first startup PREFECT_API_URL="http://localhost:4200/api" \ - uv run --package tools prefect work-pool create "fund-work-pool-local" --type process 2>/dev/null || true + uv run --package tools prefect work-pool create "fund-models-local" --type process 2>/dev/null || true PREFECT_API_URL="http://localhost:4200/api" \ uv run --package tide python -m tide.deploy 2>/dev/null || true cd tools - exec uv run prefect worker start --pool fund-work-pool-local --name worker-1 + exec uv run prefect worker start --pool fund-models-local --name worker-1 ''; training-worker-2.exec = '' @@ -377,7 +369,7 @@ in { done sleep 3 cd tools - exec uv run prefect worker start --pool fund-work-pool-local --name worker-2 + exec uv run prefect worker start --pool fund-models-local --name worker-2 ''; data-manager.exec = '' @@ -447,10 +439,10 @@ in { echo " ecs-deploy Force ECS service redeployment" echo " deploy Build, push, and redeploy (ecr-push + ecs-deploy)" echo " ecs-status Show ECS service status" - echo " training-init Create work pool + register deployment (prod)" + echo " initialize-remote-trainer Create work pool + register deployment (prod)" echo "" echo " Local:" - echo " training-setup Create work pool + register deployment (local)" + echo " initialize-local-trainer Create work pool + register deployment (local)" echo " cleanup-services Kill stale local processes" ''; diff --git a/infrastructure/__main__.py b/infrastructure/__main__.py index 6af26a17..bfd7d145 100644 --- a/infrastructure/__main__.py +++ b/infrastructure/__main__.py @@ -12,9 +12,10 @@ model_artifacts_bucket, portfolio_manager_image_uri, portfolio_manager_repository, - tide_runner_image_uri, - tide_runner_repository, + tide_model_runner_image_uri, + tide_model_runner_repository, ) +from training import models_cluster protocol = "https://" if acm_certificate_arn else "http://" @@ -33,6 +34,7 @@ pulumi.export("aws_account_id", account_id) pulumi.export("aws_vpc_id", vpc.id) pulumi.export("aws_ecs_cluster_name", cluster.name) +pulumi.export("aws_ecs_models_cluster_name", models_cluster.name) pulumi.export("aws_alb_dns_name", alb.dns_name) pulumi.export("aws_alb_url", pulumi.Output.concat(protocol, alb.dns_name)) pulumi.export("aws_service_discovery_namespace", service_discovery_namespace.name) @@ -52,10 +54,10 @@ pulumi.Output.unsecret(model_artifacts_bucket.bucket), ) pulumi.export( - "aws_ecr_tide_runner_repository", - tide_runner_repository.repository_url, + "aws_ecr_tide_model_runner_repository", + tide_model_runner_repository.repository_url, ) -pulumi.export("aws_ecr_tide_runner_image", tide_runner_image_uri) +pulumi.export("aws_ecr_tide_model_runner_image", tide_model_runner_image_uri) pulumi.export( "aws_iam_github_actions_infrastructure_role_arn", github_actions_infrastructure_role.arn, diff --git a/infrastructure/compute.py b/infrastructure/compute.py index 099aef3f..209bc989 100644 --- a/infrastructure/compute.py +++ b/infrastructure/compute.py @@ -28,7 +28,7 @@ cluster = aws.ecs.Cluster( "ecs_cluster", - name="fund-application", + name="fund-applications", settings=[aws.ecs.ClusterSettingArgs(name="containerInsights", value="enabled")], tags=tags, ) diff --git a/infrastructure/iam.py b/infrastructure/iam.py index c9912958..7eab9b6c 100644 --- a/infrastructure/iam.py +++ b/infrastructure/iam.py @@ -94,6 +94,7 @@ "Sid": "ManageEC2ECSELBBudgetsAndServiceDiscovery", "Effect": "Allow", "Action": [ + "autoscaling:*", "ec2:*", "ecs:*", "elasticloadbalancing:*", @@ -259,6 +260,7 @@ "Condition": { "StringEquals": { "iam:AWSServiceName": [ + "autoscaling.amazonaws.com", "ecs.amazonaws.com", "elasticloadbalancing.amazonaws.com", ] @@ -372,6 +374,77 @@ tags=tags, ) +github_actions_trainer_policy = aws.iam.Policy( + "github_actions_trainer_policy", + name="fund-github-actions-trainer-policy", + description="Trainer infrastructure permissions for GitHub Actions deployments.", + policy=json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CreateTrainerRole", + "Effect": "Allow", + "Action": "iam:CreateRole", + "Resource": "*", + "Condition": { + "StringEquals": { + "iam:RoleName": "fund-models-instance-role", + } + }, + }, + { + "Sid": "ManageTrainerRole", + "Effect": "Allow", + "Action": [ + "iam:AttachRolePolicy", + "iam:DeleteRole", + "iam:DetachRolePolicy", + "iam:PassRole", + "iam:TagRole", + "iam:UntagRole", + "iam:UpdateAssumeRolePolicy", + ], + "Resource": ( + f"arn:aws:iam::{account_id}:role/fund-models-instance-role" + ), + "Condition": { + "ArnLikeIfExists": { + "iam:PolicyARN": [ + "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role", + f"arn:aws:iam::{account_id}:policy/fund-*", + ] + }, + "StringLikeIfExists": { + "iam:PassedToService": "ec2.amazonaws.com", + }, + }, + }, + { + "Sid": "ManageTrainingInstanceProfile", + "Effect": "Allow", + "Action": [ + "iam:AddRoleToInstanceProfile", + "iam:CreateInstanceProfile", + "iam:DeleteInstanceProfile", + "iam:GetInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:TagInstanceProfile", + "iam:UntagInstanceProfile", + ], + "Resource": ( + f"arn:aws:iam::{account_id}:instance-profile" + "/fund-models-instance-profile" + ), + }, + ], + }, + sort_keys=True, + ), + opts=pulumi.ResourceOptions(retain_on_delete=True), + tags=tags, +) + github_actions_infrastructure_role = aws.iam.Role( "github_actions_infrastructure_role", name=github_actions_role_name, @@ -403,7 +476,10 @@ sort_keys=True, ) ), - managed_policy_arns=[github_actions_infrastructure_policy.arn], + managed_policy_arns=[ + github_actions_infrastructure_policy.arn, + github_actions_trainer_policy.arn, + ], opts=pulumi.ResourceOptions(retain_on_delete=True), tags=tags, ) diff --git a/infrastructure/storage.py b/infrastructure/storage.py index 6101dcf3..da650822 100644 --- a/infrastructure/storage.py +++ b/infrastructure/storage.py @@ -157,9 +157,9 @@ policy=_ecr_lifecycle_policy, ) -tide_runner_repository = aws.ecr.Repository( - "tide_runner_repository", - name="fund/tide-runner", +tide_model_runner_repository = aws.ecr.Repository( + "tide_model_runner_repository", + name="fund/tide-model-runner", image_tag_mutability="MUTABLE", force_delete=True, image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( @@ -169,8 +169,8 @@ ) aws.ecr.LifecyclePolicy( - "tide_runner_repository_lifecycle", - repository=tide_runner_repository.name, + "tide_model_runner_repository_lifecycle", + repository=tide_model_runner_repository.name, policy=_ecr_lifecycle_policy, ) @@ -185,6 +185,6 @@ ensemble_manager_image_uri = ensemble_manager_repository.repository_url.apply( lambda url: f"{url}:latest" ) -tide_runner_image_uri = tide_runner_repository.repository_url.apply( +tide_model_runner_image_uri = tide_model_runner_repository.repository_url.apply( lambda url: f"{url}:latest" ) diff --git a/infrastructure/training.py b/infrastructure/training.py new file mode 100644 index 00000000..ba539737 --- /dev/null +++ b/infrastructure/training.py @@ -0,0 +1,162 @@ +import base64 +import json + +import pulumi_aws as aws +from config import tags +from iam import execution_role, task_role +from networking import ecs_security_group, private_subnet_1, private_subnet_2 + +models_cluster = aws.ecs.Cluster( + "models_cluster", + name="fund-models", + settings=[aws.ecs.ClusterSettingArgs(name="containerInsights", value="enabled")], + tags=tags, +) + +models_instance_role = aws.iam.Role( + "models_instance_role", + name="fund-models-instance-role", + assume_role_policy=json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": {"Service": "ec2.amazonaws.com"}, + } + ], + }, + sort_keys=True, + ), + tags=tags, +) + +aws.iam.RolePolicyAttachment( + "models_instance_role_ecs_policy", + role=models_instance_role.name, + policy_arn="arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role", +) + +models_instance_profile = aws.iam.InstanceProfile( + "models_instance_profile", + name="fund-models-instance-profile", + role=models_instance_role.name, + tags=tags, +) + +models_ami_parameter = aws.ssm.get_parameter_output( + name="/aws/service/ecs/optimized-ami/amazon-linux-2023/gpu/recommended", +) + +models_ami_id = models_ami_parameter.value.apply( + lambda value: json.loads(value)["image_id"] +) + +models_launch_template = aws.ec2.LaunchTemplate( + "models_launch_template", + name="fund-models-gpu", + image_id=models_ami_id, + instance_type="g4dn.xlarge", + metadata_options=aws.ec2.LaunchTemplateMetadataOptionsArgs( + http_endpoint="enabled", + http_tokens="required", + ), + iam_instance_profile=aws.ec2.LaunchTemplateIamInstanceProfileArgs( + arn=models_instance_profile.arn, + ), + vpc_security_group_ids=[ecs_security_group.id], + user_data=models_cluster.name.apply( + lambda cluster_name: base64.b64encode( + "\n".join( + [ + "#!/bin/bash", + f"echo ECS_CLUSTER={cluster_name} >> /etc/ecs/ecs.config", + "echo ECS_ENABLE_GPU_SUPPORT=true >> /etc/ecs/ecs.config", + ] + ).encode() + ).decode() + ), + tag_specifications=[ + aws.ec2.LaunchTemplateTagSpecificationArgs( + resource_type="instance", + tags={**tags, "Name": "fund-models-gpu"}, + ), + aws.ec2.LaunchTemplateTagSpecificationArgs( + resource_type="volume", + tags={**tags, "Name": "fund-models-gpu"}, + ), + ], + tags=tags, +) + +models_asg = aws.autoscaling.Group( + "models_asg", + name="fund-models-gpu", + min_size=0, + max_size=1, + desired_capacity=0, + vpc_zone_identifiers=[private_subnet_1.id, private_subnet_2.id], + launch_template=aws.autoscaling.GroupLaunchTemplateArgs( + id=models_launch_template.id, + version="$Latest", + ), + protect_from_scale_in=True, + tags=[ + aws.autoscaling.GroupTagArgs( + key="Name", + value="fund-models-gpu", + propagate_at_launch=True, + ), + ] + + [ + aws.autoscaling.GroupTagArgs( + key=key, + value=value, + propagate_at_launch=True, + ) + for key, value in tags.items() + ], +) + +models_capacity_provider = aws.ecs.CapacityProvider( + "models_capacity_provider", + name="fund-models-gpu", + auto_scaling_group_provider=aws.ecs.CapacityProviderAutoScalingGroupProviderArgs( + auto_scaling_group_arn=models_asg.arn, + managed_scaling=aws.ecs.CapacityProviderAutoScalingGroupProviderManagedScalingArgs( + status="ENABLED", + target_capacity=100, + minimum_scaling_step_size=1, + maximum_scaling_step_size=1, + ), + managed_termination_protection="ENABLED", + ), + tags=tags, +) + +aws.ecs.ClusterCapacityProviders( + "models_cluster_capacity_providers", + cluster_name=models_cluster.name, + capacity_providers=[models_capacity_provider.name], + default_capacity_provider_strategies=[ + aws.ecs.ClusterCapacityProvidersDefaultCapacityProviderStrategyArgs( + capacity_provider=models_capacity_provider.name, + weight=1, + ) + ], +) + +models_log_group = aws.cloudwatch.LogGroup( + "models_logs", + name="/ecs/fund/models", + retention_in_days=7, + tags=tags, +) + +__all__ = [ + "execution_role", + "models_cluster", + "models_log_group", + "task_role", +] diff --git a/libraries/python/tests/test_infrastructure_iam.py b/libraries/python/tests/test_infrastructure_iam.py index 6e9b0398..9670bf12 100644 --- a/libraries/python/tests/test_infrastructure_iam.py +++ b/libraries/python/tests/test_infrastructure_iam.py @@ -18,10 +18,9 @@ def test_iam_attaches_custom_github_actions_policy() -> None: infrastructure_iam = load_infrastructure_iam() assert '"github_actions_infrastructure_policy"' in infrastructure_iam - assert ( - "managed_policy_arns=[github_actions_infrastructure_policy.arn]" - in infrastructure_iam - ) + assert '"github_actions_trainer_policy"' in infrastructure_iam + assert "github_actions_infrastructure_policy.arn" in infrastructure_iam + assert "github_actions_trainer_policy.arn" in infrastructure_iam def test_iam_scopes_oidc_provider_creation_statement() -> None: diff --git a/libraries/python/tests/test_infrastructure_storage.py b/libraries/python/tests/test_infrastructure_storage.py index afdf9565..cdd60f4f 100644 --- a/libraries/python/tests/test_infrastructure_storage.py +++ b/libraries/python/tests/test_infrastructure_storage.py @@ -28,4 +28,4 @@ def test_storage_contains_ecr_lifecycle_policy_resources() -> None: assert '"data_manager_repository_lifecycle"' in infrastructure_storage assert '"portfolio_manager_repository_lifecycle"' in infrastructure_storage assert '"ensemble_manager_repository_lifecycle"' in infrastructure_storage - assert '"tide_runner_repository_lifecycle"' in infrastructure_storage + assert '"tide_model_runner_repository_lifecycle"' in infrastructure_storage diff --git a/libraries/python/tests/test_infrastructure_training.py b/libraries/python/tests/test_infrastructure_training.py new file mode 100644 index 00000000..e2f9d6c2 --- /dev/null +++ b/libraries/python/tests/test_infrastructure_training.py @@ -0,0 +1,43 @@ +from pathlib import Path + +REPOSITORY_ROOT = Path(__file__).resolve().parents[3] +INFRASTRUCTURE_TRAINING_PATH = REPOSITORY_ROOT / "infrastructure" / "training.py" + + +def load_infrastructure_training() -> str: + return INFRASTRUCTURE_TRAINING_PATH.read_text(encoding="utf-8") + + +def test_models_cluster_uses_ec2_backed_capacity_provider() -> None: + infrastructure_training = load_infrastructure_training() + + assert '"models_cluster"' in infrastructure_training + assert '"models_capacity_provider"' in infrastructure_training + assert '"models_cluster_capacity_providers"' in infrastructure_training + + +def test_trainer_uses_gpu_instance_type() -> None: + infrastructure_training = load_infrastructure_training() + + assert 'instance_type="g4dn.xlarge"' in infrastructure_training + assert "amazon-linux-2023/gpu/recommended" in infrastructure_training + + +def test_trainer_asg_scales_to_zero() -> None: + infrastructure_training = load_infrastructure_training() + + assert "min_size=0" in infrastructure_training + assert "desired_capacity=0" in infrastructure_training + + +def test_models_instance_profile_configured() -> None: + infrastructure_training = load_infrastructure_training() + + assert '"models_instance_profile"' in infrastructure_training + assert "AmazonEC2ContainerServiceforEC2Role" in infrastructure_training + + +def test_trainer_gpu_support_enabled_in_user_data() -> None: + infrastructure_training = load_infrastructure_training() + + assert "ECS_ENABLE_GPU_SUPPORT=true" in infrastructure_training diff --git a/maskfile.md b/maskfile.md index cbe4aeba..098dc545 100644 --- a/maskfile.md +++ b/maskfile.md @@ -54,7 +54,7 @@ echo "Development environment setup completed successfully" #### build (package_name) (stage_name) -> Build Docker images with optional cache pull (e.g. `portfolio-manager server`, `tide runner`) +> Build Docker images with optional cache pull (e.g. `portfolio-manager server`, `tide model-runner`) ```bash set -euo pipefail @@ -133,7 +133,7 @@ echo "Image built: ${package_name} ${stage_name}" #### push (package_name) (stage_name) -> Push Docker image to ECR (e.g. `portfolio-manager server`, `tide runner`) +> Push Docker image to ECR (e.g. `portfolio-manager server`, `tide model-runner`) ```bash set -euo pipefail @@ -193,7 +193,7 @@ case "${package_name}-${stage_name}" in data-manager-server) service="fund-data-manager-server" ;; portfolio-manager-server) service="fund-portfolio-manager-server" ;; ensemble-manager-server) service="fund-ensemble-manager-server" ;; - tide-runner) echo "No ECS service for tide runner" && exit 0 ;; + tide-model-runner) echo "tide-model-runner is used for Prefect training jobs, not an ECS service" && exit 0 ;; *) echo "Unknown service: ${package_name}-${stage_name}" && exit 1 ;; esac @@ -415,6 +415,49 @@ case "$application_name" in esac ``` +### models + +> Manage Prefect Cloud model resources + +#### initialize (environment) + +> Create work pool and register deployments (environment: remote, local) + +```bash +set -euo pipefail + +case "${environment}" in + remote) + unset PREFECT_API_URL + + echo "Creating fund-models-remote work pool on Prefect Cloud..." + uv run --package tools prefect work-pool create "fund-models-remote" --type ecs 2>/dev/null \ + || echo " already exists" + + echo "Registering training deployments..." + uv run prefect --no-prompt deploy --all + + echo "" + echo "Done. Visit Prefect Cloud dashboard to view deployments." + ;; + local) + export PREFECT_API_URL="http://localhost:4200/api" + + echo "Creating fund-models-local work pool..." + uv run --package tools prefect work-pool create "fund-models-local" --type process 2>/dev/null \ + || echo " already exists" + + echo "Registering local training deployment..." + uv run prefect --no-prompt deploy --name tide-trainer-local + ;; + *) + echo "Unknown environment: ${environment}" + echo "Valid options: remote, local" + exit 1 + ;; +esac +``` + ## development > Python and Rust development tools and code quality checks @@ -802,25 +845,14 @@ esac ### deploy (model_name) -> Register flow deployment with Prefect server +> Register flow deployment with Prefect Cloud ```bash set -euo pipefail -cd infrastructure - -if ! organization_name=$(pulumi org get-default 2>/dev/null) || [ -z "${organization_name}" ]; then - echo "Unable to determine Pulumi organization name - ensure you are logged in" - exit 1 -fi - -pulumi stack select ${organization_name}/fund/production - -export PREFECT_API_URL="$(pulumi stack output fund_base_url)" +unset PREFECT_API_URL export FUND_LOOKBACK_DAYS="${FUND_LOOKBACK_DAYS:-365}" -cd ../ - case "${model_name}" in tide) uv run python -m tide.deploy diff --git a/models/Dockerfile b/models/Dockerfile deleted file mode 100644 index d74356b3..00000000 --- a/models/Dockerfile +++ /dev/null @@ -1,63 +0,0 @@ -FROM python:3.12.10-slim AS builder - -COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /bin/uv - -RUN apt-get update && \ - apt-get install -y --no-install-recommends build-essential clang && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -COPY pyproject.toml uv.lock ./ - -COPY tools/ tools/ - -COPY applications/ensemble_manager/ applications/ensemble_manager/ - -COPY models/tide/ models/tide/ - -COPY libraries/python/ libraries/python/ - -RUN uv sync --no-dev - -FROM python:3.12.10-slim AS server-worker - -WORKDIR /app - -RUN groupadd --system worker && useradd --system --gid worker worker - -COPY --from=ghcr.io/astral-sh/uv:0.7.2 /uv /bin/uv - -COPY --from=builder /app /app - -ENV PYTHONPATH=/app/tools/src:/app/applications/ensemble_manager/src:/app/libraries/python/src -ENV HOME=/home/worker - -RUN mkdir -p /home/worker && chown -R worker:worker /home/worker /app && \ - printf '%s\n' \ - '#!/usr/bin/env python3' \ - 'import os' \ - 'import sys' \ - '' \ - 'for process_id in os.listdir("/proc"):' \ - ' if not process_id.isdigit():' \ - ' continue' \ - ' command_path = f"/proc/{process_id}/cmdline"' \ - ' try:' \ - ' with open(command_path, "rb") as command_file:' \ - ' command = command_file.read().replace(b"\\x00", b" ").decode("utf-8", errors="ignore")' \ - ' except OSError:' \ - ' continue' \ - ' if "prefect worker start" in command and "training-pool" in command:' \ - ' sys.exit(0)' \ - '' \ - 'sys.exit(1)' \ - > /usr/local/bin/prefect_worker_healthcheck.py && \ - chmod +x /usr/local/bin/prefect_worker_healthcheck.py - -USER worker - -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD ["python", "/usr/local/bin/prefect_worker_healthcheck.py"] - -ENTRYPOINT ["uv", "run", "--package", "tools", "prefect", "worker", "start", "--pool", "training-pool", "--type", "process"] diff --git a/models/tide/Dockerfile b/models/tide/Dockerfile index 28d46086..fe979a98 100644 --- a/models/tide/Dockerfile +++ b/models/tide/Dockerfile @@ -14,7 +14,7 @@ COPY tools/ tools/ RUN uv sync --no-dev --package tide -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS runner +FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS model-runner ENV DEBIAN_FRONTEND=noninteractive ENV TZ=UTC diff --git a/models/tide/src/tide/deploy.py b/models/tide/src/tide/deploy.py index 56b68140..6581982f 100644 --- a/models/tide/src/tide/deploy.py +++ b/models/tide/src/tide/deploy.py @@ -19,8 +19,8 @@ def deploy_training_flow( ) training_pipeline.deploy( - name="daily-training", - work_pool_name="training-pool", + name="tide-trainer-remote", + work_pool_name="fund-models-remote", cron="0 22 * * 1-5", timezone="America/New_York", parameters={ diff --git a/models/tide/tests/test_deploy.py b/models/tide/tests/test_deploy.py index 3fe6db1c..2cedb751 100644 --- a/models/tide/tests/test_deploy.py +++ b/models/tide/tests/test_deploy.py @@ -15,8 +15,8 @@ def test_deploy_training_flow_calls_deploy(mock_pipeline: MagicMock) -> None: mock_deploy.assert_called_once() call_kwargs = mock_deploy.call_args.kwargs - assert call_kwargs["name"] == "daily-training" - assert call_kwargs["work_pool_name"] == "training-pool" + assert call_kwargs["name"] == "tide-trainer-remote" + assert call_kwargs["work_pool_name"] == "fund-models-remote" assert call_kwargs["cron"] == "0 22 * * 1-5" assert call_kwargs["timezone"] == "America/New_York" assert call_kwargs["parameters"]["lookback_days"] == LOOKBACK_DAYS diff --git a/prefect.yaml b/prefect.yaml index 71bd3b2a..5f936240 100644 --- a/prefect.yaml +++ b/prefect.yaml @@ -8,7 +8,7 @@ deployments: - name: tide-trainer-remote entrypoint: models/tide/src/tide/workflow.py:training_pipeline work_pool: - name: fund-work-pool-ecs + name: fund-models-remote parameters: {} build: null push: null @@ -16,7 +16,7 @@ deployments: - name: tide-trainer-local entrypoint: models/tide/src/tide/workflow.py:training_pipeline work_pool: - name: fund-work-pool-local + name: fund-models-local parameters: {} build: null push: null