From 4b9f51639473c4e66b1e6eb92060ba5811e6703d Mon Sep 17 00:00:00 2001 From: chrisaddy Date: Thu, 14 Aug 2025 09:24:23 -0400 Subject: [PATCH] docker swarm --- .claude/settings.local.json | 5 +- .flox/env/manifest.lock | 245 +++++++++- .flox/env/manifest.toml | 4 +- .gitignore | 5 +- .mise.toml | 10 +- Dockerfile.tests | 2 +- infrastructure/.claude/settings.local.json | 13 + infrastructure/.gitignore | 2 + infrastructure/.python-version | 1 + infrastructure/Pulumi.yaml | 14 +- infrastructure/__main__.py | 469 ++++++------------- infrastructure/api.py | 162 ------- infrastructure/cluster.py | 188 -------- infrastructure/grafana-dashboard.json | 505 --------------------- infrastructure/ingress.py | 114 ----- infrastructure/keys.py | 54 --- infrastructure/main.nu | 57 +++ infrastructure/monitors.py | 151 ------ infrastructure/prometheus.yml | 16 + infrastructure/pyproject.toml | 14 +- infrastructure/requirements.txt | 2 + infrastructure/services.py | 345 -------------- infrastructure/stack.yml | 138 ++++++ infrastructure/tags.py | 13 - infrastructure/upload_grafana_dashboard.nu | 186 -------- infrastructure/vpc.py | 134 ------ uv.lock | 6 +- 27 files changed, 649 insertions(+), 2206 deletions(-) create mode 100644 infrastructure/.claude/settings.local.json create mode 100644 infrastructure/.gitignore create mode 100644 infrastructure/.python-version delete mode 100644 infrastructure/api.py delete mode 100644 infrastructure/cluster.py delete mode 100644 infrastructure/grafana-dashboard.json delete mode 100644 infrastructure/ingress.py delete mode 100644 infrastructure/keys.py create mode 100644 infrastructure/main.nu delete mode 100644 infrastructure/monitors.py create mode 100644 infrastructure/prometheus.yml create mode 100644 infrastructure/requirements.txt delete mode 100644 infrastructure/services.py create mode 100644 infrastructure/stack.yml delete mode 100644 infrastructure/tags.py delete mode 100644 infrastructure/upload_grafana_dashboard.nu delete mode 100644 infrastructure/vpc.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 76c4521b6..478bf6ce9 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -16,7 +16,10 @@ "Bash(cat:*)", "Bash(mise run:*)", "Bash(gh issue close:*)", - "Bash(ln:*)" + "Bash(ln:*)", + "WebSearch", + "Bash(docker build:*)", + "Bash(uv lock:*)" ], "deny": [], "defaultMode": "acceptEdits" diff --git a/.flox/env/manifest.lock b/.flox/env/manifest.lock index 833f05d04..ed575a585 100644 --- a/.flox/env/manifest.lock +++ b/.flox/env/manifest.lock @@ -3,8 +3,11 @@ "manifest": { "version": 1, "install": { - "awscli": { - "pkg-path": "awscli" + "awscli2": { + "pkg-path": "awscli2" + }, + "cargo": { + "pkg-path": "cargo" }, "fselect": { "pkg-path": "fselect" @@ -51,7 +54,243 @@ }, "packages": [ { - "attr_path": "awscli", + "attr_path": "awscli2", + "broken": false, + "derivation": "/nix/store/v5gxglicwhyknfwswhngrbawzdwnapp5-awscli2-2.27.2.drv", + "description": "Unified tool to manage your AWS services", + "install_id": "awscli2", + "license": "Apache-2.0", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "awscli2-2.27.2", + "pname": "awscli2", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:22:53.969182Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "2.27.2", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": 
"/nix/store/sc270lyggbhz78lcxh9khqmggayqpp6i-awscli2-2.27.2-dist", + "out": "/nix/store/b5fa0fplfpiyx8zwc9y7d08a57lsk229-awscli2-2.27.2" + }, + "system": "aarch64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "awscli2", + "broken": false, + "derivation": "/nix/store/7ykn42yxjf7ncyl2z00700hq214xs4jn-awscli2-2.27.2.drv", + "description": "Unified tool to manage your AWS services", + "install_id": "awscli2", + "license": "Apache-2.0", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "awscli2-2.27.2", + "pname": "awscli2", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:41:01.084855Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "2.27.2", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/ss8xqzwn1l8qdbwlml439bx1ik13vxlk-awscli2-2.27.2-dist", + "out": "/nix/store/161s22pbddllc4mabgr5rvrpmhiffsp4-awscli2-2.27.2" + }, + "system": "aarch64-linux", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "awscli2", + "broken": false, + "derivation": "/nix/store/id7b0p6cwlkcsnfb767s820ig9xj0qnp-awscli2-2.27.2.drv", + "description": "Unified tool to manage your AWS services", + "install_id": "awscli2", + "license": "Apache-2.0", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "awscli2-2.27.2", + "pname": "awscli2", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:56:58.790359Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "2.27.2", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/q9rzj2ikpl9vn4sns3ydsy7i8sachnha-awscli2-2.27.2-dist", + "out": "/nix/store/l18j8wgidf145m0zw6ydg8blim44hrkq-awscli2-2.27.2" + }, + "system": "x86_64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "awscli2", + "broken": false, + "derivation": "/nix/store/j2ajkr3y4c0fg9lxl22z30hd4spb74d9-awscli2-2.27.2.drv", + "description": "Unified tool to manage your AWS services", + "install_id": "awscli2", + "license": "Apache-2.0", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "awscli2-2.27.2", + "pname": "awscli2", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T05:15:55.508841Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "2.27.2", + "outputs_to_install": [ + "out" + ], + "outputs": { + "dist": "/nix/store/r5cw3141myaf5lnzra9c0nc8prb2zv6y-awscli2-2.27.2-dist", + "out": "/nix/store/gx3hx0b3lbm11sishxl4k7a7ha4w405q-awscli2-2.27.2" + }, + "system": "x86_64-linux", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "cargo", + "broken": false, + "derivation": "/nix/store/6665a8zg1cpw2fjamds6261fmg1pa458-cargo-1.86.0.drv", + "description": "Downloads your Rust project's dependencies and builds your project", + "install_id": "cargo", + "license": "[ MIT, Apache-2.0 ]", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "cargo-1.86.0", + "pname": "cargo", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:22:54.464772Z", + "stabilities": [ + 
"unstable" + ], + "unfree": false, + "version": "1.86.0", + "outputs_to_install": [ + "out" + ], + "outputs": { + "out": "/nix/store/v9kavzgzbym8adb6p7nxgid13jz76411-cargo-1.86.0" + }, + "system": "aarch64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "cargo", + "broken": false, + "derivation": "/nix/store/jq0483mr9g99x5swkgvdf8w0hhj7yq73-cargo-1.86.0.drv", + "description": "Downloads your Rust project's dependencies and builds your project", + "install_id": "cargo", + "license": "[ MIT, Apache-2.0 ]", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "cargo-1.86.0", + "pname": "cargo", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:41:01.216082Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "1.86.0", + "outputs_to_install": [ + "out" + ], + "outputs": { + "out": "/nix/store/4y830m5qm2xfvdacbsy3q1afxqnawihc-cargo-1.86.0" + }, + "system": "aarch64-linux", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "cargo", + "broken": false, + "derivation": "/nix/store/za3xhiav2xk7z8g0synrxz3az6ai0i2f-cargo-1.86.0.drv", + "description": "Downloads your Rust project's dependencies and builds your project", + "install_id": "cargo", + "license": "[ MIT, Apache-2.0 ]", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "cargo-1.86.0", + "pname": "cargo", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T04:56:58.878560Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "1.86.0", + "outputs_to_install": [ + "out" + ], + "outputs": { + "out": "/nix/store/mdqqq9g5zrisd3ng2j56a9gijr5fawp5-cargo-1.86.0" + }, + "system": "x86_64-darwin", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "cargo", + "broken": false, + "derivation": "/nix/store/3f2cdkg4a3gjsw9mnb604ihqk1kq2dp2-cargo-1.86.0.drv", + "description": "Downloads your Rust project's dependencies and builds your project", + "install_id": "cargo", + "license": "[ MIT, Apache-2.0 ]", + "locked_url": "https://github.com/flox/nixpkgs?rev=96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "name": "cargo-1.86.0", + "pname": "cargo", + "rev": "96ec055edbe5ee227f28cdbc3f1ddf1df5965102", + "rev_count": 807377, + "rev_date": "2025-05-28T19:24:49Z", + "scrape_date": "2025-05-31T05:15:55.650404Z", + "stabilities": [ + "unstable" + ], + "unfree": false, + "version": "1.86.0", + "outputs_to_install": [ + "out" + ], + "outputs": { + "out": "/nix/store/npqlgsia03kfhv8m9mav6hfnbawpg0yg-cargo-1.86.0" + }, + "system": "x86_64-linux", + "group": "toplevel", + "priority": 5 + }, + { + "attr_path": "fselect", "broken": false, "derivation": "/nix/store/admmm4b6lm869qnc1d487slsr6a9hl29-awscli-1.37.21.drv", "description": "Unified tool to manage your AWS services", diff --git a/.flox/env/manifest.toml b/.flox/env/manifest.toml index 3c63b9eec..881c74c8a 100644 --- a/.flox/env/manifest.toml +++ b/.flox/env/manifest.toml @@ -11,7 +11,9 @@ yamllint.pkg-path = "yamllint" nushell.pkg-path = "nushell" fselect.pkg-path = "fselect" google-cloud-sdk.pkg-path = "google-cloud-sdk" -awscli.pkg-path = "awscli" +awscli2.pkg-path = "awscli2" +cargo.pkg-path = "cargo" + [hook] on-activate = ''' diff --git a/.gitignore b/.gitignore index ddc5257ee..a6c591539 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 
+15,4 @@ infrastructure/Pulumi.production.yaml .coverage* .coverage/ coverage.xml -infrastructure/kubeconfig.json -*.egg-info -wandb/ -*.csv +infrastructure/swarm.pem diff --git a/.mise.toml b/.mise.toml index 5d0e7a8e9..5f4fe9afa 100644 --- a/.mise.toml +++ b/.mise.toml @@ -45,19 +45,15 @@ depends = ["python:lint"] description = "Run code quality checks" run = """ nu linter.nu -yamllint -d "{extends: relaxed, rules: {line-length: {max: 110}}}" . +yamllint -d "{extends: relaxed, rules: {line-length: {max: 110}}}" . """ [tasks."infrastructure:up"] description = "Launch cloud infrastructure" -# temporarily comment out ping test due to non-exposed endpoints run = """ -set -e cd infrastructure -uv run pulumi up --yes --stack pocketsizefund/pocketsizefund/production -pulumi stack output KUBECONFIG > kubeconfig.json -export KUBECONFIG=<(pulumi stack output KUBECONFIG) -# nu ping.nu +source main.nu +infrastructure up """ [tasks."infrastructure:down"] diff --git a/Dockerfile.tests b/Dockerfile.tests index baefe4946..57263e176 100644 --- a/Dockerfile.tests +++ b/Dockerfile.tests @@ -1,4 +1,4 @@ -FROM python:3.13 +FROM python:3.12.10 COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv diff --git a/infrastructure/.claude/settings.local.json b/infrastructure/.claude/settings.local.json new file mode 100644 index 000000000..b9640655b --- /dev/null +++ b/infrastructure/.claude/settings.local.json @@ -0,0 +1,13 @@ +{ + "permissions": { + "allow": [ + "Bash(pulumi stack output:*)", + "Bash(nu:*)", + "Bash(docker stack ps:*)" + ], + "defaultMode": "acceptEdits", + "additionalDirectories": [ + "/Users/chrisaddy/Library/Application Support/nushell" + ] + } +} \ No newline at end of file diff --git a/infrastructure/.gitignore b/infrastructure/.gitignore new file mode 100644 index 000000000..a3807e5bd --- /dev/null +++ b/infrastructure/.gitignore @@ -0,0 +1,2 @@ +*.pyc +venv/ diff --git a/infrastructure/.python-version b/infrastructure/.python-version new file mode 100644 index 000000000..e4fba2183 --- /dev/null +++ b/infrastructure/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/infrastructure/Pulumi.yaml b/infrastructure/Pulumi.yaml index d5baba95c..ad1909c28 100644 --- a/infrastructure/Pulumi.yaml +++ b/infrastructure/Pulumi.yaml @@ -1,4 +1,10 @@ ---- -name: pocketsizefund -runtime: python -description: Pocket Size Fund infrastructure +name: infrastructure +description: pocketsizefund infra +runtime: + name: python + options: + toolchain: uv +config: + pulumi:tags: + value: + pulumi:template: aws-python diff --git a/infrastructure/__main__.py b/infrastructure/__main__.py index 39b28f053..cf002280f 100644 --- a/infrastructure/__main__.py +++ b/infrastructure/__main__.py @@ -1,340 +1,163 @@ -import json -import tomllib -from pathlib import Path - import pulumi import pulumi_aws as aws -from api import ( - create_api_access_iam_role, - create_api_gateway, - create_knative_service_api_gateway_integrations, - create_virtual_private_cloud_link, -) -from cluster import ( - create_kubernetes_cluster, - create_kubernetes_cluster_role, - create_kubernetes_node_role, - create_kubernetes_provider, - update_kubernetes_cluster_access, -) -from images import build_image -from ingress import ( - create_application_load_balancer, - create_application_load_balancer_listener, - create_application_load_balancer_security_group, - create_application_load_balancer_target_group, -) -from keys import create_duckdb_user_access_key -from pulumi.config import Config -from services import ( - create_knative_broker, - 
create_knative_eventing_core, - create_knative_schedule, - create_knative_service, - create_knative_serving_core, - create_knative_trigger, - create_service_environment_variables, -) -from vpc import ( - create_elastic_ip, - create_internet_gateway, - create_nat_gateway, - create_route_table, - create_subnet, - create_virtual_private_cloud, -) - -configuration = Config() - -virtual_private_cloud = create_virtual_private_cloud() - -internet_gateway = create_internet_gateway( - virtual_private_cloud=virtual_private_cloud, -) - -public_route_table = create_route_table( - virtual_private_cloud=virtual_private_cloud, - internet_gateway=internet_gateway, -) - -aws_region = configuration.get("aws:region") or "us-east-1" - -availability_zones = aws.get_availability_zones( - state="available", - filters=[ - { - "name": "region-name", - "values": [aws_region], - } - ], -).names[:3] - -public_subnets = [ - create_subnet( - virtual_private_cloud=virtual_private_cloud, - route_table=public_route_table, - availability_zone=availability_zones[i], - subnet_number=i + 1, # 1-3 - visibility="public", +import pulumi_tls as tls +from pulumi_command import remote + +az = pulumi.Config().get("az") or "us-east-1a" +blueprint_id = pulumi.Config().get("blueprintId") or "ubuntu_24_04" +bundle_mgr = pulumi.Config().get("bundleIdMgr") or "medium_2_0" +bundle_wkr = pulumi.Config().get("bundleIdWkr") or "small_2_0" + +ssh_key = tls.PrivateKey("swarm-key", algorithm="RSA", rsa_bits=4096) +ls_key = aws.lightsail.KeyPair( + "swarm-ls-key", + name="swarm-ls-key", + public_key=ssh_key.public_key_openssh, +) + +cloud_init = """#cloud-config +package_update: true +package_upgrade: true +write_files: + - path: /usr/local/bin/install-docker.sh + permissions: "0755" + content: | + #!/usr/bin/env bash + set -euo pipefail + retry() { for i in {1..10}; do "$@" && break || { sleep 3; echo "retry $i"; }; done; } + + retry apt-get update + retry apt-get install -y ca-certificates curl gnupg + + install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + chmod a+r /etc/apt/keyrings/docker.gpg + + . 
/etc/os-release + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ${VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list + + retry apt-get update + retry apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + + usermod -aG docker ubuntu || true + systemctl enable --now docker + +runcmd: + - /usr/local/bin/install-docker.sh + - bash -lc 'for i in {1..60}; do [ -x /usr/bin/docker ] && systemctl is-active --quiet docker && exit 0 || sleep 2; done; exit 1' +""" + + +def mk_instance(name: str, bundle_id: str): + inst = aws.lightsail.Instance( + name, + name=name, + availability_zone=az, + blueprint_id=blueprint_id, + bundle_id=bundle_id, + key_pair_name=ls_key.name, + user_data=cloud_init, + tags={"role": name}, ) - for i in range(len(availability_zones)) -] - -elastic_ip = create_elastic_ip(virtual_private_cloud=virtual_private_cloud) -nat_gateway = create_nat_gateway( - elastic_ip=elastic_ip, - public_subnet=public_subnets[0], # one NAT instance for cost efficiency -) - -private_route_table = create_route_table( - virtual_private_cloud=virtual_private_cloud, - nat_gateway=nat_gateway, -) - - -private_subnets = [ - create_subnet( - virtual_private_cloud=virtual_private_cloud, - route_table=private_route_table, - availability_zone=availability_zones[i], - subnet_number=i + 4, # 4-6 - visibility="private", + aws.lightsail.InstancePublicPorts( + f"{name}-ports", + instance_name=inst.name, + port_infos=[ + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=22, to_port=22, protocol="tcp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=2377, to_port=2377, protocol="tcp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=7946, to_port=7946, protocol="tcp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=7946, to_port=7946, protocol="udp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=4789, to_port=4789, protocol="udp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=80, to_port=80, protocol="tcp", cidrs=["0.0.0.0/0"] + ), + aws.lightsail.InstancePublicPortsPortInfoArgs( + from_port=443, to_port=443, protocol="tcp", cidrs=["0.0.0.0/0"] + ), + ], ) - for i in range(len(availability_zones)) -] - -kubernetes_cluster_role = create_kubernetes_cluster_role() - -kubernetes_node_role = create_kubernetes_node_role() - -kubernetes_cluster = create_kubernetes_cluster( - virtual_private_cloud=virtual_private_cloud, - private_subnets=private_subnets, - kubernetes_cluster_role=kubernetes_cluster_role, - kubernetes_node_role=kubernetes_node_role, -) - -kubernetes_provider = create_kubernetes_provider(kubernetes_cluster=kubernetes_cluster) - -cluster_access_configuration = update_kubernetes_cluster_access( - kubernetes_provider=kubernetes_provider, - kubernetes_cluster_role=kubernetes_cluster_role, - kubernetes_node_role=kubernetes_node_role, - pulumi_user_arn=configuration.require_secret("AWS_IAM_PULUMI_USER_ARN"), - root_user_arn=configuration.require_secret("AWS_IAM_ROOT_USER_ARN"), -) - -knative_serving_core = create_knative_serving_core( - kubernetes_provider=kubernetes_provider, -) - -knative_eventing_core = create_knative_eventing_core( - kubernetes_provider=kubernetes_provider, -) - -knative_broker = create_knative_broker( - kubernetes_provider=kubernetes_provider, - 
knative_eventing_core=knative_eventing_core, -) - -duckdb_user_access_key = create_duckdb_user_access_key( - data_bucket_name=configuration.require_secret("AWS_S3_DATA_BUCKET_NAME"), -) - -service_environment_variables = create_service_environment_variables( - inputs=[ - ("ALPACA_API_KEY", configuration.require_secret("ALPACA_API_KEY")), - ("ALPACA_API_SECRET", configuration.require_secret("ALPACA_API_SECRET")), - ( - "AWS_S3_DATA_BUCKET_NAME", - configuration.require_secret("AWS_S3_DATA_BUCKET_NAME"), - ), - ("POLYGON_API_KEY", configuration.require_secret("POLYGON_API_KEY")), - ("AWS_IAM_DUCKDB_USER_ACCESS_KEY_ID", duckdb_user_access_key.id), - ("AWS_IAM_DUCKDB_USER_ACCESS_KEY_SECRET", duckdb_user_access_key.secret), - ("AWS_REGION", aws_region), - ], -) - -application_load_balancer_security_group = ( - create_application_load_balancer_security_group( - virtual_private_cloud=virtual_private_cloud, + ip = aws.lightsail.StaticIp(f"{name}-ip", name=f"{name}-ip") + aws.lightsail.StaticIpAttachment( + f"{name}-ip-attach", + instance_name=inst.name, + static_ip_name=ip.name, ) -) -application_load_balancer = create_application_load_balancer( - application_load_balancer_security_group=application_load_balancer_security_group, - public_subnets=public_subnets, -) + return inst, ip -virtual_private_cloud_link = create_virtual_private_cloud_link( - application_load_balancer_security_group=application_load_balancer_security_group, - public_subnets=public_subnets, -) -api_gateway = create_api_gateway( - application_load_balancer_security_group=application_load_balancer_security_group, -) +mgr_inst, mgr_ip = mk_instance("swarm-mgr-1", bundle_mgr) +w1_inst, w1_ip = mk_instance("swarm-wkr-1", bundle_wkr) +w2_inst, w2_ip = mk_instance("swarm-wkr-2", bundle_wkr) -target_group = create_application_load_balancer_target_group( - virtual_private_cloud=virtual_private_cloud, - application_load_balancer=application_load_balancer, -) -listener = create_application_load_balancer_listener( - application_load_balancer=application_load_balancer, - application_load_balancer_target_group=target_group, -) - -try: - with Path("pyproject.toml").open("rb") as f: - project_data = tomllib.load(f) - version = project_data.get("project", {}).get("version") - -except (FileNotFoundError, tomllib.TOMLDecodeError, ValueError) as e: - message = f"Failed to read version from infrastructure pyproject.toml: {e}" - raise RuntimeError(message) from e - -username = configuration.require_secret("DOCKERHUB_USERNAME") -password = configuration.require_secret("DOCKERHUB_PASSWORD") - -datamanager_image = build_image( - service_name="datamanager", - service_version=version, - dockerhub_username=username, - dockerhub_password=password, -) - -datamanager_knative_service = create_knative_service( - kubernetes_provider=kubernetes_provider, - service_name="datamanager", - image=datamanager_image, - application_load_balancer_service_target_group=target_group, - knative_serving_core=knative_serving_core, - environment_variables=service_environment_variables, -) - -endpoint_information = [ - {"path": "/health", "method": "GET"}, - {"path": "/equity-bars", "method": "GET"}, - {"path": "/equity-bars/fetch", "method": "POST"}, - {"path": "/equity-bars", "method": "DELETE"}, -] - -create_knative_service_api_gateway_integrations( - service_name="datamanager", - endpoint_information=endpoint_information, - api_gateway=api_gateway, - application_load_balancer_listener=listener, - vpc_link=virtual_private_cloud_link, -) - -api_access_iam_role = 
create_api_access_iam_role( - api_gateway=api_gateway, - pulumi_user_arn=configuration.require_secret("AWS_IAM_PULUMI_USER_ARN"), - endpoint_information=endpoint_information, -) - -predictionengine_image = build_image( - service_name="predictionengine", - service_version=version, - dockerhub_username=username, - dockerhub_password=password, -) - -predictionengine_knative_service = create_knative_service( - kubernetes_provider=kubernetes_provider, - service_name="predictionengine", - image=predictionengine_image, - application_load_balancer_service_target_group=target_group, - knative_serving_core=knative_serving_core, - environment_variables=service_environment_variables, -) - -positionmanager_image = build_image( - service_name="positionmanager", - service_version=version, - dockerhub_username=username, - dockerhub_password=password, -) - -positionmanager_knative_service = create_knative_service( - kubernetes_provider=kubernetes_provider, - service_name="positionmanager", - image=positionmanager_image, - application_load_balancer_service_target_group=target_group, - knative_serving_core=knative_serving_core, - environment_variables=service_environment_variables, -) - -open_positions_from_predictions_trigger = create_knative_trigger( - kubernetes_provider=kubernetes_provider, - source_service_name="predictionengine", - source_attribute_type="application.predictionengine.predictions.created", - target_service_name="positionmanager", - knative_eventing_core=knative_eventing_core, -) - -midnight_data_fetch_schedule = create_knative_schedule( - kubernetes_provider=kubernetes_provider, - target_service_name="datamanager", - target_path="/equity-bars/fetch", - cron_schedule="0 0 * * *", - knative_eventing_core=knative_eventing_core, -) - -monday_morning_open_positions_schedule = create_knative_schedule( - kubernetes_provider=kubernetes_provider, - target_service_name="predictionengine", - target_path="/predictions/create", - cron_schedule="0 10 * * 1", - knative_eventing_core=knative_eventing_core, -) - -friday_evening_close_positions_schedule = create_knative_schedule( - kubernetes_provider=kubernetes_provider, - target_service_name="positionmanager", - target_path="/positions/close", - cron_schedule="0 13 * * 5", - knative_eventing_core=knative_eventing_core, -) - - -pulumi.export("DATAMANAGER_SERVICE_IMAGE", datamanager_image.ref) - -pulumi.export( - "PREDICTIONENGINE_SERVICE_IMAGE", - predictionengine_image.ref, -) +def conn(host_output: pulumi.Output[str]) -> remote.ConnectionArgs: + return remote.ConnectionArgs( + host=host_output, + user="ubuntu", + private_key=ssh_key.private_key_pem, + ) -pulumi.export( - "POSITIONMANAGER_SERVICE_IMAGE", - positionmanager_image.ref, -) -pulumi.export( - "AWS_EKS_CLUSTER_NAME", - kubernetes_cluster.eks_cluster.name.apply(lambda cluster_name: f"{cluster_name}"), -) - -pulumi.export( - "AWS_EKS_KUBECONFIG", - kubernetes_cluster.kubeconfig.apply(json.dumps), -) +init_manager = remote.Command( + "init-manager", + connection=conn(mgr_ip.ip_address), + create=" && ".join( + [ + "bash -lc 'for i in {1..120}; do sudo docker info >/dev/null 2>&1 && break || sleep 3; done'", + "bash -lc 'PUBIP=$(curl -s http://169.254.169.254/latest/meta-data/public-ipv4); echo Using-Public-IP:$PUBIP'", + 'bash -lc \'STATE=$(sudo docker info --format "{{.Swarm.LocalNodeState}}") || true; ' + 'CTRL=$(sudo docker info --format "{{.Swarm.ControlAvailable}}") || true; ' + '[ "$STATE" = active -a "$CTRL" = true ] || ' + "(sudo docker swarm leave --force || true; " + ' sudo docker swarm 
init --advertise-addr "$PUBIP" --listen-addr "0.0.0.0:2377")\'', + "bash -lc 'for i in {1..30}; do sudo ss -ltn | awk \"\\$4 ~ /:2377$/\" && break || sleep 2; done'", + "bash -lc 'sudo docker swarm join-token -q worker | sudo tee /home/ubuntu/worker.token >/dev/null'", + "bash -lc 'sudo docker swarm join-token -q manager | sudo tee /home/ubuntu/manager.token >/dev/null'", + ] + ), +) + + +get_worker_token = remote.Command( + "get-worker-token", + connection=conn(mgr_ip.ip_address), + create="bash -lc 'sudo docker swarm join-token -q worker'", + opts=pulumi.ResourceOptions(depends_on=[init_manager]), +) +worker_token = pulumi.Output.secret(get_worker_token.stdout).apply(lambda s: s.strip()) + + +def join_worker(res_name: str, worker_ip: pulumi.Output[str]) -> remote.Command: + create_cmd = pulumi.Output.all(mgr_ip.ip_address, worker_token).apply( + lambda vals: ( + "bash -lc 'for i in {1..120}; do sudo docker info >/dev/null 2>&1 && break || sleep 3; done && " + f"sudo docker swarm join --token {vals[1]} {vals[0]}:2377'" + ) + ) + return remote.Command( + res_name, + connection=conn(worker_ip), + create=create_cmd, + opts=pulumi.ResourceOptions(depends_on=[get_worker_token]), + ) -pulumi.export( - "AWS_VIRTUAL_PRIVATE_CLOUD_ID", - virtual_private_cloud.id.apply(lambda vpc_id: f"{vpc_id}"), -) -pulumi.export( - "AWS_API_GATEWAY_ACCESS_IAM_ROLE_ARN", - api_access_iam_role.arn.apply(lambda arn: f"{arn}"), -) +join_w1 = join_worker("join-w1", w1_ip.ip_address) +join_w2 = join_worker("join-w2", w2_ip.ip_address) -pulumi.export( - "AWS_API_GATEWAY_ENDPOINT_URL", - api_gateway.api_endpoint.apply(lambda endpoint: f"{endpoint}/production"), -) +pulumi.export("managerIp", mgr_ip.ip_address) +pulumi.export("workerIps", pulumi.Output.all(w1_ip.ip_address, w2_ip.ip_address)) +pulumi.export("sshPrivateKeyPem", pulumi.Output.secret(ssh_key.private_key_pem)) diff --git a/infrastructure/api.py b/infrastructure/api.py deleted file mode 100644 index 7e1b62d20..000000000 --- a/infrastructure/api.py +++ /dev/null @@ -1,162 +0,0 @@ -import json - -import pulumi -import pulumi_aws as aws -from tags import pulumi_tags - - -def create_virtual_private_cloud_link( - application_load_balancer_security_group: aws.ec2.SecurityGroup, - public_subnets: list[aws.ec2.Subnet], -) -> aws.apigatewayv2.VpcLink: - return aws.apigatewayv2.VpcLink( - resource_name="pocketsizefund-api-gateway-vpc-link", - name="pocketsizefund-vpc-link", - security_group_ids=[application_load_balancer_security_group.id], - subnet_ids=[subnet.id for subnet in public_subnets], - opts=pulumi.ResourceOptions( - depends_on=[ - application_load_balancer_security_group, - *public_subnets, - ] - ), - tags=pulumi_tags, - ) - - -def create_api_gateway( - application_load_balancer_security_group: aws.ec2.SecurityGroup, -) -> aws.apigatewayv2.Api: - api_gateway = aws.apigatewayv2.Api( - resource_name="pocketsizefund-api-gateway", - protocol_type="HTTP", - route_selection_expression="$request.method $request.path", - opts=pulumi.ResourceOptions( - depends_on=[application_load_balancer_security_group], - ), - tags=pulumi_tags, - ) - - aws.apigatewayv2.Stage( - resource_name="pocketsizefund-api-gateway-stage", - api_id=api_gateway.id, - name="production", - auto_deploy=True, - opts=pulumi.ResourceOptions( - depends_on=[ - api_gateway, - application_load_balancer_security_group, - ], - ), - tags=pulumi_tags, - ) - - return api_gateway - - -def create_knative_service_api_gateway_integrations( - service_name: str, - endpoint_information: list[dict[str, str]], - 
api_gateway: aws.apigatewayv2.Api, - application_load_balancer_listener: aws.lb.Listener, - vpc_link: aws.apigatewayv2.VpcLink, -) -> None: - for endpoint in endpoint_information: - endpoint_path = endpoint["path"].strip("/").replace("/", "-") - endpoint_path = "-".join(filter(None, endpoint_path.split("-"))) - endpoint_method_lower = endpoint["method"].lower() - endpoint_method_upper = endpoint["method"].upper() - - integration = aws.apigatewayv2.Integration( - resource_name=f"pocketsizefund-{service_name}-{endpoint_path}-{endpoint_method_lower}-api-gateway-integration", - api_id=api_gateway.id, - integration_type="HTTP_PROXY", - integration_uri=application_load_balancer_listener.arn, - integration_method=endpoint_method_upper, - connection_type="VPC_LINK", - connection_id=vpc_link.id, - opts=pulumi.ResourceOptions( - depends_on=[ - api_gateway, - application_load_balancer_listener, - vpc_link, - ], - ), - ) - - aws.apigatewayv2.Route( - resource_name=f"pocketsizefund-{service_name}-{endpoint_path}-{endpoint_method_lower}-api-gateway-route", - api_id=api_gateway.id, - route_key=f"{endpoint_method_upper} /{endpoint_path}", - target=integration.id.apply( - lambda integration_id: f"integrations/{integration_id}" - ), - opts=pulumi.ResourceOptions( - depends_on=[ - api_gateway, - application_load_balancer_listener, - vpc_link, - integration, - ], - ), - ) - - -def create_api_access_iam_role( - api_gateway: aws.apigatewayv2.Api, - pulumi_user_arn: pulumi.Output[str], - endpoint_information: list[dict[str, str]], -) -> aws.iam.Role: - api_access_iam_role = aws.iam.Role( - resource_name="pocketsizefund-api-access-role", - name="pocketsizefund-api-access-role", - description="Pocket Size Fund API access role", - assume_role_policy=pulumi_user_arn.apply( - lambda arn: json.dumps( - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": {"AWS": f"{arn}"}, - "Action": "sts:AssumeRole", - } - ], - } - ) - ), - opts=pulumi.ResourceOptions( - depends_on=[api_gateway], - ), - tags=pulumi_tags, - ) - - aws.iam.RolePolicy( - resource_name="pocketsizefund-api-access-role-policy", - role=api_access_iam_role.id, - policy=api_gateway.arn.apply( - lambda arn: json.dumps( - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": "execute-api:Invoke", - "Resource": [ - f"{arn}/*/{t['method'].upper()}/{t['path'].lstrip('/')}" - for t in endpoint_information - ], - } - ], - } - ) - ), - opts=pulumi.ResourceOptions( - depends_on=[ - api_gateway, - api_access_iam_role, - ], - ), - ) - - return api_access_iam_role diff --git a/infrastructure/cluster.py b/infrastructure/cluster.py deleted file mode 100644 index 1aaad965f..000000000 --- a/infrastructure/cluster.py +++ /dev/null @@ -1,188 +0,0 @@ -import json - -import pulumi -import pulumi_aws as aws -import pulumi_eks as eks -import pulumi_kubernetes as k8s -from tags import pulumi_tags - - -def create_kubernetes_cluster( - virtual_private_cloud: aws.ec2.Vpc, - private_subnets: list[aws.ec2.Subnet], - kubernetes_cluster_role: aws.iam.Role, - kubernetes_node_role: aws.iam.Role, -) -> eks.Cluster: - return eks.Cluster( - resource_name="pocketsizefund-kubernetes-cluster", - desired_capacity=2, - min_size=1, - max_size=3, - instance_type="t3.small", - service_role=kubernetes_cluster_role, - instance_role=kubernetes_node_role, - vpc_id=virtual_private_cloud.id, - subnet_ids=[subnet.id for subnet in private_subnets], - opts=pulumi.ResourceOptions( - depends_on=[ - virtual_private_cloud, - *private_subnets, - 
kubernetes_cluster_role, - kubernetes_node_role, - ], - ), - tags=pulumi_tags, - ) - - -def create_kubernetes_provider(kubernetes_cluster: eks.Cluster) -> k8s.Provider: - return k8s.Provider( - resource_name="pocketsizefund-kubernetes-provider", - kubeconfig=kubernetes_cluster.kubeconfig.apply(json.dumps), - opts=pulumi.ResourceOptions( - replace_on_changes=["kubeconfig"], - custom_timeouts=pulumi.CustomTimeouts( - create="10m", - update="10m", - delete="10m", - ), - depends_on=[kubernetes_cluster], - ), - ) - - -def update_kubernetes_cluster_access( - kubernetes_provider: k8s.Provider, - kubernetes_cluster_role: aws.iam.Role, - kubernetes_node_role: aws.iam.Role, - pulumi_user_arn: pulumi.Output[str], - root_user_arn: pulumi.Output[str], -) -> k8s.core.v1.ConfigMap: - map_roles = pulumi.Output.json_dumps( - [ - { - "rolearn": kubernetes_node_role.arn, - "username": "system:node:{{EC2PrivateDNSName}}", - "groups": [ - "system:bootstrappers", - "system:nodes", - ], - }, - { - "rolearn": kubernetes_cluster_role.arn, - "username": "system:master", - "groups": [ - "system:masters", - ], - }, - ] - ) - - map_users = pulumi.Output.json_dumps( - [ - { - "userarn": pulumi_user_arn, - "username": "pulumi-user", - "groups": ["system:masters"], - }, - { - "userarn": root_user_arn, - "username": "root-user", - "groups": ["system:masters"], - }, - ] - ) - - return k8s.core.v1.ConfigMap( - resource_name="pocketsizefund-aws-auth", - metadata=k8s.meta.v1.ObjectMetaArgs( - name="aws-auth", # name required by EKS - namespace="kube-system", - annotations={ - "pulumi.com/patchForce": "true", - }, - ), - data={ - "mapRoles": map_roles, - "mapUsers": map_users, - }, - opts=pulumi.ResourceOptions( - replace_on_changes=["*"], - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - kubernetes_cluster_role, - kubernetes_node_role, - ], - ), - ) - - -def create_kubernetes_cluster_role() -> aws.iam.Role: - assume_role_policy = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": {"Service": ["eks.amazonaws.com"]}, - "Action": "sts:AssumeRole", - } - ], - } - - cluster_role = aws.iam.Role( - resource_name="pocketsizefund-cluster-role", - description="Role for EKS cluster to manage resources", - name="pocketsizefund-cluster-role", - assume_role_policy=json.dumps(assume_role_policy), - tags=pulumi_tags, - ) - - aws.iam.RolePolicyAttachment( - resource_name="pocketsizefund-cluster-policy", - role=cluster_role.name, - policy_arn="arn:aws:iam::aws:policy/AmazonEKSClusterPolicy", - ) - - return cluster_role - - -def create_kubernetes_node_role() -> aws.iam.Role: - assume_role_policy = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": {"Service": ["ec2.amazonaws.com"]}, - "Action": "sts:AssumeRole", - } - ], - } - - node_role = aws.iam.Role( - resource_name="pocketsizefund-node-role", - description="Role for EKS worker nodes to manage resources", - name="pocketsizefund-node-role", - assume_role_policy=json.dumps(assume_role_policy), - tags=pulumi_tags, - ) - - aws.iam.RolePolicyAttachment( - resource_name="pocketsizefund-node-policy", - role=node_role.name, - policy_arn="arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy", - ) - - aws.iam.RolePolicyAttachment( - resource_name="pocketsizefund-node-role-ecr-policy", - role=node_role.name, - policy_arn="arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly", - ) - - aws.iam.RolePolicyAttachment( - resource_name="pocketsizefund-node-role-cni-policy", - role=node_role.name, - 
policy_arn="arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy", - ) - - return node_role diff --git a/infrastructure/grafana-dashboard.json b/infrastructure/grafana-dashboard.json deleted file mode 100644 index e4dde270a..000000000 --- a/infrastructure/grafana-dashboard.json +++ /dev/null @@ -1,505 +0,0 @@ -{ - "dashboard": { - "id": null, - "title": "Pocket Size Fund", - "description": "Comprehensive monitoring dashboard for Pocket Size Fund trading services, infrastructure, and performance metrics", - "tags": [ - "open-source", - "quantitative", - "hedge-fund" - ], - "style": "dark", - "timezone": "America/New_York", - "editable": true, - "graphTooltip": 1, - "time": { - "from": "now-1h", - "to": "now" - }, - "refresh": "30s", - "panels": [ - { - "id": 1, - "title": "Infrastructure Overview", - "type": "stat", - "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "up{job=\"kubernetes-nodes\"}", - "legendFormat": "Cluster Nodes", - "refId": "A" - }, - { - "expr": "sum(kube_pod_status_ready{condition=\"true\", namespace=\"default\"})", - "legendFormat": "Ready Pods", - "refId": "B" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "short", - "min": 0 - } - } - }, - { - "id": 2, - "title": "Service Health Status", - "type": "table", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "up{job=\"kubernetes-pods\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"}", - "legendFormat": "{{pod}}", - "refId": "A", - "format": "table", - "instant": true - } - ], - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "__name__": true, - "job": true, - "instance": true - }, - "renameByName": { - "pod": "Service", - "Value": "Status" - } - } - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "displayMode": "color-background" - }, - "mappings": [ - { - "options": { - "0": { - "text": "Down", - "color": "red" - } - }, - "type": "value" - }, - { - "options": { - "1": { - "text": "Up", - "color": "green" - } - }, - "type": "value" - } - ] - } - } - }, - { - "id": 3, - "title": "API Request Rate", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=\"kubernetes-pods\", pod=~\".*datamanager.*\"}[5m])) by (method, path)", - "legendFormat": "{{method}} {{path}}", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "reqps", - "min": 0 - } - } - }, - { - "id": 4, - "title": "Data Manager Performance", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "targets": [ - { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"kubernetes-pods\", pod=~\".*datamanager.*\"}[5m])) by (le))", - "legendFormat": "95th percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"kubernetes-pods\", pod=~\".*datamanager.*\"}[5m])) by (le))", - "legendFormat": "50th percentile", - "refId": "B" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "s", - "min": 0 - } - } - }, - { - "id": 5, - "title": "Prediction Engine Machine Learning Metrics", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 12 - }, - "targets": [ - { - "expr": "predictions_generated_total{job=\"kubernetes-pods\", 
pod=~\".*predictionengine.*\"}", - "legendFormat": "Total Predictions", - "refId": "A" - }, - { - "expr": "rate(prediction_processing_duration_seconds_sum{job=\"kubernetes-pods\", pod=~\".*predictionengine.*\"}[5m])", - "legendFormat": "Processing Time", - "refId": "B" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "short", - "min": 0 - } - } - }, - { - "id": 6, - "title": "Position Manager Trading Activity", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "targets": [ - { - "expr": "positions_opened_total{job=\"kubernetes-pods\", pod=~\".*positionmanager.*\"}", - "legendFormat": "Positions Opened", - "refId": "A" - }, - { - "expr": "positions_closed_total{job=\"kubernetes-pods\", pod=~\".*positionmanager.*\"}", - "legendFormat": "Positions Closed", - "refId": "B" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "short", - "min": 0 - } - } - }, - { - "id": 7, - "title": "Resource Utilization", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 20 - }, - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"default\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"}[5m])) by (pod)", - "legendFormat": "{{pod}} CPU", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "percentunit", - "min": 0, - "max": 1 - } - } - }, - { - "id": 8, - "title": "Memory Usage", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 20 - }, - "targets": [ - { - "expr": "sum(container_memory_working_set_bytes{namespace=\"default\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"}) by (pod)", - "legendFormat": "{{pod}} Memory", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "bytes", - "min": 0 - } - } - }, - { - "id": 9, - "title": "Trading Schedule Events", - "type": "logs", - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 28 - }, - "targets": [ - { - "expr": "{namespace=\"default\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"} |~ \"schedule|trigger|cron\"", - "refId": "A" - } - ], - "options": { - "showTime": true, - "showLabels": false, - "showCommonLabels": true, - "wrapLogMessage": true, - "enableLogDetails": true - } - }, - { - "id": 10, - "title": "Error Rate", - "type": "stat", - "gridPos": { - "h": 4, - "w": 8, - "x": 0, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=\"kubernetes-pods\", status=~\"5..\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"}[5m]))", - "legendFormat": "5xx Errors/sec", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "reqps", - "min": 0, - "thresholds": { - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "yellow", - "value": 0.1 - }, - { - "color": "red", - "value": 1 - } - ] - } - } - } - }, - { - "id": 11, - "title": "Knative Scale Events", - "type": "stat", - "gridPos": { - "h": 4, - "w": 8, - "x": 8, - "y": 34 - }, - "targets": [ - { - "expr": "sum(increase(knative_serving_revision_ready{namespace=\"default\"}[1h]))", - "legendFormat": "Scale Events", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "unit": "short", - "min": 0 - } - } - }, - { - "id": 12, - "title": "AWS Load Balancer 
Health", - "type": "stat", - "gridPos": { - "h": 4, - "w": 8, - "x": 16, - "y": 34 - }, - "targets": [ - { - "expr": "aws_applicationelb_target_response_time_average{load_balancer=~\".*pocketsizefund.*\"}", - "legendFormat": "ALB Response Time", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "ms", - "min": 0, - "thresholds": { - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "yellow", - "value": 100 - }, - { - "color": "red", - "value": 500 - } - ] - } - } - } - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - }, - { - "datasource": "prometheus", - "enable": true, - "expr": "changes(up{job=\"kubernetes-pods\", pod=~\".*datamanager.*|.*predictionengine.*|.*positionmanager.*\"}[5m]) > 0", - "iconColor": "red", - "name": "Service Restarts", - "titleFormat": "{{pod}} restarted", - "tagKeys": "pod,namespace" - } - ] - }, - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, - "datasource": "prometheus", - "definition": "label_values(up{job=\"kubernetes-pods\", namespace=\"default\"}, pod)", - "hide": 0, - "includeAll": true, - "multi": true, - "name": "pod", - "options": [], - "query": { - "query": "label_values(up{job=\"kubernetes-pods\", namespace=\"default\"}, pod)", - "refId": "prometheus-pod-Variable-Query" - }, - "refresh": 1, - "regex": ".*(datamanager|predictionengine|positionmanager).*", - "skipUrlSync": false, - "sort": 1, - "type": "query" - } - ] - } - }, - "meta": { - "type": "db", - "canSave": true, - "canEdit": true, - "canAdmin": true, - "canStar": true, - "slug": "pocketsizefund-infrastructure", - "url": "/d/pocketsizefund/pocketsizefund-infrastructure", - "expires": "0001-01-01T00:00:00Z", - "created": "2025-01-19T10:00:00Z", - "updated": "2025-01-19T10:00:00Z", - "updatedBy": "admin", - "createdBy": "admin", - "version": 1, - "hasAcl": false, - "isFolder": false, - "folderId": 0, - "folderTitle": "General", - "folderUrl": "", - "provisioned": false, - "provisionedExternalId": "" - } -} \ No newline at end of file diff --git a/infrastructure/ingress.py b/infrastructure/ingress.py deleted file mode 100644 index 199d17e8b..000000000 --- a/infrastructure/ingress.py +++ /dev/null @@ -1,114 +0,0 @@ -import pulumi -import pulumi_aws as aws -from tags import pulumi_tags - - -def create_application_load_balancer_security_group( - virtual_private_cloud: aws.ec2.Vpc, -) -> aws.ec2.SecurityGroup: - return aws.ec2.SecurityGroup( - resource_name="pocketsizefund-alb-security-group", - vpc_id=virtual_private_cloud.id, - ingress=[ - aws.ec2.SecurityGroupIngressArgs( - protocol="tcp", - from_port=80, - to_port=80, - cidr_blocks=["0.0.0.0/0"], - ), - aws.ec2.SecurityGroupIngressArgs( - protocol="tcp", - from_port=443, - to_port=443, - cidr_blocks=["0.0.0.0/0"], - ), - ], - egress=[ - aws.ec2.SecurityGroupEgressArgs( - protocol="-1", - from_port=0, - to_port=0, - cidr_blocks=["0.0.0.0/0"], - ) - ], - opts=pulumi.ResourceOptions( - depends_on=[virtual_private_cloud], - ), - tags=pulumi_tags, - ) - - -def create_application_load_balancer( - application_load_balancer_security_group: aws.ec2.SecurityGroup, - public_subnets: list[aws.ec2.Subnet], -) -> aws.lb.LoadBalancer: - return aws.lb.LoadBalancer( - resource_name="pocketsizefund-alb", - internal=False, - 
load_balancer_type="application",
-        security_groups=[application_load_balancer_security_group.id],
-        subnets=[subnet.id for subnet in public_subnets],
-        opts=pulumi.ResourceOptions(
-            depends_on=[
-                application_load_balancer_security_group,
-                *public_subnets,
-            ],
-        ),
-        tags=pulumi_tags,
-    )
-
-
-def create_application_load_balancer_target_group(
-    virtual_private_cloud: aws.ec2.Vpc,
-    application_load_balancer: aws.lb.LoadBalancer,
-) -> aws.lb.TargetGroup:
-    return aws.lb.TargetGroup(
-        resource_name="pocketsizefund-tg",
-        port=8080,  # match service port
-        protocol="HTTP",
-        target_type="ip",
-        vpc_id=virtual_private_cloud.id,
-        health_check=aws.lb.TargetGroupHealthCheckArgs(
-            enabled=True,
-            healthy_threshold=3,
-            unhealthy_threshold=3,
-            interval=60,
-            path="/health",
-            port="8080",
-            protocol="HTTP",
-            timeout=10,
-        ),
-        opts=pulumi.ResourceOptions(
-            replace_on_changes=["*"],
-            depends_on=[
-                virtual_private_cloud,
-                application_load_balancer,
-            ],
-        ),
-        tags=pulumi_tags,
-    )
-
-
-def create_application_load_balancer_listener(
-    application_load_balancer: aws.lb.LoadBalancer,
-    application_load_balancer_target_group: aws.lb.TargetGroup,
-) -> aws.lb.Listener:
-    return aws.lb.Listener(
-        resource_name="pocketsizefund-listener",
-        load_balancer_arn=application_load_balancer.arn,
-        port=80,  # publicly exposed port
-        protocol="HTTP",
-        default_actions=[
-            aws.lb.ListenerDefaultActionArgs(
-                type="forward",
-                target_group_arn=application_load_balancer_target_group.arn,
-            )
-        ],
-        opts=pulumi.ResourceOptions(
-            depends_on=[
-                application_load_balancer,
-                application_load_balancer_target_group,
-            ],
-        ),
-        tags=pulumi_tags,
-    )
diff --git a/infrastructure/keys.py b/infrastructure/keys.py
deleted file mode 100644
index 029b3202a..000000000
--- a/infrastructure/keys.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import json
-
-import pulumi
-import pulumi_aws as aws
-from tags import pulumi_tags
-
-
-def create_duckdb_user_access_key(
-    data_bucket_name: pulumi.Output[str],
-) -> aws.iam.AccessKey:
-    duckdb_user = aws.iam.User(
-        resource_name="pocketsizefund-duckdb-user",
-        name="pocketsizefund-duckdb-user",
-        tags=pulumi_tags,
-    )
-
-    duckdb_policy = aws.iam.Policy(
-        resource_name="pocketsizefund-duckdb-policy",
-        name="pocketsizefund-duckdb-policy",
-        description="Policy for application service DuckDB access",
-        policy=json.dumps(
-            {
-                "Version": "2012-10-17",
-                "Statement": [
-                    {
-                        "Effect": "Allow",
-                        "Action": [
-                            "s3:GetObject",
-                            "s3:ListBucket",
-                            "s3:PutObject",
-                            "s3:DeleteObject",
-                        ],
-                        "Resource": [
-                            f"arn:aws:s3:::{data_bucket_name}/*",
-                            f"arn:aws:s3:::{data_bucket_name}",
-                        ],
-                    }
-                ],
-            }
-        ),
-        tags=pulumi_tags,
-    )
-
-    aws.iam.UserPolicyAttachment(
-        resource_name="pocketsizefund-duckdb-user-policy",
-        user=duckdb_user.name,
-        policy_arn=duckdb_policy.arn,
-    )
-
-    return aws.iam.AccessKey(
-        resource_name="pocketsizefund-duckdb-user-access-key",
-        user=duckdb_user.name,
-        status="Active",
-    )
diff --git a/infrastructure/main.nu b/infrastructure/main.nu
new file mode 100644
index 000000000..8e95ddd86
--- /dev/null
+++ b/infrastructure/main.nu
@@ -0,0 +1,57 @@
+def create-contexts [] {
+    do {
+        docker context rm -f pocketsizefund
+        docker context rm -f pocketsizefund-local
+    } | ignore
+    docker context create pocketsizefund --docker "host=ssh://pocketsizefund-swarm"
+    # `docker context create` requires an explicit Docker endpoint; point the
+    # local context at the default socket.
+    docker context create pocketsizefund-local --docker "host=unix:///var/run/docker.sock"
+}
+
+def launch-stacks [context: string] {
+    docker context use $context
+    docker stack deploy -c stack.yml infrastructure --detach=false
+}
+
+def "infrastructure up" [] {
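+    # Bring up the Lightsail swarm with Pulumi, write the manager's SSH key
+    # and host entry locally, then deploy the stack through the remote and
+    # local Docker contexts.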
+    ^pulumi up --yes
+    let manager_ip = (^pulumi stack output managerIp | str trim)
+    let pem = (^pulumi stack output --show-secrets sshPrivateKeyPem)
+
+    $pem | save --raw --force swarm.pem
+    chmod 600 swarm.pem
+
+    let ssh_cfg = $"($env.HOME)/.ssh/config"
+    let block = $"
+Host pocketsizefund-swarm
+    HostName ($manager_ip)
+    User ubuntu
+    IdentityFile ($env.PWD)/swarm.pem
+    IdentitiesOnly yes
+    StrictHostKeyChecking accept-new
+"
+    # `ls` errors on a missing path, so test for the config file explicitly.
+    if not ($ssh_cfg | path exists) {
+        $block | save --raw --force $ssh_cfg
+    } else {
+        # Drop any previous pocketsizefund-swarm block, including its indented
+        # body lines, so repeated runs do not leave stale entries behind.
+        let cfg = (open --raw $ssh_cfg)
+        let filtered = ($cfg
+            | lines
+            | reduce -f {keep: [], skipping: false} { |it, acc|
+                if ($it | str starts-with "Host pocketsizefund-swarm") {
+                    {keep: $acc.keep, skipping: true}
+                } else if ($acc.skipping and (($it | str starts-with " ") or ($it | str starts-with "\t"))) {
+                    $acc
+                } else {
+                    {keep: ($acc.keep | append $it), skipping: false}
+                }
+            }
+            | get keep
+            | str join (char nl))
+        $filtered | save --raw --force $ssh_cfg
+        $"\n($block)" | save --raw --append $ssh_cfg
+    }
+
+    ssh-keygen -R $manager_ip | ignore
+    ssh-keyscan -H $manager_ip | save --append $"($env.HOME)/.ssh/known_hosts"
+
+    ssh pocketsizefund-swarm 'docker info -f "{{.ServerVersion}} {{.Swarm.LocalNodeState}}"'
+
+    create-contexts
+
+    launch-stacks pocketsizefund
+    launch-stacks pocketsizefund-local
+
+    # Point the CLI back at the swarm manager so `docker node ls` reports the
+    # cluster rather than the local daemon.
+    docker context use pocketsizefund
+    docker node ls
+}
diff --git a/infrastructure/monitors.py b/infrastructure/monitors.py
deleted file mode 100644
index 5f41ea50e..000000000
--- a/infrastructure/monitors.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import pulumi
-import pulumi_aws as aws
-import pulumi_eks as eks
-from tags import pulumi_tags
-
-
-def create_prometheus_scraper(
-    prometheus_workspace_arn: pulumi.Output[str],
-    kubernetes_cluster: eks.Cluster,
-    security_group: aws.ec2.SecurityGroup,
-) -> aws.amp.Scraper:
-    scrape_configuration = pulumi.Output.json_dumps(
-        {
-            "global": {
-                "scrape_interval": "15m",
-                "evaluation_interval": "15m",
-            },
-            "scrape_configs": [
-                {
-                    "job_name": "kubernetes-apiservers",
-                    "kubernetes_sd_configs": [
-                        {
-                            "role": "endpoints",
-                            "api_server": kubernetes_cluster.core.endpoint,
-                        }
-                    ],
-                    "scheme": "https",
-                    "tls_config": {
-                        "ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",  # noqa: E501
-                    },
-                    "bearer_token_file": "/var/run/secrets/kubernetes.io/serviceaccount/token",  # noqa: E501
-                    "relabel_configs": [
-                        {
-                            "source_labels": [
-                                "__meta_kubernetes_namespace",
-                                "__meta_kubernetes_service_name",
-                                "__meta_kubernetes_endpoint_port_name",
-                            ],
-                            "action": "keep",
-                            "regex": "default;kubernetes;https",
-                        }
-                    ],
-                },
-                {
-                    "job_name": "kubernetes-nodes",
-                    "kubernetes_sd_configs": [
-                        {
-                            "role": "node",
-                            "api_server": kubernetes_cluster.core.endpoint,
-                        }
-                    ],
-                    "scheme": "https",
-                    "tls_config": {
-                        "ca_file": "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",  # noqa: E501
-                    },
-                    "bearer_token_file": "/var/run/secrets/kubernetes.io/serviceaccount/token",  # noqa: E501
-                    "relabel_configs": [
-                        {
-                            "action": "labelmap",
-                            "regex": "__meta_kubernetes_node_label_(.+)",
-                        },
-                        {
-                            "target_label": "__address__",
-                            "replacement": "kubernetes.default.svc:443",
-                        },
-                        {
-                            "source_labels": ["__meta_kubernetes_node_name"],
-                            "regex": "(.+)",
-                            "target_label": "__metrics_path__",
-                            "replacement": "/api/v1/nodes/$1/proxy/metrics",
-                        },
-                    ],
-                },
-                {
-                    "job_name": "kubernetes-pods",
-                    "kubernetes_sd_configs": [
-                        {
-                            "role": "pod",
-                            "api_server": kubernetes_cluster.core.endpoint,
-                        }
-                    ],
-                    "relabel_configs": [
-                        {
-                            "source_labels": [
-                                "__meta_kubernetes_pod_annotation_prometheus_io_scrape"
-                            ],
-                            "action": "keep",
-                            "regex": "true",
-                        },
-                        {
-                            "source_labels": [
-                                "__meta_kubernetes_pod_annotation_prometheus_io_path"
-                            ],
-                            "action": "replace",
-                            "target_label":
"__metrics_path__", - "regex": "(.+)", - }, - { - "source_labels": [ - "__address__", - "__meta_kubernetes_pod_annotation_prometheus_io_port", - ], - "action": "replace", - "regex": r"([^:]+)(?::\\d+)?;(\\d+)", - "replacement": "$1:$2", - "target_label": "__address__", - }, - { - "action": "labelmap", - "regex": "__meta_kubernetes_pod_label_(.+)", - }, - { - "source_labels": ["__meta_kubernetes_namespace"], - "action": "replace", - "target_label": "kubernetes_namespace", - }, - { - "source_labels": ["__meta_kubernetes_pod_name"], - "action": "replace", - "target_label": "kubernetes_pod_name", - }, - ], - }, - ], - } - ) - - return aws.amp.Scraper( - resource_name="pocketsizefund-prometheus-scraper", - alias="pocketsizefund-cluster-scraper", - scrape_configuration=scrape_configuration, - destination=aws.amp.ScraperDestinationArgs( - amp=aws.amp.ScraperDestinationAmpArgs( - workspace_arn=prometheus_workspace_arn, - ), - ), - source=aws.amp.ScraperSourceArgs( - eks=aws.amp.ScraperSourceEksArgs( - cluster_arn=kubernetes_cluster.eks_cluster.arn, - subnet_ids=kubernetes_cluster.eks_cluster.vpc_config.subnet_ids, - security_group_ids=[security_group.id], - ), - ), - opts=pulumi.ResourceOptions( - depends_on=[ - kubernetes_cluster, - security_group, - ], - ), - tags=pulumi_tags, - ) diff --git a/infrastructure/prometheus.yml b/infrastructure/prometheus.yml new file mode 100644 index 000000000..5fa95f88e --- /dev/null +++ b/infrastructure/prometheus.yml @@ -0,0 +1,16 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ['prometheus:9090'] + + - job_name: node-exporter + static_configs: + - targets: ['tasks.node-exporter:9100'] + + - job_name: cadvisor + static_configs: + - targets: ['tasks.cadvisor:8080'] diff --git a/infrastructure/pyproject.toml b/infrastructure/pyproject.toml index 4bc30b170..3d0938c99 100644 --- a/infrastructure/pyproject.toml +++ b/infrastructure/pyproject.toml @@ -1,12 +1,12 @@ [project] name = "infrastructure" -version = "20250716.1" +version = "0.1.0" +description = "Infrastructure management with Pulumi" +readme = "README.md" requires-python = "==3.12.10" dependencies = [ - "pulumi>=3.169.0", - "pulumi-aws>=6.0.0", - "pulumi-eks>=3.9.1", - "pulumi-docker>=3.0.0", - "pulumi-docker-build>=0.0.12", - "pulumi-kubernetes>=4.23.0", + "pulumi>=3.189.0", + "pulumi-aws>=7.4.0", + "pulumi-command>=1.1.0", + "pulumi-tls>=5.2.1", ] diff --git a/infrastructure/requirements.txt b/infrastructure/requirements.txt new file mode 100644 index 000000000..9a1b8d0fb --- /dev/null +++ b/infrastructure/requirements.txt @@ -0,0 +1,2 @@ +pulumi>=3.0.0,<4.0.0 +pulumi-aws>=7.0.0,<8.0.0 diff --git a/infrastructure/services.py b/infrastructure/services.py deleted file mode 100644 index 8b54bb553..000000000 --- a/infrastructure/services.py +++ /dev/null @@ -1,345 +0,0 @@ -from typing import Any - -import pulumi -import pulumi_aws as aws -import pulumi_docker_build as docker_build -import pulumi_kubernetes as k8s - - -def create_service_environment_variables( - inputs: list[tuple[str, Any]], -) -> pulumi.Output[dict[str, str]]: - return pulumi.Output.all(*inputs).apply(lambda secrets: dict(secrets)) - - -def create_knative_serving_core( - kubernetes_provider: k8s.Provider, -) -> k8s.yaml.v2.ConfigGroup: - knative_serving_namespace = k8s.core.v1.Namespace( - resource_name="pocketsizefund-knative-serving-namespace", - metadata={"name": "knative-serving"}, - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - 
depends_on=[kubernetes_provider], - ), - ) - - knative_serving_crds = k8s.yaml.v2.ConfigGroup( # custom resource definition - resource_name="pocketsizefund-knative-serving-crds", - files=[ - "https://github.com/knative/serving/releases/download/knative-v1.12.0/serving-crds.yaml" - ], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - knative_serving_namespace, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - knative_serving_core = k8s.yaml.v2.ConfigGroup( - resource_name="pocketsizefund-knative-serving-core", - files=[ - "https://github.com/knative/serving/releases/download/knative-v1.12.0/serving-core.yaml" - ], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - knative_serving_namespace, - knative_serving_crds, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - # NOTE: check if this or its specific configurations are necessary - k8s.core.v1.ConfigMap( - resource_name="pocketsizefund-knative-configuration-network", - metadata=k8s.meta.v1.ObjectMetaArgs( - name="config-network", - namespace="knative-serving", - ), - data={ - "autocreate-cluster-domain-claims": "false", - "auto-tls": "false", - "http-protocol": "Redirected", - }, - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - knative_serving_namespace, - knative_serving_core, - knative_serving_crds, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - return knative_serving_core - - -def create_knative_eventing_core( - kubernetes_provider: k8s.Provider, -) -> k8s.yaml.v2.ConfigGroup: - knative_eventing_namespace = k8s.core.v1.Namespace( - resource_name="pocketsizefund-eventing-namespace", - metadata={"name": "knative-eventing"}, - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[kubernetes_provider], - ), - ) - - knative_eventing_crds = k8s.yaml.v2.ConfigGroup( - resource_name="pocketsizefund-knative-eventing-crds", - files=[ - "https://github.com/knative/eventing/releases/download/knative-v1.12.0/eventing-crds.yaml" - ], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - knative_eventing_namespace, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - return k8s.yaml.v2.ConfigGroup( - resource_name="pocketsizefund-knative-eventing-core", - files=[ - "https://github.com/knative/eventing/releases/download/knative-v1.12.0/eventing-core.yaml" - ], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - knative_eventing_namespace, - knative_eventing_crds, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - -def create_knative_broker( - kubernetes_provider: k8s.Provider, - knative_eventing_core: k8s.yaml.v2.ConfigGroup, -) -> k8s.yaml.v2.ConfigGroup: - content = { - "apiVersion": "eventing.knative.dev/v1", - "kind": "Broker", - "metadata": { - "name": "default", - "namespace": "default", - }, - } - - return k8s.yaml.v2.ConfigGroup( - resource_name="pocketsizefund-default-broker", - objs=[content], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[kubernetes_provider, knative_eventing_core], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - 
update="2m", - delete="2m", - ), - ), - ) - - -def create_knative_service( # noqa: PLR0913 - kubernetes_provider: k8s.Provider, - service_name: str, - image: docker_build.Image, - application_load_balancer_service_target_group: aws.lb.TargetGroup, - knative_serving_core: k8s.yaml.v2.ConfigGroup, - environment_variables: pulumi.Output[dict[str, str]] | None = None, -) -> k8s.yaml.v2.ConfigGroup: - formatted_environment_variables = ( - environment_variables.apply( - lambda env_vars: [ - {"name": key, "value": value} for key, value in env_vars.items() - ] - ) - if environment_variables - else [] - ) - - content = { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": service_name, - "namespace": "default", - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "alb.ingress.kubernetes.io/scheme": "internet-facing", - "alb.ingress.kubernetes.io/target-type": "ip", - "alb.ingress.kubernetes.io/target-group-arn": application_load_balancer_service_target_group.arn, # noqa: E501 - "prometheus.io/scrape": "true", - "prometheus.io/path": "/metrics", - "prometheus.io/port": "8080", - } - }, - "spec": { - "containers": [ - { - "image": image.ref, - "name": service_name, - "env": formatted_environment_variables, - "resources": { - "requests": {"cpu": "100m", "memory": "128Mi"}, - "limits": {"cpu": "1000m", "memory": "512Mi"}, - }, - "ports": [ - {"containerPort": 8080}, - ], - } - ] - }, - } - }, - } - - return k8s.yaml.v2.ConfigGroup( - resource_name=f"pocketsizefund-knative-service-{service_name}", - objs=[content], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[ - kubernetes_provider, - image, - application_load_balancer_service_target_group, - knative_serving_core, - ], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - -def create_knative_trigger( - kubernetes_provider: k8s.Provider, - source_service_name: str, - source_attribute_type: str, - target_service_name: str, - knative_eventing_core: k8s.yaml.v2.ConfigGroup, -) -> k8s.yaml.v2.ConfigGroup: - resource_name = ( - f"pocketsizefund-{source_service_name}-to-{target_service_name}-trigger" - ) - - content = { - "apiVersion": "eventing.knative.dev/v1", - "kind": "Trigger", - "metadata": { - "name": str(resource_name), - "namespace": "default", - }, - "spec": { - "broker": "default", - "filter": { - "attributes": { - "type": source_attribute_type, # dot separated - }, - }, - "subscriber": { - "ref": { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "name": target_service_name, - } - }, - }, - } - - return k8s.yaml.v2.ConfigGroup( - resource_name=str(resource_name), - objs=[content], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[kubernetes_provider, knative_eventing_core], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) - - -def create_knative_schedule( - kubernetes_provider: k8s.Provider, - target_service_name: str, - target_path: str, - cron_schedule: str, - knative_eventing_core: k8s.yaml.v2.ConfigGroup, -) -> k8s.yaml.v2.ConfigGroup: - content = { - "apiVersion": "sources.knative.dev/v1", - "kind": "PingSource", - "metadata": { - "name": f"{target_service_name}-pingsource", - "namespace": "default", - }, - "spec": { - "schedule": cron_schedule, - "sink": { - "ref": { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "name": target_service_name, - }, - "uri": target_path, - }, - }, - } - - 
formated_cron_schedule = cron_schedule.replace(" ", "-") - - return k8s.yaml.v2.ConfigGroup( - resource_name=f"{target_service_name}-{formated_cron_schedule}-schedule", - objs=[content], - opts=pulumi.ResourceOptions( - provider=kubernetes_provider, - depends_on=[kubernetes_provider, knative_eventing_core], - custom_timeouts=pulumi.CustomTimeouts( - create="2m", - update="2m", - delete="2m", - ), - ), - ) diff --git a/infrastructure/stack.yml b/infrastructure/stack.yml new file mode 100644 index 000000000..72034a416 --- /dev/null +++ b/infrastructure/stack.yml @@ -0,0 +1,138 @@ +version: "3.9" + +networks: + public: + driver: overlay + attachable: true + internal: + driver: overlay + attachable: true + +volumes: + traefik_letsencrypt: + portainer_data: + prom_data: + grafana_data: + +configs: + prometheus.yml: + name: prometheus.yml + file: ./prometheus.yml + +services: + # Network Services + traefik: + image: traefik:v3.1 + command: + - --providers.swarm.endpoint=unix:///var/run/docker.sock + - --providers.swarm.exposedByDefault=false + - --entrypoints.web.address=:80 + - --entrypoints.websecure.address=:443 + - --api.dashboard=true + - --metrics.prometheus=true + - --metrics.prometheus.addEntryPointsLabels=true + - --metrics.prometheus.addServicesLabels=true + ports: + - target: 80 + published: 80 + protocol: tcp + mode: host + - target: 443 + published: 443 + protocol: tcp + mode: host + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + deploy: + placement: + constraints: [node.role == manager] + update_config: + parallelism: 1 + order: start-first + networks: [public] + + portainer: + image: portainer/portainer-ce:latest + ports: + - "9000:9000" + - "9443:9443" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - portainer_data:/data + networks: + - public + deploy: + placement: + constraints: + - node.role == manager + + # Monitoring Services + prometheus: + image: prom/prometheus:v2.52.0 + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-lifecycle + volumes: + - prom_data:/prometheus + configs: + - source: prometheus.yml + target: /etc/prometheus/prometheus.yml + deploy: + placement: + constraints: [node.role == manager] + resources: + reservations: { memory: 256M } + limits: { memory: 1G } + networks: [internal] + + node-exporter: + image: prom/node-exporter:v1.8.1 + command: + - --path.rootfs=/host + - --collector.filesystem.mount-points-exclude=^/(dev|proc|run|sys|var/lib/docker/.+|var/lib/kubelet/.+|var/lib/containers/.+)($$|/) + - --no-collector.hwmon + deploy: + mode: global + resources: + reservations: { memory: 64M } + limits: { memory: 256M } + volumes: + - /:/host:ro,rslave + networks: [internal] + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.47.2 + deploy: + mode: global + resources: + reservations: { memory: 128M } + limits: { memory: 512M } + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + networks: [internal] + + grafana: + image: grafana/grafana:10.4.2 + environment: + GF_SECURITY_ADMIN_USER: admin + # TODO: Change this to a random password + GF_SECURITY_ADMIN_PASSWORD: admin123 + volumes: + - grafana_data:/var/lib/grafana + deploy: + placement: + constraints: [node.role == manager] + resources: + reservations: { memory: 128M } + limits: { memory: 512M } + labels: + - traefik.enable=true + - traefik.http.routers.grafana.rule=Host(`grafana.example.com`) + - 
traefik.http.routers.grafana.entrypoints=web + - traefik.http.services.grafana.loadbalancer.server.port=3000 + networks: [public, internal] \ No newline at end of file diff --git a/infrastructure/tags.py b/infrastructure/tags.py deleted file mode 100644 index 225ee150d..000000000 --- a/infrastructure/tags.py +++ /dev/null @@ -1,13 +0,0 @@ -import pulumi - -pulumi_tags = { - "project": "pocketsizefund", - "manager": "pulumi", - "stack": pulumi.get_stack(), -} - -manual_tags = { - "project": "pocketsizefund", - "manager": "manual", - "stack": "none", -} diff --git a/infrastructure/upload_grafana_dashboard.nu b/infrastructure/upload_grafana_dashboard.nu deleted file mode 100644 index c7362d284..000000000 --- a/infrastructure/upload_grafana_dashboard.nu +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env nu - -def main [ - --dashboard-file: string = "grafana-dashboard.json" - --workspace-id: string = "" - --region: string = "us-east-1" - --profile: string = "pulumi" - --dry-run = false -] { - check_prerequisites - - print "Pocket Size Fund Grafana Dashboard Upload" - print "================================================" - - if not ($dashboard_file | path exists) { - print $"Dashboard file not found: ($dashboard_file)" - exit 1 - } - - print $"Using dashboard file: ($dashboard_file)" - - let workspace_id = if ($workspace_id | is-empty) { - print "Getting Grafana workspace ID from AWS..." - try { - let result = (aws grafana list-workspaces - --region $region - --profile $profile - --query 'workspaces[?name==`pocketsizefund`]' - --output json - | from json | get 0.id ) - if ($result | is-empty) { - print "No workspace found with name 'pocketsizefund'" - exit 1 - } - $result - } catch { - print "Failed to get workspace ID. Provide --workspace-id or ensure workspace exists" - exit 1 - } - } else { - $workspace_id - } - - if ($workspace_id | is-empty) { - print "No Grafana workspace found named 'pocketsizefund'" - print "Create workspace first or provide specific --workspace-id" - exit 1 - } - - print $"Target workspace ID: ($workspace_id)" - - print "Getting Grafana workspace endpoint..." - let workspace_info = try { - (aws grafana describe-workspace - --workspace-id $workspace_id - --region $region - --profile $profile - --output json - | from json) - } catch { - print $"Failed to describe workspace: ($workspace_id)" - exit 1 - } - - let grafana_endpoint = $workspace_info.workspace.endpoint - print $"Grafana endpoint: ($grafana_endpoint)" - - print "Loading dashboard configuration..." - let dashboard_content = try { - open $dashboard_file | from json - } catch { - print $"Invalid JSON in dashboard file: ($dashboard_file)" - exit 1 - } - - let upload_payload = { - dashboard: $dashboard_content.dashboard - overwrite: true - message: $"Uploaded via Nu script at (date now | format date '%Y-%m-%d %H:%M:%S')" - } - - if $dry_run { - print "DRY RUN MODE - Dashboard payload preview:" - print ($upload_payload | to json) - print "\nDry run completed. Use without --dry-run to upload." - exit 0 - } - - print "Creating Grafana API key..." - let api_key_response = try { - (aws grafana create-workspace-api-key - --workspace-id $workspace_id - --key-name $"pocketsizefund-upload-($workspace_id)-(date now | format date '%Y%m%d-%H%M%S')" - --key-role ADMIN - --seconds-to-live 3600 - --region $region - --profile $profile - --output json - | from json) - } catch { - print "Failed to create API key. Check permissions." 
- exit 1 - } - - let api_key = $api_key_response.key - print "API key created successfully" - - print "Uploading dashboard to Grafana..." - let upload_result = try { - (http post $"($grafana_endpoint)/api/dashboards/db" - -H [ - "Authorization" $"Bearer ($api_key)" - "Content-Type" "application/json" - ] - ($upload_payload | to json)) - } catch { - print "Failed to upload dashboard" - - try { - (aws grafana delete-workspace-api-key - --workspace-id $workspace_id - --key-name $api_key_response.keyName - --region $region - --profile $profile - | ignore) - } catch { - # Ignore cleanup errors - } - - exit 1 - } - - let result = try { - $upload_result | from json - } catch { - print "Upload may have succeeded but response parsing failed" - print $"Response: ($upload_result)" - } - - if ($result | get status? | default "unknown") == "success" { - print "Dashboard uploaded successfully!" - print $"Dashboard URL: ($grafana_endpoint)/d/($result.slug)" - print $"Dashboard ID: ($result.id)" - print $"Version: ($result.version)" - } else { - print "Upload completed with unknown status" - print $"Response: ($result)" - } - - print "Cleaning up temporary API key..." - try { - (aws grafana delete-workspace-api-key - --workspace-id $workspace_id - --key-name $api_key_response.keyName - --region $region - --profile $profile - | ignore) - print "API key cleaned up" - } catch { - print "Failed to clean up API key (manual deletion may be needed)" - } - - print "\nDashboard upload completed!" - print $"Access your dashboard at: ($grafana_endpoint)" -} - -def check_prerequisites [] { - print "Checking prerequisites..." - - try { - aws --version | ignore - print "AWS CLI available" - } catch { - print "AWS CLI not found. Please install AWS CLI." - exit 1 - } - - try { - which jq | ignore - print "jq available" - } catch { - print "jq not found (optional but recommended for JSON debugging)" - } - - print "Prerequisites check completed\n" -} \ No newline at end of file diff --git a/infrastructure/vpc.py b/infrastructure/vpc.py deleted file mode 100644 index d61e186c4..000000000 --- a/infrastructure/vpc.py +++ /dev/null @@ -1,134 +0,0 @@ -import pulumi -import pulumi_aws as aws -from tags import pulumi_tags - - -def create_virtual_private_cloud() -> aws.ec2.Vpc: - return aws.ec2.Vpc( - resource_name="pocketsizefund-vpc", - cidr_block="10.0.0.0/16", - enable_dns_support=True, - enable_dns_hostnames=True, - tags=pulumi_tags, - ) - - -def create_internet_gateway( - virtual_private_cloud: aws.ec2.Vpc, -) -> aws.ec2.InternetGateway: - return aws.ec2.InternetGateway( - resource_name="pocketsizefund-internet-gateway", - vpc_id=virtual_private_cloud.id, - opts=pulumi.ResourceOptions(depends_on=[virtual_private_cloud]), - tags=pulumi_tags, - ) - - -def create_elastic_ip(virtual_private_cloud: aws.ec2.Vpc) -> aws.ec2.Eip: - return aws.ec2.Eip( - resource_name="pocketsizefund-elastic-ip", - opts=pulumi.ResourceOptions( - depends_on=[virtual_private_cloud], - ), - tags=pulumi_tags, - ) - - -def create_nat_gateway( - elastic_ip: aws.ec2.Eip, - public_subnet: aws.ec2.Subnet, -) -> aws.ec2.NatGateway: - return aws.ec2.NatGateway( - resource_name="pocketsizefund-nat-gateway", - allocation_id=elastic_ip.id, - subnet_id=public_subnet.id, - tags=pulumi_tags, - opts=pulumi.ResourceOptions( - depends_on=[ - elastic_ip, - public_subnet, - ], - ), - ) - - -def create_route_table( - virtual_private_cloud: aws.ec2.Vpc, - internet_gateway: aws.ec2.InternetGateway | None = None, - nat_gateway: aws.ec2.NatGateway | None = None, -) -> 
aws.ec2.RouteTable: - depends_on: list[pulumi.Resource] = [virtual_private_cloud] - if internet_gateway: - depends_on.append(internet_gateway) - if nat_gateway: - depends_on.append(nat_gateway) - - if internet_gateway and nat_gateway: - message = "Cannot specify both internet_gateway and nat_gateway" - raise ValueError(message) - if not internet_gateway and not nat_gateway: - message = "Must specify either internet_gateway or nat_gateway" - raise ValueError(message) - - visibility = "public" if internet_gateway else "private" - - return aws.ec2.RouteTable( - resource_name=f"pocketsizefund-{visibility}-route-table", - vpc_id=virtual_private_cloud.id, - routes=[ - aws.ec2.RouteTableRouteArgs( - cidr_block="0.0.0.0/0", - gateway_id=internet_gateway.id if internet_gateway else None, - nat_gateway_id=nat_gateway.id if nat_gateway else None, - ) - ], - opts=pulumi.ResourceOptions(depends_on=depends_on), - tags=pulumi_tags, - ) - - -def create_subnet( - virtual_private_cloud: aws.ec2.Vpc, - route_table: aws.ec2.RouteTable, - availability_zone: str, - subnet_number: int, - visibility: str = "public", -) -> aws.ec2.Subnet: - minimum_subnet_number = 0 - maximum_subnet_number = 255 - - if not minimum_subnet_number <= subnet_number <= maximum_subnet_number: - message = f"subnet_number must be between 0 and 255, got {subnet_number}" - raise ValueError(message) - - visibility = visibility.lower() - - subnet = aws.ec2.Subnet( - resource_name=f"pocketsizefund-{visibility}-subnet-{subnet_number}", - vpc_id=virtual_private_cloud.id, - cidr_block=f"10.0.{subnet_number}.0/24", - availability_zone=availability_zone, - map_public_ip_on_launch=visibility == "public", - tags=pulumi_tags, - opts=pulumi.ResourceOptions( - depends_on=[ - virtual_private_cloud, - route_table, - ], - ), - ) - - aws.ec2.RouteTableAssociation( - resource_name=f"pocketsizefund-{visibility}-route-table-subnet-association-{subnet_number}", - subnet_id=subnet.id, - route_table_id=route_table.id, - opts=pulumi.ResourceOptions( - depends_on=[ - virtual_private_cloud, - subnet, - route_table, - ], - ), - ) - - return subnet diff --git a/uv.lock b/uv.lock index e291f3446..28444b30d 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ wheels = [ [[package]] name = "aiobotocore" -version = "2.24.0" +version = "2.24.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -45,9 +45,9 @@ dependencies = [ { name = "python-dateutil" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b2/ca/ac82c0c699815b6d5b4017f3d8fb2c2d49537f4937f4a0bdf58b4c75d321/aiobotocore-2.24.0.tar.gz", hash = "sha256:b32c0c45d38c22a18ce395a0b5448606c5260603296a152895b5bdb40ab3139d", size = 119597, upload-time = "2025-08-08T18:26:50.373Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/02/b4ed1af4b3437c2fc6e6111e7fdee011b34cf1c0cc8f314474f843e10019/aiobotocore-2.24.1.tar.gz", hash = "sha256:59237f1b2d4ff619f9a9e78360b691d59b92fdd4d03d054dbd2eeff8ada5667e", size = 119754, upload-time = "2025-08-15T15:49:53.209Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/68/b29577197aa2e54b50d6f214524790cc1cb27d289585ad7c7bdfe5125285/aiobotocore-2.24.0-py3-none-any.whl", hash = "sha256:72bb1f8eb1b962779a95e1bcc9cf35bc33196ad763b622a40ae7fa9d2e95c87c", size = 84971, upload-time = "2025-08-08T18:26:48.777Z" }, + { url = "https://files.pythonhosted.org/packages/20/26/c3c93209084e24990ad1b4214f67dce1c0183454cec9cd2cad9433f493bb/aiobotocore-2.24.1-py3-none-any.whl", hash = 
"sha256:557922823455ca65bbd065b363b54846f16b9c4b6bd0b61ecdfa01ca13a04531", size = 85216, upload-time = "2025-08-15T15:49:51.442Z" }, ] [[package]]