diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 8bdb364..69eae4a 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -40,11 +40,11 @@ jobs: run: npm run build working-directory: apps/dashboard env: - VITE_AUTH0_DOMAIN: ${{ secrets.VITE_AUTH0_DOMAIN }} - VITE_AUTH0_CLIENT_ID: ${{ secrets.VITE_AUTH0_CLIENT_ID }} - VITE_AUTH0_AUDIENCE: ${{ secrets.VITE_AUTH0_AUDIENCE }} - VITE_BFF_URL: ${{ secrets.E2E_BFF_URL }} - VITE_GATEWAY_URL: ${{ secrets.E2E_GATEWAY_URL }} + VITE_AUTH0_DOMAIN: e2e.auth0.local + VITE_AUTH0_CLIENT_ID: e2e-client-id + VITE_AUTH0_AUDIENCE: https://api.grainguard.test + VITE_BFF_URL: http://localhost:4000/graphql + VITE_GATEWAY_URL: http://localhost:3000 - name: Serve dashboard run: npx serve -s dist -l 5173 & @@ -58,8 +58,8 @@ jobs: working-directory: tests/e2e env: E2E_BASE_URL: http://localhost:5173 - VITE_AUTH0_CLIENT_ID: ${{ secrets.VITE_AUTH0_CLIENT_ID }} - VITE_AUTH0_AUDIENCE: ${{ secrets.VITE_AUTH0_AUDIENCE }} + VITE_AUTH0_CLIENT_ID: e2e-client-id + VITE_AUTH0_AUDIENCE: https://api.grainguard.test - name: Upload Playwright report uses: actions/upload-artifact@v4 diff --git a/.github/workflows/perf.yml b/.github/workflows/perf.yml index a7b0e9c..048740a 100644 --- a/.github/workflows/perf.yml +++ b/.github/workflows/perf.yml @@ -69,6 +69,8 @@ jobs: working-directory: apps/gateway env: PORT: 3000 + BFF_HOST: localhost + BFF_PORT: "4000" NODE_ENV: test AUTH_ENABLED: "false" DATABASE_URL: postgres://grainguard:grainguard@localhost:5432/grainguard @@ -168,8 +170,17 @@ jobs: --env BFF_URL=http://localhost:4000 \ --env GATEWAY_AUTH_DISABLED=true \ --env BFF_AUTH_DISABLED=true \ - --env JWT=dummy-jwt \ + --env GATEWAY_SAMPLE_PATH=/health \ --env TEST_DEVICE_ID=00000000-0000-0000-0000-000000000001 \ + --env BASELINE_RATE=20 \ + --env BASELINE_DURATION=30s \ + --env BASELINE_PREALLOCATED_VUS=20 \ + --env BASELINE_MAX_VUS=40 \ + --env SPIKE_TARGET=40 \ + --env SPIKE_RAMP_UP=15s \ + --env SPIKE_HOLD=15s \ + --env SPIKE_RAMP_DOWN=15s \ + --env THINK_TIME_SECONDS=0.05 \ scripts/load-tests/performance-budget.js - name: Upload performance results diff --git a/README.md b/README.md index aa9556f..4c4fb08 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > Production-grade, polyglot microservices SaaS platform for grain and agri operations. -GrainGuard ingests high-volume device telemetry, computes spoilage risk scores, triggers automated alert workflows, and ships with full multi-tenant billing, SSO, team management, audit logging, observability, CI/CD, chaos testing, SLO monitoring, and operational runbooks. +GrainGuard ingests high-volume device telemetry, computes spoilage risk scores, triggers automated alert workflows, and ships with multi-tenant billing, SSO, team management, audit logging, observability, CI/CD, load testing, and operational runbooks. --- @@ -81,6 +81,18 @@ Risk Engine (Python) ── Workflow Alerts (Node.js) ── RabbitMQ ── Job --- +## Current Deployment Status + +| Area | State | +|------|-------| +| Local Docker stack | ✅ Validated end-to-end | +| GitOps apps in repo | ✅ `dev`, `staging`, and `prod` ArgoCD apps committed | +| Terraform environments in repo | ✅ `dev` and `staging` committed | +| Dedicated staging environment | 🟡 Scaffold committed; deploy/validate next | +| Production rollout strategy | 🟡 Safe rolling deploys now; canary planned for production | + +--- + ## SaaS Features | Feature | Status | @@ -172,18 +184,22 @@ go run tools/publish-telemetry/main.go # Go unit + integration tests go test -race -count=1 ./... +# Go lint +go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.0 run --timeout=5m + # k6 load tests (requires running stack) k6 run tests/load/spike.js k6 run tests/load/soak.js k6 run tests/load/stress.js -# Chaos tests (requires kubectl + live cluster) -bash tests/chaos/run-all.sh - # Replay + idempotency test ./scripts/replay/replay_test.sh ``` +Note: +- The core load-test scripts above are committed in `tests/load/`. +- Cluster-level chaos automation is not currently committed on `master`; add or restore it before relying on README-driven chaos drills. + ## Code Review Automation This repository is preconfigured for CodeRabbit via [`/.coderabbit.yaml`](./.coderabbit.yaml). @@ -215,35 +231,15 @@ Notes: --- -## Chaos Testing - -Five experiments covering the critical failure modes: - -| Experiment | What it kills | Pass condition | -|------------|--------------|----------------| -| `pod-kill` | gateway, bff, telemetry-service pods | Respawns within 30s | -| `kafka-consumer-pause` | read-model-builder + cdc-transformer | Lag ≤ 10 000 within 5 min | -| `redis-outage` | Redis | BFF falls back to DB, no panics | -| `projection-lag` | read-model-builder | Alert fires, lag recovers in 5 min | -| `network-partition` | telemetry-service → Kafka egress | Messages buffered, delivered after heal | - -```bash -# Run all experiments -bash tests/chaos/run-all.sh - -# Or trigger via GitHub Actions (manual dispatch) -# .github/workflows/chaos.yml — also runs weekly on Saturdays -``` - ---- - ## Operational Runbooks | Runbook | Trigger | |---------|---------| +| [Postgres Backup / Restore](docs/runbooks/postgres-backup-restore.md) | Backup verification, restore drill, data recovery | | [Postgres Failover](docs/runbooks/postgres-failover.md) | Primary down, replica lag high | | [Kafka Loss](docs/runbooks/kafka-loss.md) | Broker down, under-replicated partitions | | [DLQ Spike](docs/runbooks/dlq-spike.md) | `DLQMessagesAccumulating` alert | +| [Redis Backup / Restore](docs/runbooks/redis-backup-restore.md) | Cache restore drill, persistence recovery | | [Redis Failover](docs/runbooks/redis-failover.md) | Cache miss 100%, lock timeouts | | [Projection Lag](docs/runbooks/projection-lag.md) | `ProjectionLagHigh` alert | | [gRPC Outage](docs/runbooks/grpc-outage.md) | Circuit breaker open, 503 upstream | @@ -261,6 +257,8 @@ terraform apply -var="db_password=yourpassword" Provisions: VPC · EKS · RDS Postgres · Elasticache Redis · MSK Kafka · DynamoDB · ECR · Secrets Manager +Today, `dev` and `staging` Terraform environments are committed in-repo. The next step is to deploy and validate `staging` before treating the rollout path as production-ready. + --- ## Kubernetes (GitOps) @@ -278,6 +276,14 @@ helm diff upgrade grainguard k8s/helm/grainguard \ ArgoCD watches `k8s/argocd/apps/` and auto-syncs on every push to master. +Committed applications today: +- `grainguard-dev` -> `grainguard-dev` +- `grainguard-staging` -> `grainguard-staging` +- `grainguard-prod` -> `grainguard-prod` + +Recommended next environment: +- `grainguard-staging` -> deploy and validate ingress, TLS, DNS, secrets, restore drills, and production-like auth/billing flows before first prod rollout + --- ## Architecture Decision Records @@ -303,21 +309,26 @@ ArgoCD watches `k8s/argocd/apps/` and auto-syncs on every push to master. |-------|------|--------| | R1 — Core loop | Ingest, CQRS, outbox, saga | ✅ Done | | R2 — CDC + Search | Debezium, Elasticsearch, RabbitMQ | ✅ Done | -| R3 — Reliability | Helm, ArgoCD, k6 load tests, chaos tests | ✅ Done | +| R3 — Reliability baseline | Helm, ArgoCD scaffolding, k6 load tests, runbooks | ✅ Done | | R4 — Observability | SLOs, burn-rate alerts, Grafana dashboard, runbooks | ✅ Done | | R5 — Security | CSRF, rate limiting, audit logging, RBAC, API keys | ✅ Done | | R6 — SaaS billing | Stripe, tenant onboarding, team management, SSO, webhooks | ✅ Done | -| R7 — DB migrations | Flyway/Knex migration framework, schema versioning | 🔜 Next | -| R8 — Secret management | HashiCorp Vault / AWS Secrets Manager integration | 🔜 Planned | +| R7 — Staging environment | Dedicated Argo app, Terraform env, deployed validation | 🟡 Scaffolded | +| R8 — Production hardening | Canary rollout, restore proof, deployed auth/webhook validation | 🔜 Next | --- -## Load test results +## Latest Local Validation + +Latest mixed read/write validation on `master` (local Docker stack): -- Kafka ingest: **1,700 events/sec** -- Gateway p95 latency: **5.89ms** -- Read model builder: **2,500–3,000 events/sec** sustained +- **35,077** total requests +- **438 req/s** aggregate throughput +- **0%** HTTP failure rate +- Gateway GraphQL p95: **11.5 ms** +- Ingest p95: **10.8 ms** +- Kafka consumer groups drained back to **0 lag** after the run --- -*Built to demonstrate end-to-end DDIA patterns, distributed systems, GitOps, SRE practices, and production multi-tenant SaaS architecture.* +*Built to demonstrate end-to-end DDIA patterns, distributed systems, GitOps, SRE practices, and production-style multi-tenant SaaS architecture.* diff --git a/apps/bff/src/datasources/postgres.ts b/apps/bff/src/datasources/postgres.ts index 4edfeaf..3ee78aa 100644 --- a/apps/bff/src/datasources/postgres.ts +++ b/apps/bff/src/datasources/postgres.ts @@ -4,17 +4,38 @@ import { cache } from "./redis"; const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; +function shouldUseInsecureTls(connectionString: string | undefined): boolean { + if (!connectionString) return false; + return /sslmode=(require|verify-ca|verify-full)/i.test(connectionString); +} + +function buildPoolOptions() { + const connectionString = + process.env.READ_DATABASE_URL || + `postgres://${process.env.READ_DB_USER ?? "postgres"}:${process.env.READ_DB_PASSWORD ?? "postgres"}@${process.env.READ_DB_HOST ?? "postgres-read"}:${process.env.READ_DB_PORT ?? "5432"}/${process.env.READ_DB_NAME ?? "grainguard_read"}`; + + const rejectUnauthorized = + process.env.READ_DB_SSL_REJECT_UNAUTHORIZED !== "false"; + + return { + connectionString, + max: 50, + ...(shouldUseInsecureTls(connectionString) + ? { + ssl: { + rejectUnauthorized, + }, + } + : {}), + }; +} + /** Returns true for valid UUID v4 strings — guards against bad JWT claims */ export function isValidUuid(value: unknown): value is string { return typeof value === "string" && UUID_RE.test(value); } -const pool = new Pool({ - connectionString: - process.env.READ_DATABASE_URL || - `postgres://${process.env.READ_DB_USER ?? "postgres"}:${process.env.READ_DB_PASSWORD ?? "postgres"}@${process.env.READ_DB_HOST ?? "postgres-read"}:${process.env.READ_DB_PORT ?? "5432"}/${process.env.READ_DB_NAME ?? "grainguard_read"}`, - max: 50, -}); +const pool = new Pool(buildPoolOptions()); type Row = Record; type QueryResult = import("pg").QueryResult; diff --git a/apps/bff/src/datasources/redis.ts b/apps/bff/src/datasources/redis.ts index 69aa3fc..9400e6b 100644 --- a/apps/bff/src/datasources/redis.ts +++ b/apps/bff/src/datasources/redis.ts @@ -3,6 +3,7 @@ import { createClient, createCluster } from "redis"; // REDIS_CLUSTER_NODES = "redis-cluster-0:6379,redis-cluster-1:6379,..." // When set, uses Redis Cluster. Otherwise falls back to single-node (local dev). const REDIS_CLUSTER_NODES = process.env.REDIS_CLUSTER_NODES; +const REDIS_PASSWORD = process.env.REDIS_PASSWORD; const client = (() => { if (REDIS_CLUSTER_NODES) { @@ -13,7 +14,10 @@ const client = (() => { }; }); console.log(`Redis cluster mode: ${rootNodes.length} nodes`); - return createCluster({ rootNodes }); + return createCluster({ + rootNodes, + defaults: REDIS_PASSWORD ? { password: REDIS_PASSWORD } : undefined, + }); } // Single-node (local dev / docker-compose default) @@ -23,6 +27,7 @@ const client = (() => { host: process.env.REDIS_HOST || "localhost", port: parseInt(process.env.REDIS_PORT || "6379", 10), }, + password: REDIS_PASSWORD || undefined, }); })(); diff --git a/apps/bff/src/server.ts b/apps/bff/src/server.ts index 615eaef..476aa74 100644 --- a/apps/bff/src/server.ts +++ b/apps/bff/src/server.ts @@ -22,13 +22,27 @@ const ISSUER = process.env.JWT_ISSUER!; const AUDIENCE = process.env.JWT_AUDIENCE!; const ALLOWED_ORIGINS = (process.env.ALLOWED_ORIGINS || - "http://localhost:5173,http://localhost:5174,http://localhost:8086").split(","); + "http://localhost:5173,http://localhost:5174,http://localhost:8086") + .split(",") + .map((origin) => origin.trim()); if (!JWKS_URL || !ISSUER || !AUDIENCE) { throw new Error("JWKS_URL, JWT_ISSUER, JWT_AUDIENCE must be set"); } const jwks = createRemoteJWKSet(new URL(JWKS_URL)); +function isAllowedOrigin(origin: string): boolean { + return ALLOWED_ORIGINS.some((allowedOrigin) => { + if (allowedOrigin === origin) return true; + if (!allowedOrigin.includes("*")) return false; + + const pattern = new RegExp( + `^${allowedOrigin.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*")}$` + ); + return pattern.test(origin); + }); +} + async function verifyToken(token: string) { const { payload } = await jwtVerify(token, jwks, { issuer: ISSUER, @@ -133,7 +147,7 @@ async function startServer() { cors({ origin: (origin, callback) => { if (!origin) return callback(null, true); - if (ALLOWED_ORIGINS.includes(origin)) return callback(null, true); + if (isAllowedOrigin(origin)) return callback(null, true); callback(new Error(`CORS: origin ${origin} not allowed`)); }, credentials: true, diff --git a/apps/dashboard/Dockerfile b/apps/dashboard/Dockerfile index b9a24a0..c157e8f 100644 --- a/apps/dashboard/Dockerfile +++ b/apps/dashboard/Dockerfile @@ -8,10 +8,21 @@ ARG VITE_AUTH0_CLIENT_ID ARG VITE_AUTH0_AUDIENCE ARG VITE_BFF_URL ARG VITE_GATEWAY_URL +ARG VITE_ALLOW_INSECURE_AUTH +ARG VITE_INSECURE_TENANT_ID +ENV VITE_AUTH0_DOMAIN=$VITE_AUTH0_DOMAIN +ENV VITE_AUTH0_CLIENT_ID=$VITE_AUTH0_CLIENT_ID +ENV VITE_AUTH0_AUDIENCE=$VITE_AUTH0_AUDIENCE +ENV VITE_BFF_URL=$VITE_BFF_URL +ENV VITE_GATEWAY_URL=$VITE_GATEWAY_URL +ENV VITE_ALLOW_INSECURE_AUTH=$VITE_ALLOW_INSECURE_AUTH +ENV VITE_INSECURE_TENANT_ID=$VITE_INSECURE_TENANT_ID RUN npm run build -FROM nginx:alpine -COPY --from=builder /app/dist /usr/share/nginx/html -COPY apps/dashboard/nginx.conf /etc/nginx/conf.d/default.conf -EXPOSE 80 -CMD ["nginx", "-g", "daemon off;"] +FROM node:20-alpine +WORKDIR /app +RUN npm install -g serve +COPY --from=builder /app/dist ./dist +USER node +EXPOSE 8080 +CMD ["serve", "-s", "dist", "-l", "8080"] diff --git a/apps/dashboard/src/features/billing/BillingPage.tsx b/apps/dashboard/src/features/billing/BillingPage.tsx index 38c6602..98f5ecf 100644 --- a/apps/dashboard/src/features/billing/BillingPage.tsx +++ b/apps/dashboard/src/features/billing/BillingPage.tsx @@ -236,25 +236,32 @@ export function BillingPage() { ))} - + {plan.key === "enterprise" && !isCurrent ? ( + + Contact Sales + + ) : ( + + )} ); })} diff --git a/apps/dashboard/src/features/devices/components/RegisterDeviceModal.tsx b/apps/dashboard/src/features/devices/components/RegisterDeviceModal.tsx index aadddf6..5cb976e 100644 --- a/apps/dashboard/src/features/devices/components/RegisterDeviceModal.tsx +++ b/apps/dashboard/src/features/devices/components/RegisterDeviceModal.tsx @@ -33,6 +33,14 @@ function RegisterDeviceModalContent({ onClose, onRegistered }: Omit window.clearTimeout(focusTimer); }, [reset]); + useEffect(() => { + const onWindowKeyDown = (event: KeyboardEvent) => { + if (event.key === "Escape") onClose(); + }; + window.addEventListener("keydown", onWindowKeyDown); + return () => window.removeEventListener("keydown", onWindowKeyDown); + }, [onClose]); + const validate = (value: string): string | null => { if (!value.trim()) return "Serial number is required"; if (!SERIAL_REGEX.test(value.trim())) @@ -100,8 +108,9 @@ function RegisterDeviceModalContent({ onClose, onRegistered }: Omit { - setSerial(e.target.value); - setValidationError(null); + const nextSerial = e.target.value.toUpperCase(); + setSerial(nextSerial); + setValidationError(nextSerial.trim() ? validate(nextSerial) : null); }} placeholder="e.g. GG-SILO-001" className="w-full px-3 py-2 border rounded-lg text-sm bg-white dark:bg-gray-800 text-gray-900 dark:text-white placeholder-gray-400 focus:outline-none focus:ring-2 focus:ring-green-500 border-gray-300 dark:border-gray-700" @@ -110,7 +119,10 @@ function RegisterDeviceModalContent({ onClose, onRegistered }: Omit {(validationError || error) && ( -

+

{validationError || error}

)} @@ -130,7 +142,7 @@ function RegisterDeviceModalContent({ onClose, onRegistered }: Omit