pahuldeepp · pahuldeepp · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -40,11 +40,11 @@ jobs:
         run: npm run build
         working-directory: apps/dashboard
         env:
-          VITE_AUTH0_DOMAIN:    ${{ secrets.VITE_AUTH0_DOMAIN }}
-          VITE_AUTH0_CLIENT_ID: ${{ secrets.VITE_AUTH0_CLIENT_ID }}
-          VITE_AUTH0_AUDIENCE:  ${{ secrets.VITE_AUTH0_AUDIENCE }}
-          VITE_BFF_URL:         ${{ secrets.E2E_BFF_URL }}
-          VITE_GATEWAY_URL:     ${{ secrets.E2E_GATEWAY_URL }}
+          VITE_AUTH0_DOMAIN:    e2e.auth0.local
+          VITE_AUTH0_CLIENT_ID: e2e-client-id
+          VITE_AUTH0_AUDIENCE:  https://api.grainguard.test
+          VITE_BFF_URL:         http://localhost:4000/graphql
+          VITE_GATEWAY_URL:     http://localhost:3000
 
       - name: Serve dashboard
         run: npx serve -s dist -l 5173 &
@@ -58,8 +58,8 @@ jobs:
         working-directory: tests/e2e
         env:
           E2E_BASE_URL:         http://localhost:5173
-          VITE_AUTH0_CLIENT_ID: ${{ secrets.VITE_AUTH0_CLIENT_ID }}
-          VITE_AUTH0_AUDIENCE:  ${{ secrets.VITE_AUTH0_AUDIENCE }}
+          VITE_AUTH0_CLIENT_ID: e2e-client-id
+          VITE_AUTH0_AUDIENCE:  https://api.grainguard.test
 
       - name: Upload Playwright report
         uses: actions/upload-artifact@v4

diff --git a/.github/workflows/perf.yml b/.github/workflows/perf.yml
@@ -69,6 +69,8 @@ jobs:
         working-directory: apps/gateway
         env:
           PORT: 3000
+          BFF_HOST: localhost
+          BFF_PORT: "4000"
           NODE_ENV: test
           AUTH_ENABLED: "false"
           DATABASE_URL: postgres://grainguard:grainguard@localhost:5432/grainguard
@@ -168,8 +170,17 @@ jobs:
             --env BFF_URL=http://localhost:4000 \
             --env GATEWAY_AUTH_DISABLED=true \
             --env BFF_AUTH_DISABLED=true \
-            --env JWT=dummy-jwt \
+            --env GATEWAY_SAMPLE_PATH=/health \
             --env TEST_DEVICE_ID=00000000-0000-0000-0000-000000000001 \
+            --env BASELINE_RATE=20 \
+            --env BASELINE_DURATION=30s \
+            --env BASELINE_PREALLOCATED_VUS=20 \
+            --env BASELINE_MAX_VUS=40 \
+            --env SPIKE_TARGET=40 \
+            --env SPIKE_RAMP_UP=15s \
+            --env SPIKE_HOLD=15s \
+            --env SPIKE_RAMP_DOWN=15s \
+            --env THINK_TIME_SECONDS=0.05 \
             scripts/load-tests/performance-budget.js
 
       - name: Upload performance results

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 > Production-grade, polyglot microservices SaaS platform for grain and agri operations.
 
-GrainGuard ingests high-volume device telemetry, computes spoilage risk scores, triggers automated alert workflows, and ships with full multi-tenant billing, SSO, team management, audit logging, observability, CI/CD, chaos testing, SLO monitoring, and operational runbooks.
+GrainGuard ingests high-volume device telemetry, computes spoilage risk scores, triggers automated alert workflows, and ships with multi-tenant billing, SSO, team management, audit logging, observability, CI/CD, load testing, and operational runbooks.
 
 ---
 
@@ -81,6 +81,18 @@ Risk Engine (Python) ── Workflow Alerts (Node.js) ── RabbitMQ ── Job
 
 ---
 
+## Current Deployment Status
+
+| Area | State |
+|------|-------|
+| Local Docker stack | ✅ Validated end-to-end |
+| GitOps apps in repo | ✅ `dev`, `staging`, and `prod` ArgoCD apps committed |
+| Terraform environments in repo | ✅ `dev` and `staging` committed |
+| Dedicated staging environment | 🟡 Scaffold committed; deploy/validate next |
+| Production rollout strategy | 🟡 Safe rolling deploys now; canary planned for production |
+
+---
+
 ## SaaS Features
 
 | Feature | Status |
@@ -172,18 +184,22 @@ go run tools/publish-telemetry/main.go
 # Go unit + integration tests
 go test -race -count=1 ./...
 
+# Go lint
+go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.0 run --timeout=5m
+
 # k6 load tests (requires running stack)
 k6 run tests/load/spike.js
 k6 run tests/load/soak.js
 k6 run tests/load/stress.js
 
-# Chaos tests (requires kubectl + live cluster)
-bash tests/chaos/run-all.sh
-
 # Replay + idempotency test
 ./scripts/replay/replay_test.sh
 ```
 
+Note:
+- The core load-test scripts above are committed in `tests/load/`.
+- Cluster-level chaos automation is not currently committed on `master`; add or restore it before relying on README-driven chaos drills.
-# k6 load tests (requires running stack)
-k6 run tests/load/spike.js
-k6 run tests/load/soak.js
-k6 run tests/load/stress.js
-
-# Chaos tests (requires kubectl + live cluster)
-bash tests/chaos/run-all.sh
-
-# Replay + idempotency test
-./scripts/replay/replay_test.sh
-```
-
-Note:
- The core load-test scripts above are committed in `tests/load/`.
- Cluster-level chaos automation is not currently committed on `master`; add or restore it before relying on README-driven chaos drills.
+# k6 load tests (requires running stack)
+k6 run scripts/load-tests/graphql-stress.js
+k6 run scripts/load-tests/ingest-stress.js
+k6 run scripts/load-tests/mixed-stack-stress.js
+
+# Replay + idempotency test
+./scripts/replay/replay_test.sh
-# k6 load tests (requires running stack)
-k6 run tests/load/spike.js
-k6 run tests/load/soak.js
-k6 run tests/load/stress.js
-
-# Chaos tests (requires kubectl + live cluster)
-bash tests/chaos/run-all.sh
-
-# Replay + idempotency test
-./scripts/replay/replay_test.sh
-```
-
-Note:
- The core load-test scripts above are committed in `tests/load/`.
- Cluster-level chaos automation is not currently committed on `master`; add or restore it before relying on README-driven chaos drills.
+# k6 load tests (requires running stack)
+k6 run scripts/load-tests/graphql-stress.js
+k6 run scripts/load-tests/ingest-stress.js
+k6 run scripts/load-tests/mixed-stack-stress.js
+
+# Replay + idempotency test
+./scripts/replay/replay_test.sh
+
 ## Code Review Automation
 
 This repository is preconfigured for CodeRabbit via [`/.coderabbit.yaml`](./.coderabbit.yaml).
@@ -215,35 +231,15 @@ Notes:
 
 ---
 
-## Chaos Testing
-
-Five experiments covering the critical failure modes:
-
-| Experiment | What it kills | Pass condition |
-|------------|--------------|----------------|
-| `pod-kill` | gateway, bff, telemetry-service pods | Respawns within 30s |
-| `kafka-consumer-pause` | read-model-builder + cdc-transformer | Lag ≤ 10 000 within 5 min |
-| `redis-outage` | Redis | BFF falls back to DB, no panics |
-| `projection-lag` | read-model-builder | Alert fires, lag recovers in 5 min |
-| `network-partition` | telemetry-service → Kafka egress | Messages buffered, delivered after heal |
-
-```bash
-# Run all experiments
-bash tests/chaos/run-all.sh
-
-# Or trigger via GitHub Actions (manual dispatch)
-# .github/workflows/chaos.yml — also runs weekly on Saturdays
-```
-
----
-
 ## Operational Runbooks
 
 | Runbook | Trigger |
 |---------|---------|
+| [Postgres Backup / Restore](docs/runbooks/postgres-backup-restore.md) | Backup verification, restore drill, data recovery |
 | [Postgres Failover](docs/runbooks/postgres-failover.md) | Primary down, replica lag high |
 | [Kafka Loss](docs/runbooks/kafka-loss.md) | Broker down, under-replicated partitions |
 | [DLQ Spike](docs/runbooks/dlq-spike.md) | `DLQMessagesAccumulating` alert |
+| [Redis Backup / Restore](docs/runbooks/redis-backup-restore.md) | Cache restore drill, persistence recovery |
 | [Redis Failover](docs/runbooks/redis-failover.md) | Cache miss 100%, lock timeouts |
 | [Projection Lag](docs/runbooks/projection-lag.md) | `ProjectionLagHigh` alert |
 | [gRPC Outage](docs/runbooks/grpc-outage.md) | Circuit breaker open, 503 upstream |
@@ -261,6 +257,8 @@ terraform apply -var="db_password=yourpassword"
 
 Provisions: VPC · EKS · RDS Postgres · Elasticache Redis · MSK Kafka · DynamoDB · ECR · Secrets Manager
 
+Today, `dev` and `staging` Terraform environments are committed in-repo. The next step is to deploy and validate `staging` before treating the rollout path as production-ready.
+
 ---
 
 ## Kubernetes (GitOps)
@@ -278,6 +276,14 @@ helm diff upgrade grainguard k8s/helm/grainguard \
 
 ArgoCD watches `k8s/argocd/apps/` and auto-syncs on every push to master.
 
+Committed applications today:
+- `grainguard-dev` -> `grainguard-dev`
+- `grainguard-staging` -> `grainguard-staging`
+- `grainguard-prod` -> `grainguard-prod`
+
+Recommended next environment:
+- `grainguard-staging`: deploy and validate ingress, TLS, DNS, secrets, restore drills, and production-like auth/billing flows before the first prod rollout
+
 ---
 
 ## Architecture Decision Records
@@ -303,21 +309,26 @@ ArgoCD watches `k8s/argocd/apps/` and auto-syncs on every push to master.
 |-------|------|--------|
 | R1 — Core loop | Ingest, CQRS, outbox, saga | ✅ Done |
 | R2 — CDC + Search | Debezium, Elasticsearch, RabbitMQ | ✅ Done |
-| R3 — Reliability | Helm, ArgoCD, k6 load tests, chaos tests | ✅ Done |
+| R3 — Reliability baseline | Helm, ArgoCD scaffolding, k6 load tests, runbooks | ✅ Done |
 | R4 — Observability | SLOs, burn-rate alerts, Grafana dashboard, runbooks | ✅ Done |
 | R5 — Security | CSRF, rate limiting, audit logging, RBAC, API keys | ✅ Done |
 | R6 — SaaS billing | Stripe, tenant onboarding, team management, SSO, webhooks | ✅ Done |
-| R7 — DB migrations | Flyway/Knex migration framework, schema versioning | 🔜 Next |
-| R8 — Secret management | HashiCorp Vault / AWS Secrets Manager integration | 🔜 Planned |
+| R7 — Staging environment | Dedicated Argo app, Terraform env, deployed validation | 🟡 Scaffolded |
+| R8 — Production hardening | Canary rollout, restore proof, deployed auth/webhook validation | 🔜 Next |
 
 ---
 
-## Load test results
+## Latest Local Validation
+
+Latest mixed read/write validation on `master` (local Docker stack):
 
-- Kafka ingest: **1,700 events/sec**
-- Gateway p95 latency: **5.89ms**
-- Read model builder: **2,500–3,000 events/sec** sustained
+- **35,077** total requests
+- **438 req/s** aggregate throughput
+- **0%** HTTP failure rate
+- Gateway GraphQL p95: **11.5 ms**
+- Ingest p95: **10.8 ms**
+- Kafka consumer groups drained back to **0 lag** after the run
 
 ---
 
-*Built to demonstrate end-to-end DDIA patterns, distributed systems, GitOps, SRE practices, and production multi-tenant SaaS architecture.*
+*Built to demonstrate end-to-end DDIA patterns, distributed systems, GitOps, SRE practices, and production-style multi-tenant SaaS architecture.*
diff --git a/apps/bff/src/datasources/postgres.ts b/apps/bff/src/datasources/postgres.ts
@@ -4,17 +4,38 @@ import { cache } from "./redis";
 
 const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
 
+/**
+ * Reports whether a Postgres connection string requests TLS through an
+ * `sslmode` query parameter of `require`, `verify-ca`, or `verify-full`.
+ *
+ * The match is anchored to a query-parameter boundary (`?` or `&`) so that the
+ * text `sslmode=...` appearing elsewhere in the URL (for example inside a
+ * password) does not switch TLS options on by accident.
+ *
+ * @param connectionString - Connection URL, or undefined when none is configured.
+ * @returns true when the pool should be given explicit `ssl` options.
+ */
+function connectionRequestsTls(connectionString: string | undefined): boolean {
+  if (!connectionString) return false;
+  return /[?&]sslmode=(?:require|verify-ca|verify-full)(?![\w-])/i.test(connectionString);
+}
+
+/**
+ * Builds the options object passed to the read-database `Pool`.
+ *
+ * The connection string comes from `READ_DATABASE_URL`; when that is unset it
+ * is assembled from the `READ_DB_*` variables with local-development defaults.
+ * An `ssl` block is added only when the connection string itself requests TLS.
+ *
+ * @returns Pool options: `connectionString`, `max`, and optionally `ssl`.
+ */
+function buildPoolOptions() {
+  const connectionString =
+    process.env.READ_DATABASE_URL ||
+    `postgres://${process.env.READ_DB_USER ?? "postgres"}:${process.env.READ_DB_PASSWORD ?? "postgres"}@${process.env.READ_DB_HOST ?? "postgres-read"}:${process.env.READ_DB_PORT ?? "5432"}/${process.env.READ_DB_NAME ?? "grainguard_read"}`;
+
+  // Certificate verification stays on unless the variable is the literal "false".
+  const rejectUnauthorized =
+    process.env.READ_DB_SSL_REJECT_UNAUTHORIZED !== "false";
+
+  // NOTE(review): pg appears to merge options parsed from `connectionString`
+  // over explicitly passed ones, so an `sslmode` in the URL may replace this
+  // `ssl` object. Confirm READ_DB_SSL_REJECT_UNAUTHORIZED=false actually takes
+  // effect with the installed pg version.
+  return {
+    connectionString,
+    max: 50,
+    ...(connectionRequestsTls(connectionString)
+      ? { ssl: { rejectUnauthorized } }
+      : {}),
+  };
+}
+
 /** Returns true for UUID-formatted strings (any version) — guards against bad JWT claims */
 export function isValidUuid(value: unknown): value is string {
   return typeof value === "string" && UUID_RE.test(value);
 }
 
-const pool = new Pool({
-  connectionString:
-    process.env.READ_DATABASE_URL ||
-    `postgres://${process.env.READ_DB_USER ?? "postgres"}:${process.env.READ_DB_PASSWORD ?? "postgres"}@${process.env.READ_DB_HOST ?? "postgres-read"}:${process.env.READ_DB_PORT ?? "5432"}/${process.env.READ_DB_NAME ?? "grainguard_read"}`,
-  max: 50,
-});
+const pool = new Pool(buildPoolOptions());
 
 type Row = Record<string, unknown>;
 type QueryResult = import("pg").QueryResult<Row>;

diff --git a/apps/bff/src/datasources/redis.ts b/apps/bff/src/datasources/redis.ts
@@ -3,6 +3,7 @@ import { createClient, createCluster } from "redis";
 // REDIS_CLUSTER_NODES = "redis-cluster-0:6379,redis-cluster-1:6379,..."
 // When set, uses Redis Cluster. Otherwise falls back to single-node (local dev).
 const REDIS_CLUSTER_NODES = process.env.REDIS_CLUSTER_NODES;
+const REDIS_PASSWORD = process.env.REDIS_PASSWORD;
 
 const client = (() => {
   if (REDIS_CLUSTER_NODES) {
@@ -13,7 +14,10 @@ const client = (() => {
       };
     });
     console.log(`Redis cluster mode: ${rootNodes.length} nodes`);
-    return createCluster({ rootNodes });
+    return createCluster({
+      rootNodes,
+      defaults: REDIS_PASSWORD ? { password: REDIS_PASSWORD } : undefined,
+    });
   }
 
   // Single-node (local dev / docker-compose default)
@@ -23,6 +27,7 @@ const client = (() => {
       host: process.env.REDIS_HOST || "localhost",
       port: parseInt(process.env.REDIS_PORT || "6379", 10),
     },
+    password: REDIS_PASSWORD || undefined,
   });
 })();
 

diff --git a/apps/bff/src/server.ts b/apps/bff/src/server.ts
@@ -22,13 +22,27 @@ const ISSUER = process.env.JWT_ISSUER!;
 const AUDIENCE = process.env.JWT_AUDIENCE!;
 const ALLOWED_ORIGINS =
   (process.env.ALLOWED_ORIGINS ||
-    "http://localhost:5173,http://localhost:5174,http://localhost:8086").split(",");
+    "http://localhost:5173,http://localhost:5174,http://localhost:8086")
+    .split(",")
+    .map((origin) => origin.trim());
 if (!JWKS_URL || !ISSUER || !AUDIENCE) {
   throw new Error("JWKS_URL, JWT_ISSUER, JWT_AUDIENCE must be set");
 }
 
 const jwks = createRemoteJWKSet(new URL(JWKS_URL));
 
+/**
+ * Turns one configured origin containing `*` into an anchored matcher:
+ * regex metacharacters are escaped and every `*` becomes `.*`.
+ */
+function compileOriginPattern(allowedOrigin: string): RegExp {
+  const escaped = allowedOrigin.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
+  return new RegExp(`^${escaped.replace(/\*/g, ".*")}$`);
+}
+
+// Wildcard entries are compiled once at module load instead of on every CORS
+// check; the patterns carry no `g` flag, so `test()` keeps no state between calls.
+const WILDCARD_ORIGIN_PATTERNS = ALLOWED_ORIGINS
+  .filter((allowedOrigin) => allowedOrigin.includes("*"))
+  .map(compileOriginPattern);
+
+/**
+ * Decides whether a request origin is permitted by `ALLOWED_ORIGINS`.
+ *
+ * An origin is accepted when it equals a configured entry exactly, or when it
+ * matches a configured entry that contains `*` wildcards.
+ *
+ * NOTE(review): a bare `*` entry compiles to `^.*$` and therefore accepts every
+ * origin; this helper feeds a CORS config that sets `credentials: true`, so
+ * confirm such an entry is never deployed.
+ *
+ * @param origin - Value of the request's Origin header.
+ * @returns true when the origin may be allowed.
+ */
+function isAllowedOrigin(origin: string): boolean {
+  if (ALLOWED_ORIGINS.includes(origin)) return true;
+  return WILDCARD_ORIGIN_PATTERNS.some((pattern) => pattern.test(origin));
+}
+
 async function verifyToken(token: string) {
   const { payload } = await jwtVerify(token, jwks, {
     issuer: ISSUER,
@@ -133,7 +147,7 @@ async function startServer() {
     cors<cors.CorsRequest>({
       origin: (origin, callback) => {
         if (!origin) return callback(null, true);
-        if (ALLOWED_ORIGINS.includes(origin)) return callback(null, true);
+        if (isAllowedOrigin(origin)) return callback(null, true);
         callback(new Error(`CORS: origin ${origin} not allowed`));
       },
       credentials: true,

diff --git a/apps/dashboard/Dockerfile b/apps/dashboard/Dockerfile
@@ -8,10 +8,21 @@ ARG VITE_AUTH0_CLIENT_ID
 ARG VITE_AUTH0_AUDIENCE
 ARG VITE_BFF_URL
 ARG VITE_GATEWAY_URL
+ARG VITE_ALLOW_INSECURE_AUTH
+ARG VITE_INSECURE_TENANT_ID
+ENV VITE_AUTH0_DOMAIN=$VITE_AUTH0_DOMAIN
+ENV VITE_AUTH0_CLIENT_ID=$VITE_AUTH0_CLIENT_ID
+ENV VITE_AUTH0_AUDIENCE=$VITE_AUTH0_AUDIENCE
+ENV VITE_BFF_URL=$VITE_BFF_URL
+ENV VITE_GATEWAY_URL=$VITE_GATEWAY_URL
+ENV VITE_ALLOW_INSECURE_AUTH=$VITE_ALLOW_INSECURE_AUTH
+ENV VITE_INSECURE_TENANT_ID=$VITE_INSECURE_TENANT_ID
 RUN npm run build
 
-FROM nginx:alpine
-COPY --from=builder /app/dist /usr/share/nginx/html
-COPY apps/dashboard/nginx.conf /etc/nginx/conf.d/default.conf
-EXPOSE 80
-CMD ["nginx", "-g", "daemon off;"]
+# Runtime stage: serves the static bundle produced by the builder stage.
+FROM node:20-alpine
+WORKDIR /app
+# Pin the major version so image rebuilds do not silently pick up a new release.
+RUN npm install -g serve@14
+COPY --from=builder /app/dist ./dist
+# Drop root: run as the unprivileged `node` user.
+USER node
+EXPOSE 8080
+# -s serves index.html for unknown paths (single-page app); -l sets the listen port.
+CMD ["serve", "-s", "dist", "-l", "8080"]
diff --git a/apps/dashboard/src/features/billing/BillingPage.tsx b/apps/dashboard/src/features/billing/BillingPage.tsx
@@ -236,25 +236,32 @@ export function BillingPage() {
                   </li>
                 ))}
               </ul>
-              <button
-                onClick={() => handleUpgrade(plan.key)}
-                disabled={loading === plan.key || isCurrent}
-                className={`w-full py-2 px-4 rounded-lg text-sm font-medium transition-colors disabled:opacity-50 disabled:cursor-not-allowed ${
-                  isCurrent
-                    ? "bg-gray-100 dark:bg-gray-800 text-gray-400 dark:text-gray-500 cursor-default"
-                    : plan.highlighted
-                    ? "bg-green-600 text-white hover:bg-green-700"
-                    : "bg-gray-100 dark:bg-gray-800 text-gray-900 dark:text-white hover:bg-gray-200 dark:hover:bg-gray-700"
-                }`}
-              >
-                {loading === plan.key
-                  ? "Redirecting..."
-                  : isCurrent
-                  ? "Current plan"
-                  : plan.key === "enterprise"
-                  ? "Contact Sales"
-                  : "Upgrade"}
-              </button>
+              {plan.key === "enterprise" && !isCurrent ? (
+                // Enterprise is sold via sales, so this opens a pre-filled email
+                // instead of calling handleUpgrade. The space in the subject is
+                // percent-encoded because raw spaces are not valid in a mailto URI.
+                <a
+                  href="mailto:sales@grainguard.com?subject=Enterprise%20Plan"
+                  className="block w-full py-2 px-4 rounded-lg text-sm font-medium text-center transition-colors bg-gray-100 dark:bg-gray-800 text-gray-900 dark:text-white hover:bg-gray-200 dark:hover:bg-gray-700"
+                >
+                  Contact Sales
+                </a>
+              ) : (
+                <button
+                  onClick={() => handleUpgrade(plan.key)}
+                  disabled={loading === plan.key || isCurrent}
+                  className={`w-full py-2 px-4 rounded-lg text-sm font-medium transition-colors disabled:opacity-50 disabled:cursor-not-allowed ${
+                    isCurrent
+                      ? "bg-gray-100 dark:bg-gray-800 text-gray-400 dark:text-gray-500 cursor-default"
+                      : plan.highlighted
+                      ? "bg-green-600 text-white hover:bg-green-700"
+                      : "bg-gray-100 dark:bg-gray-800 text-gray-900 dark:text-white hover:bg-gray-200 dark:hover:bg-gray-700"
+                  }`}
+                >
+                  {loading === plan.key
+                    ? "Redirecting..."
+                    : isCurrent
+                    ? "Current plan"
+                    : "Upgrade"}
+                </button>
+              )}
             </div>
           );
         })}