diff --git a/platform/README.md b/platform/README.md new file mode 100644 index 00000000000..5d8538e345e --- /dev/null +++ b/platform/README.md @@ -0,0 +1,112 @@ +# Platform Infrastructure + +Terraform configuration for deploying Vellum Assistant to GKE. + +## Architecture + +``` + ┌─────────────────┐ + │ assistant. │ + │ vellum.ai │ + └────────┬────────┘ + │ + ┌────────▼────────┐ + │ Cloud Load │ + │ Balancer │ + │ (Static IP) │ + └────────┬────────┘ + │ + ┌────────▼────────┐ + │ GKE Ingress │ + │ (SSL/TLS) │ + └────────┬────────┘ + │ + ┌──────────────┼──────────────┐ + │ │ │ + ┌─────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐ + │ Pod 1 │ │ Pod 2 │ │ Pod N │ + │ Next.js │ │ Next.js │ │ Next.js │ + └───────────┘ └───────────┘ └───────────┘ +``` + +## Prerequisites + +1. GCP project with billing enabled +2. `gcloud` CLI authenticated +3. Terraform >= 1.0 +4. Existing GKE cluster (or set `create_cluster = true`) + +## Quick Start + +```bash +cd terraform + +# Copy and edit variables +cp terraform.tfvars.example terraform.tfvars + +# Set sensitive variables via environment +export TF_VAR_database_url="postgresql://..." +export TF_VAR_anthropic_api_key="sk-ant-..." + +# Initialize Terraform +terraform init + +# Plan changes +terraform plan + +# Apply +terraform apply +``` + +## DNS Setup + +After applying, Terraform prints the static IP as the `ingress_ip` output. Create an A record pointing the domain at it: + +``` +assistant.vellum.ai -> INGRESS_IP +``` + +The managed SSL certificate will auto-provision once DNS propagates. + +## Building the Docker Image + +```bash +cd ../web + +# Build +docker build -t gcr.io/PROJECT_ID/vellum-assistant:latest . 
+ +# Push +docker push gcr.io/PROJECT_ID/vellum-assistant:latest +``` + +## Files + +- `main.tf` - GCP provider, GKE cluster, static IP, SSL cert +- `k8s.tf` - Kubernetes namespace, config map, secret, deployment, service, ingress +- `variables.tf` - Input variables +- `outputs.tf` - Useful outputs +- `terraform.tfvars.example` - Example configuration + +## Using Existing Cluster + +If you have an existing GKE cluster: + +```hcl +create_cluster = false +cluster_name = "your-existing-cluster" +``` + +Terraform then deploys the app to the existing cluster. + +## Creating New Cluster + +```hcl +create_cluster = true +cluster_name = "vellum-assistant" +``` + +This creates a private GKE cluster with: +- Workload Identity enabled +- Autoscaling node pool (1-3 nodes per zone) +- Network policy enabled diff --git a/platform/terraform/k8s.tf b/platform/terraform/k8s.tf new file mode 100644 index 00000000000..9b5f5a30ecd --- /dev/null +++ b/platform/terraform/k8s.tf @@ -0,0 +1,184 @@ +# Kubernetes Namespace +resource "kubernetes_namespace" "vellum_assistant" { + metadata { + name = "vellum-assistant" + + labels = { + app = "vellum-assistant" + env = var.environment + } + } +} + +# ConfigMap for non-sensitive config +resource "kubernetes_config_map" "app_config" { + metadata { + name = "vellum-assistant-config" + namespace = kubernetes_namespace.vellum_assistant.metadata[0].name + } + + data = { + NODE_ENV = "production" + } +} + +# Secret for sensitive values +resource "kubernetes_secret" "app_secrets" { + metadata { + name = "vellum-assistant-secrets" + namespace = kubernetes_namespace.vellum_assistant.metadata[0].name + } + + data = { + DATABASE_URL = var.database_url + ANTHROPIC_API_KEY = var.anthropic_api_key + } + + type = "Opaque" +} + +# Deployment +resource "kubernetes_deployment" "app" { + metadata { + name = "vellum-assistant" + namespace = kubernetes_namespace.vellum_assistant.metadata[0].name + + labels = { + app = "vellum-assistant" + } + } + + spec { + replicas = var.app_replicas + + selector { + 
match_labels = { + app = "vellum-assistant" + } + } + + template { + metadata { + labels = { + app = "vellum-assistant" + } + } + + spec { + container { + name = "web" + image = var.app_image + + port { + container_port = 3000 # same port the Service target_port and both probes use + } + + env_from { + config_map_ref { + name = kubernetes_config_map.app_config.metadata[0].name + } + } + + env_from { + secret_ref { + name = kubernetes_secret.app_secrets.metadata[0].name + } + } + + resources { + requests = { + cpu = "100m" + memory = "256Mi" + } + limits = { + cpu = "500m" + memory = "512Mi" + } + } + + liveness_probe { + http_get { + path = "/api/health" # route added in web/src/app/api/health/route.ts + port = 3000 + } + initial_delay_seconds = 30 + period_seconds = 10 + } + + readiness_probe { + http_get { + path = "/api/health" # same endpoint as the liveness probe, polled sooner and more often + port = 3000 + } + initial_delay_seconds = 5 + period_seconds = 5 + } + } + } + } + } +} + +# Service +resource "kubernetes_service" "app" { + metadata { + name = "vellum-assistant" + namespace = kubernetes_namespace.vellum_assistant.metadata[0].name + + annotations = { # NEG annotation: lets the GKE Ingress use container-native load balancing + "cloud.google.com/neg" = jsonencode({ + ingress = true + }) + } + } + + spec { + selector = { + app = "vellum-assistant" + } + + port { + port = 80 + target_port = 3000 + } + + type = "ClusterIP" # not exposed on its own; external traffic arrives through the Ingress below + } +} + +# Ingress with Google-managed SSL +resource "kubernetes_ingress_v1" "app" { + metadata { + name = "vellum-assistant" + namespace = kubernetes_namespace.vellum_assistant.metadata[0].name + + annotations = { + "kubernetes.io/ingress.class" = "gce" + "kubernetes.io/ingress.global-static-ip-name" = google_compute_global_address.ingress_ip.name + "ingress.gcp.kubernetes.io/pre-shared-cert" = google_compute_managed_ssl_certificate.default.name + "kubernetes.io/ingress.allow-http" = "false" # HTTPS only + } + } + + spec { + rule { + host = var.domain + + http { + path { + path = "/*" # GKE Ingress wildcard form, used with the ImplementationSpecific path type + path_type = "ImplementationSpecific" + + backend { + service { + name = kubernetes_service.app.metadata[0].name + port { + number = 80 + } + } + } + } + } + } + } +} diff --git a/platform/terraform/main.tf 
b/platform/terraform/main.tf new file mode 100644 index 00000000000..b98336d5c70 --- /dev/null +++ b/platform/terraform/main.tf @@ -0,0 +1,138 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.25" + } + } + + # TODO: Configure remote state backend (until then state is local, and it holds the secret values passed to kubernetes_secret) + # backend "gcs" { + # bucket = "vellum-terraform-state" + # prefix = "vellum-assistant" + # } +} + +provider "google" { + project = var.project_id + region = var.region +} + +# GKE Cluster +resource "google_container_cluster" "main" { + count = var.create_cluster ? 1 : 0 # created only when create_cluster is true; otherwise an existing cluster is looked up by name + + name = var.cluster_name + location = var.region + + # We can't create a cluster with no node pool, but we want to use + # separately managed node pools. So we create the smallest possible + # default pool and immediately delete it. + remove_default_node_pool = true + initial_node_count = 1 + + # Enable Workload Identity + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" + } + + # Network config + network = var.network + subnetwork = var.subnetwork + + # Enable network policy + network_policy { + enabled = true + } + + # Private cluster config + private_cluster_config { + enable_private_nodes = true + enable_private_endpoint = false # nodes are private, but the control plane keeps a public endpoint + master_ipv4_cidr_block = "172.16.0.0/28" + } + + # Master authorized networks + master_authorized_networks_config { + cidr_blocks { + cidr_block = "0.0.0.0/0" # NOTE(review): admits every IPv4 address to the public control-plane endpoint; narrow before production + display_name = "All" + } + } +} + +# Node Pool +resource "google_container_node_pool" "primary" { + count = var.create_cluster ? 
1 : 0 + + name = "primary-pool" + location = var.region + cluster = google_container_cluster.main[0].name + initial_node_count = var.node_count # starting size only; a fixed node_count would be re-applied on every run and undo the autoscaler + + node_config { + machine_type = var.machine_type + disk_size_gb = 50 + disk_type = "pd-standard" + + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + workload_metadata_config { + mode = "GKE_METADATA" + } + + labels = { + env = var.environment + } + + tags = ["gke-node", var.cluster_name] + } + + autoscaling { + min_node_count = 1 + max_node_count = var.max_node_count + } + + management { + auto_repair = true + auto_upgrade = true + } +} + +# Get cluster credentials for kubernetes provider +data "google_container_cluster" "main" { + name = var.create_cluster ? google_container_cluster.main[0].name : var.cluster_name + location = var.region + + depends_on = [google_container_cluster.main] # read only after the cluster exists when it is created in this run +} + +data "google_client_config" "default" {} + +provider "kubernetes" { # NOTE(review): with create_cluster = true this provider is configured from a cluster created in the same apply; confirm a first apply works end to end + host = "https://${data.google_container_cluster.main.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.main.master_auth[0].cluster_ca_certificate) +} + +# Static IP for Ingress +resource "google_compute_global_address" "ingress_ip" { + name = "vellum-assistant-ip" # referenced by name from the Ingress global-static-ip-name annotation in k8s.tf +} + +# Managed SSL Certificate +resource "google_compute_managed_ssl_certificate" "default" { + name = "vellum-assistant-cert" + + managed { + domains = [var.domain] + } +} diff --git a/platform/terraform/outputs.tf b/platform/terraform/outputs.tf new file mode 100644 index 00000000000..47b40bf47f9 --- /dev/null +++ b/platform/terraform/outputs.tf @@ -0,0 +1,30 @@ +output "cluster_name" { + description = "GKE cluster name" + value = var.create_cluster ? 
google_container_cluster.main[0].name : var.cluster_name +} + +output "cluster_endpoint" { + description = "GKE cluster endpoint" + value = data.google_container_cluster.main.endpoint + sensitive = true # redacted in plan/apply output +} + +output "ingress_ip" { + description = "Static IP address for the ingress" + value = google_compute_global_address.ingress_ip.address +} + +output "domain" { + description = "Application domain" + value = var.domain +} + +output "dns_record" { + description = "DNS A record to create" + value = "Create an A record: ${var.domain} -> ${google_compute_global_address.ingress_ip.address}" # human-readable hint; automation should read ingress_ip instead +} + +output "namespace" { + description = "Kubernetes namespace" + value = kubernetes_namespace.vellum_assistant.metadata[0].name +} diff --git a/platform/terraform/terraform.tfvars.example b/platform/terraform/terraform.tfvars.example new file mode 100644 index 00000000000..1203893fdfe --- /dev/null +++ b/platform/terraform/terraform.tfvars.example @@ -0,0 +1,18 @@ +# Copy to terraform.tfvars and fill in values + +project_id = "your-gcp-project-id" +region = "us-central1" + +# Set to true to create a new GKE cluster +# Set to false to use an existing cluster +create_cluster = false +cluster_name = "existing-cluster-name" + +# Application +domain = "assistant.vellum.ai" +app_image = "gcr.io/your-project/vellum-assistant:latest" +app_replicas = 2 + +# Secrets: do not put them in this file; export them in your shell before running terraform: +# export TF_VAR_database_url="postgresql://..." +# export TF_VAR_anthropic_api_key="sk-ant-..." 
diff --git a/platform/terraform/variables.tf b/platform/terraform/variables.tf new file mode 100644 index 00000000000..833d08d96bc --- /dev/null +++ b/platform/terraform/variables.tf @@ -0,0 +1,90 @@ +variable "project_id" { + description = "GCP Project ID" + type = string +} + +variable "region" { + description = "GCP region" + type = string + default = "us-central1" +} + +variable "environment" { + description = "Environment name (dev, staging, prod)" + type = string + default = "prod" +} + +# Cluster settings +variable "create_cluster" { + description = "Whether to create a new GKE cluster or use existing" + type = bool + default = false # by default, deploy into the existing cluster named by cluster_name +} + +variable "cluster_name" { + description = "GKE cluster name" + type = string + default = "vellum-assistant" +} + +variable "network" { + description = "VPC network name" + type = string + default = "default" +} + +variable "subnetwork" { + description = "VPC subnetwork name" + type = string + default = "default" +} + +variable "machine_type" { + description = "Node machine type" + type = string + default = "e2-medium" +} + +variable "node_count" { + description = "Initial number of nodes per zone" + type = number + default = 1 +} + +variable "max_node_count" { + description = "Maximum number of nodes per zone for autoscaling" + type = number + default = 3 +} + +# Application settings +variable "domain" { + description = "Domain for the application" + type = string + default = "assistant.vellum.ai" +} + +variable "app_image" { + description = "Docker image for the Next.js app" + type = string + default = "gcr.io/PROJECT_ID/vellum-assistant:latest" # placeholder, not a pullable image; set app_image in terraform.tfvars +} + +variable "app_replicas" { + description = "Number of app replicas" + type = number + default = 2 +} + +variable "database_url" { + description = "PostgreSQL connection string" + type = string + sensitive = true # required (no default); supply via TF_VAR_database_url +} + +variable "anthropic_api_key" { + description = "Anthropic API key" + type = string + sensitive = true # required (no default); supply via TF_VAR_anthropic_api_key +} diff --git a/web/.dockerignore b/web/.dockerignore new 
file mode 100644 index 00000000000..c16b34b2c41 --- /dev/null +++ b/web/.dockerignore @@ -0,0 +1,7 @@ +node_modules +.next +.git +*.md +.env* +.dockerignore +Dockerfile diff --git a/web/Dockerfile b/web/Dockerfile new file mode 100644 index 00000000000..b6713b2704c --- /dev/null +++ b/web/Dockerfile @@ -0,0 +1,41 @@ +# Build stage +FROM node:20-alpine AS builder + +WORKDIR /app + +# Install dependencies +COPY package*.json ./ +RUN npm ci + +# Copy source +COPY . . + +# Build +ENV NEXT_TELEMETRY_DISABLED=1 +RUN npm run build + +# Production stage +FROM node:20-alpine AS runner + +WORKDIR /app + +ENV NODE_ENV=production +ENV NEXT_TELEMETRY_DISABLED=1 + +# Create non-root user +RUN addgroup --system --gid 1001 nodejs +RUN adduser --system --uid 1001 nextjs + +# Copy built assets +COPY --from=builder /app/public ./public +COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./ +COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static + +USER nextjs + +EXPOSE 3000 + +ENV PORT=3000 +ENV HOSTNAME="0.0.0.0" + +CMD ["node", "server.js"] diff --git a/web/next.config.ts b/web/next.config.ts index 3bf19cf2bd3..68a6c64d279 100644 --- a/web/next.config.ts +++ b/web/next.config.ts @@ -1,9 +1,11 @@ import type { NextConfig } from "next"; const nextConfig: NextConfig = { + // Standalone output copies only traced files into .next/standalone, so the agent-templates include below must stay. + output: "standalone", outputFileTracingIncludes: { "/api/agents": ["./agent-templates/**/*"], }, }; export default nextConfig; diff --git a/web/src/app/api/health/route.ts b/web/src/app/api/health/route.ts new file mode 100644 index 00000000000..95dfe595044 --- /dev/null +++ b/web/src/app/api/health/route.ts @@ -0,0 +1,9 @@ +import { NextResponse } from "next/server"; + +/** Health endpoint polled by the liveness and readiness probes in platform/terraform/k8s.tf. */ +export async function GET() { + return NextResponse.json({ + status: "healthy", + timestamp: new Date().toISOString(), + }); +}