diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index a97049e760..0000000000 --- a/CLAUDE.md +++ /dev/null @@ -1,631 +0,0 @@ -# CLAUDE.md - - - - - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - - - - - -## Commands - - - - - -### Development Setup - - - -```bash - - - -# Install dependencies - - - -pnpm install - - - - - -# Start local development environment (includes databases) - - - -make up - - - - - -# Run development servers - - - -pnpm dev - - - -``` - - - - - -### Build and Testing - - - -```bash - - - -# Build all packages - - - -pnpm build - - - - - -# Format and lint code - - - -pnpm fmt - - - - - -# Run tests (with concurrency control) - - - -pnpm test - - - - - -# Run integration tests (requires local environment) - - - -make integration - - - - - -# Type checking - - - -pnpm typecheck # Check individual app package.json for specific commands - - - -``` - - - - - -### Database Operations - - - -```bash - - - -# Run database migrations - - - -pnpm migrate - - - - - -# Generate database schema changes - - - -make generate-sql - - - - - -# ClickHouse migrations - - - -make migrate-clickhouse - - - -make migrate-clickhouse-reset # Reset ClickHouse schema - - - -``` - - - - - -### Go-Specific Commands - - - -```bash - - - -# Go services (in /go directory) - - - -go test ./... - - - -go build ./cmd/... - - - - - -# Deploy services (in /go/deploy directory) - - - -make build # Test binary builds - - - -make install # Build and install with systemd units - - - -``` - - - - - -## Architecture Overview - - - - - -Unkey is a monorepo containing both TypeScript/Node.js and Go services for API key management, authentication, and distributed rate limiting. 
- - - - - -### Core Applications - - - - - -**Dashboard** (`apps/dashboard/`) - - - -- Next.js web interface for API key management - - - -- Built with React, TailwindCSS, and tRPC - - - -- Authentication via WorkOS - - - - - -**API** (`apps/api/`) - - - -- Cloudflare Workers-based API for key verification - - - -- Uses Hono framework with OpenAPI validation - - - -- Handles key CRUD operations and rate limiting - - - - - -**Agent** (`apps/agent/`) - - - -- Go-based distributed rate limiting service - - - -- Uses Serf for clustering and gossip protocol - - - -- Implements sliding window rate limiting algorithm - - - - - -**Go Services** (`go/`) - - - -- **API**: Main HTTP API server (port 7070) - - - -- **Ctrl**: Control plane for infrastructure management (port 8080) - - - -- **Deploy services**: VM lifecycle management (metald, builderd, etc.) - - - - - -### Database Architecture - - - - - -**MySQL/PlanetScale**: Primary relational data - - - -- Tables: workspaces, apis, keys, permissions, roles, identities - - - -- Drizzle ORM with type safety - - - -- Read replica support - - - - - -**ClickHouse**: Analytics and time-series data - - - -- Verification metrics and rate limiting statistics - - - -- Schema in `internal/clickhouse/schema/` - - - - - -**Redis/Upstash**: Distributed caching and rate limiting state - - - -- Multi-tier caching strategy - - - -- Real-time rate limit counters - - - - - -### Shared Packages (`internal/`) - - - - - -Key packages for cross-app functionality: - - - -- `@unkey/db`: Database schemas and connections - - - -- `@unkey/cache`: Multi-tier caching implementation - - - -- `@unkey/encryption`: AES-GCM encryption utilities - - - -- `@unkey/keys`: Key generation and validation - - - -- `@unkey/ui`: Shared React components - - - -- `@unkey/validation`: Zod schema definitions - - - - - -## Development Guidelines - - - - - -### Code Standards - - - - - -**TypeScript/JavaScript**: - - - -- Use Biome for formatting and linting - - - -- Prefer 
named exports over default exports (except Next.js pages) - - - -- Follow strict TypeScript configuration - - - -- Use Zod for runtime validation - - - - - -**Go**: - - - -- Follow comprehensive documentation guidelines (see `go/GO_DOCUMENTATION_GUIDELINES.md`) - - - -- Every public function/type must be documented - - - -- Use `doc.go` files for package documentation - - - -- Prefer interfaces for testability - - - - - -### Testing Patterns - - - - - -**TypeScript**: - - - -- Vitest for unit and integration tests - - - -- Separate configs: `vitest.unit.ts`, `vitest.integration.ts` - - - -- Integration harness for API testing - - - - - -**Go**: - - - -- Table-driven tests - - - -- Integration tests with real dependencies - - - -- Test organization by HTTP status codes - - - - - -### Environment Variables - - - - - -All environment variables must follow the format: `UNKEY__VARNAME` - - - - - -### Key Patterns - - - - - -**Authentication/Authorization**: - - - -- Root keys for API access with granular permissions - - - -- Workspace-based multi-tenancy - - - -- RBAC with role inheritance - - - - - -**Rate Limiting**: - - - -- Distributed consensus for accuracy - - - -- Sliding window algorithm - - - -- Override capabilities for specific identifiers - - - - - -**Error Handling**: - - - -- Consistent error types with proper HTTP status codes - - - -- Structured error responses following OpenAPI spec - - - -- Circuit breaker patterns for external dependencies - - - - - -**Caching**: - - - -- Multi-tier strategy (Memory → Redis → Database) - - - -- Stale-while-revalidate pattern - - - -- Namespace-based cache invalidation - - - - - -## Important Files - - - - - -- `turbo.json`: Monorepo build configuration - - - -- `biome.json`: Code formatting and linting rules - - - -- `package.json`: Root package with workspace scripts - - - -- `vitest.workspace.json`: Test workspace configuration - - - -- `go/GO_DOCUMENTATION_GUIDELINES.md`: Go code documentation standards - - - -- 
`go/deploy/CLAUDE.md`: Additional rules for deploy services - - - - - -## Development Tips - - - - - -1. **Database Changes**: Use Drizzle migrations, not manual SQL - - - -2. **Testing**: Run integration tests locally with `make integration` - - - -3. **Go Services**: Use `AIDEV-*` comments for complex/important code - - - -4. **Performance**: Prioritize reliability over performance - - - -5. **Security**: Never commit secrets or expose sensitive data in logs - -6. **Build**: run the linter and pnpm build after all TODOs \ No newline at end of file diff --git a/go/Makefile b/go/Makefile index 1aac885fa7..ad10d9ebc5 100644 --- a/go/Makefile +++ b/go/Makefile @@ -52,6 +52,9 @@ generate: buf generate go generate ./... +generate-builder: + buf generate --path proto/deploy/builderd + test: test-unit test-unit: up diff --git a/go/deploy/CLAUDE.md b/go/deploy/CLAUDE.md deleted file mode 100644 index ad27e3a010..0000000000 --- a/go/deploy/CLAUDE.md +++ /dev/null @@ -1,45 +0,0 @@ -# Rules for AI -- **Never** delete anything from this file. -- All text, ASCII, and code files MUST end with a newline. -- Use `AIDEV-NOTE:`, `AIDEV-TODO:`, `AIDEV-BUSINESS_RULE:`, or `AIDEV-QUESTION:` (all-caps prefix) as anchor comments aimed at AI and developers. - * **Important:** Before scanning files, always first try to **grep for existing anchors** `AIDEV-*` in relevant subdirectories. - * **Update relevant anchors** when modifying associated code. - * **Do not remove `AIDEV-*`s** without explicit human instruction. -- Make sure to add relevant anchor comments, whenever a file or piece of code is: - * too complex, or - * very important, or - * confusing, or - * could have a bug -- **Never** take shortcuts. Ask the user if they want to take a shortcut. -- **Always** leave the codebase better tested, better documented, and easier to work with for the next developer. -- All environment variables **MUST** follow the format UNKEY__VARNAME -- **Always** prioritize reliability over performance. 
-- **Never** use `go build` for any of the `assetmanagerd`, `billaged`, `builderd`, `metald` binaries. -- Use `make build` to test that the binary builds. -- Use `make install` to build and install the binary w/systemd unit from `$SERVICE/contrib/systemd` -- When a service's `*.go` code changes significantly, increase the patch-level version number. - -# Service folder structure - -The root implied here is `deploy/` - -- Grafana dashboards: `/contrib/grafana-dashboards` -- Systemd unit files etc: `/contrib/systemd` -- Build artifact directory: `/build` -- Documentation: `/docs` -- Service-level makefile: `/Makefile` -- Global makefile: `Makefile` -- Service binary code: `/cmd/` - -# Service Pillars - -Four services make up the pillars of "Unkey Deploy" - -- assetmanagerd -- billaged -- builderd -- metald - -# SIFFE/Spire - -Spire handles mTLS for all service communication diff --git a/go/deploy/assetmanagerd/Makefile b/go/deploy/assetmanagerd/Makefile index dd839d0ec3..f598c05814 100644 --- a/go/deploy/assetmanagerd/Makefile +++ b/go/deploy/assetmanagerd/Makefile @@ -36,11 +36,11 @@ GOLINT := golangci-lint .DEFAULT_GOAL := help # Targets (alphabetically ordered) -.PHONY: all build check clean create-user deps dev env-example fmt generate help install lint lint-proto run service-logs service-logs-tail service-restart service-start service-status service-stop setup test test-coverage uninstall version vet +.PHONY: all build check clean deps dev env-example fmt generate help install lint lint-proto run service-logs service-logs-tail service-restart service-start service-status service-stop setup test test-coverage uninstall version vet -all: clean generate build ## Clean, generate, and build +all: clean build ## Clean, generate, and build -build: generate deps ## Build the binary +build: deps ## Build the binary @mkdir -p $(BUILD_DIR) @$(GOBUILD) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/$(SERVICE_NAME) @@ -50,9 +50,6 @@ clean: ## Clean build artifacts @rm -rf 
$(BUILD_DIR) @rm -rf $(GEN_DIR) -create-user: ## Create service user - @sudo useradd -r -s /bin/false -d $(DATA_DIR) -c "$(SERVICE_NAME) service user" $(SERVICE_NAME) 2>/dev/null || true - deps: ## Download and tidy dependencies @go mod download @go mod tidy @@ -60,29 +57,20 @@ deps: ## Download and tidy dependencies dev: generate ## Run in development mode @go run ./cmd/$(SERVICE_NAME) -env-example: ## Show example environment variables - @echo "Example environment variables for $(SERVICE_NAME):" - @cat .env.example 2>/dev/null || echo "Error: .env.example not found" - fmt: ## Format code @$(GOFMT) -w . -generate: ## Generate protobuf code - @buf generate - @buf lint - help: ## Show this help message @echo 'Usage: make [target]' @echo '' @echo 'Targets:' @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST) -install: build create-user ## Install the service (requires sudo) +install: build ## Install the service (requires sudo) @sudo systemctl stop $(SERVICE_NAME) 2>/dev/null || true @sudo mkdir -p $(CONFIG_DIR) @sudo cp $(BUILD_DIR)/$(BINARY_NAME) $(INSTALL_DIR)/ @sudo chmod +x $(INSTALL_DIR)/$(BINARY_NAME) - @sudo chown $(SERVICE_NAME):$(SERVICE_NAME) $(CONFIG_DIR) @sudo cp contrib/systemd/$(SERVICE_NAME).service $(SYSTEMD_DIR)/ @sudo systemctl daemon-reload @sudo systemctl enable $(SERVICE_NAME) >/dev/null 2>&1 diff --git a/go/deploy/assetmanagerd/contrib/systemd/assetmanagerd.service b/go/deploy/assetmanagerd/contrib/systemd/assetmanagerd.service index fc16c94cb0..c0750e952b 100644 --- a/go/deploy/assetmanagerd/contrib/systemd/assetmanagerd.service +++ b/go/deploy/assetmanagerd/contrib/systemd/assetmanagerd.service @@ -1,8 +1,9 @@ [Unit] Description=AssetManagerd VM Asset Management Service Documentation=https://github.com/unkeyed/unkey/go/deploy/assetmanagerd -After=network.target +After=network.target spire-agent.service Wants=network.target +Requires=spire-agent.service [Service] Type=simple diff --git 
a/go/deploy/assetmanagerd/go.mod b/go/deploy/assetmanagerd/go.mod index 30c608b12b..583f8c9d7a 100644 --- a/go/deploy/assetmanagerd/go.mod +++ b/go/deploy/assetmanagerd/go.mod @@ -52,7 +52,7 @@ require ( google.golang.org/genproto/googleapis/api v0.0.0-20250707201910-8d1bb00bc6a7 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect google.golang.org/grpc v1.73.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) replace github.com/unkeyed/unkey/go/deploy/pkg/tls => ../pkg/tls diff --git a/go/deploy/assetmanagerd/go.sum b/go/deploy/assetmanagerd/go.sum index 993cbc7298..0394b490b2 100644 --- a/go/deploy/assetmanagerd/go.sum +++ b/go/deploy/assetmanagerd/go.sum @@ -94,7 +94,7 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/go/deploy/assetmanagerd/internal/service/service.go b/go/deploy/assetmanagerd/internal/service/service.go index b8dd3911a6..d74ddf38ff 100644 --- a/go/deploy/assetmanagerd/internal/service/service.go +++ b/go/deploy/assetmanagerd/internal/service/service.go @@ -11,12 +11,11 @@ import ( "time" 
"connectrpc.com/connect" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" "github.com/unkeyed/unkey/go/deploy/assetmanagerd/internal/builderd" "github.com/unkeyed/unkey/go/deploy/assetmanagerd/internal/config" "github.com/unkeyed/unkey/go/deploy/assetmanagerd/internal/registry" "github.com/unkeyed/unkey/go/deploy/assetmanagerd/internal/storage" - "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" @@ -1038,11 +1037,6 @@ func (s *Service) QueryAssets( tenantID := buildOpts.GetTenantId() customerID := "cli-user" // Default fallback - // Try to extract tenant context for proper customer ID - if tenantCtx, ok := interceptors.TenantFromContext(ctx); ok && tenantCtx.CustomerID != "" { - customerID = tenantCtx.CustomerID - } - buildID, err := s.builderdClient.BuildDockerRootfsWithOptions(buildCtx, dockerImage, buildLabels, tenantID, customerID) if err != nil { buildSpan.RecordError(err) diff --git a/go/deploy/billaged/contrib/systemd/billaged.service b/go/deploy/billaged/contrib/systemd/billaged.service index 11efa08938..3639c302c6 100644 --- a/go/deploy/billaged/contrib/systemd/billaged.service +++ b/go/deploy/billaged/contrib/systemd/billaged.service @@ -1,8 +1,9 @@ [Unit] Description=Billaged VM Usage Billing Service Documentation=https://github.com/unkeyed/unkey/go/deploy/billaged -After=network.target +After=network.target spire-agent.service assetmanagerd.service metald.service Wants=network.target +Requires=spire-agent.service assetmanagerd.service metald.service [Service] Type=simple diff --git a/go/deploy/builderd/Makefile b/go/deploy/builderd/Makefile index f6f044ec13..747c43949f 100644 --- a/go/deploy/builderd/Makefile +++ b/go/deploy/builderd/Makefile @@ -35,11 +35,11 @@ GOLINT := golangci-lint .DEFAULT_GOAL := help # Targets (alphabetically ordered) 
-.PHONY: all build build-linux check ci clean clean-gen create-user debug deps dev env-example fmt generate help install install-tools lint lint-proto proto-breaking quick-build quick-test release run service-logs service-logs-tail service-restart service-start service-status service-stop setup test test-coverage uninstall version vet +.PHONY: all build build-linux check ci clean clean-gen debug deps dev env-example fmt generate help install install-tools lint lint-proto proto-breaking quick-build quick-test release run service-logs service-logs-tail service-restart service-start service-status service-stop setup test test-coverage uninstall version vet all: clean generate build ## Clean, generate, and build -build: generate deps ## Build the binary +build: deps ## Build the binary @mkdir -p $(BUILD_DIR) @$(GOBUILD) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/$(SERVICE_NAME) @@ -56,12 +56,6 @@ clean: ## Clean build artifacts @rm -rf $(GEN_DIR) @rm -f coverage.out coverage.html -clean-gen: ## Clean generated protobuf code - @rm -rf $(GEN_DIR) - -create-user: ## Create service user - @sudo useradd -r -s /bin/false -d /opt/builderd -c "$(SERVICE_NAME) service user" $(SERVICE_NAME) 2>/dev/null || true - debug: build ## Run with debug logging @UNKEY_BUILDERD_LOG_LEVEL=debug ./$(BUILD_DIR)/$(BINARY_NAME) @@ -72,13 +66,6 @@ deps: ## Download and tidy dependencies dev: ## Run in development mode @go run ./cmd/$(SERVICE_NAME) -env-example: ## Show example environment variables - @echo "Example environment variables for $(SERVICE_NAME):" - @echo "UNKEY_BUILDERD_PORT=8082" - @echo "UNKEY_BUILDERD_OTEL_ENABLED=false" - @echo "UNKEY_BUILDERD_STORAGE_BACKEND=local" - @echo "UNKEY_BUILDERD_LOG_LEVEL=info" - fmt: ## Format code @$(GOFMT) -w . @which goimports >/dev/null && goimports -w . 
|| echo "goimports not found, install with: go install golang.org/x/tools/cmd/goimports@latest" @@ -93,29 +80,17 @@ help: ## Show this help message @echo 'Targets:' @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST) -install: build create-user ## Install the service (requires sudo) +install: build ## Install the service (requires sudo) @sudo systemctl stop $(SERVICE_NAME) 2>/dev/null || true @sudo mkdir -p $(CONFIG_DIR) @sudo cp $(BUILD_DIR)/$(BINARY_NAME) $(INSTALL_DIR)/ @sudo chmod +x $(INSTALL_DIR)/$(BINARY_NAME) - @sudo chown $(SERVICE_NAME):$(SERVICE_NAME) $(CONFIG_DIR) @sudo cp contrib/systemd/$(SERVICE_NAME).service $(SYSTEMD_DIR)/ @sudo systemctl daemon-reload @sudo systemctl enable $(SERVICE_NAME) >/dev/null 2>&1 @sudo systemctl start $(SERVICE_NAME) 2>/dev/null || true @echo "✓ $(SERVICE_NAME) installed and started" - -lint: lint-proto ## Run linter (includes protobuf linting) - @which $(GOLINT) >/dev/null || (echo "golangci-lint not found, install with: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $$(go env GOPATH)/bin v1.54.2" && exit 1) - @$(GOLINT) run --disable=godox - -lint-proto: ## Run protobuf linter - @buf lint - -proto-breaking: ## Check for breaking protobuf changes - @buf breaking --against .git#branch=main - quick-build: ## Quick build without optimizations @mkdir -p $(BUILD_DIR) @$(GOBUILD) -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/$(SERVICE_NAME) @@ -175,4 +150,3 @@ version: ## Show version information vet: ## Run go vet @$(GOVET) ./... 
- diff --git a/go/deploy/builderd/client/client.go b/go/deploy/builderd/client/client.go index faadfbed1f..df27b56aa0 100644 --- a/go/deploy/builderd/client/client.go +++ b/go/deploy/builderd/client/client.go @@ -7,9 +7,9 @@ import ( "time" "connectrpc.com/connect" + "github.com/unkeyed/unkey/go/deploy/pkg/tls" builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1/builderdv1connect" - "github.com/unkeyed/unkey/go/deploy/pkg/tls" ) // AIDEV-NOTE: Builderd client with SPIFFE/SPIRE socket integration diff --git a/go/deploy/builderd/cmd/builderd/main.go b/go/deploy/builderd/cmd/builderd/main.go index c26ec479ae..8ff9d4c448 100644 --- a/go/deploy/builderd/cmd/builderd/main.go +++ b/go/deploy/builderd/cmd/builderd/main.go @@ -4,12 +4,10 @@ import ( "context" "flag" "fmt" - "io" "log/slog" "net/http" "os" "os/signal" - "path/filepath" "runtime" "runtime/debug" "sync" @@ -19,15 +17,15 @@ import ( "connectrpc.com/connect" "github.com/prometheus/client_golang/prometheus/promhttp" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1/builderdv1connect" "github.com/unkeyed/unkey/go/deploy/builderd/internal/assetmanager" + "github.com/unkeyed/unkey/go/deploy/builderd/internal/assets" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" "github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" "github.com/unkeyed/unkey/go/deploy/builderd/internal/service" healthpkg "github.com/unkeyed/unkey/go/deploy/pkg/health" "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" tlspkg "github.com/unkeyed/unkey/go/deploy/pkg/tls" + "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1/builderdv1connect" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" "golang.org/x/net/http2" @@ -214,12 +212,15 @@ func main() { // AIDEV-NOTE: This ensures builderd can create VMs 
without external setup scripts if cfg.AssetManager.Enabled { logger.Info("initializing base VM assets") - // Temporarily inline the base asset initialization to avoid import issues - // TODO: Move to assets package once imports are stable baseAssetInitCtx, cancel := context.WithTimeout(rootCtx, 10*time.Minute) defer cancel() - if err := initializeBaseAssets(baseAssetInitCtx, logger, cfg, assetClient); err != nil { + // Use proper assets package with metrics and retry logic + assetManager := assets.NewBaseAssetManager(logger, cfg, assetClient) + if buildMetrics != nil { + assetManager = assetManager.WithMetrics(buildMetrics) + } + if err := assetManager.InitializeBaseAssetsWithRetry(baseAssetInitCtx); err != nil { logger.Error("failed to initialize base assets", slog.String("error", err.Error()), ) @@ -315,7 +316,7 @@ func main() { // Start main server with proper error coordination g.Go(func() error { - // Start server in a way that respects context cancellation + // AIDEV-NOTE: Start server with proper context cancellation to prevent startup goroutine deadlock errCh := make(chan error, 1) if serverTLSConfig != nil { @@ -343,6 +344,12 @@ func main() { } return nil case <-gCtx.Done(): + // AIDEV-NOTE: Immediately shutdown server when context is cancelled to prevent deadlock + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := server.Shutdown(shutdownCtx); err != nil { + logger.Warn("server shutdown during startup failed", slog.String("error", err.Error())) + } return gCtx.Err() } }) @@ -372,10 +379,28 @@ func main() { slog.String("address", promAddr), slog.Bool("localhost_only", localhostOnly), ) - if err := promServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { - return fmt.Errorf("prometheus server failed: %w", err) + + // AIDEV-NOTE: Start Prometheus server with context cancellation support + errCh := make(chan error, 1) + go func() { + errCh <- promServer.ListenAndServe() + }() + + 
select { + case err := <-errCh: + if err != nil && err != http.ErrServerClosed { + return fmt.Errorf("prometheus server failed: %w", err) + } + return nil + case <-gCtx.Done(): + // AIDEV-NOTE: Immediately shutdown Prometheus server when context is cancelled + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := promServer.Shutdown(shutdownCtx); err != nil { + logger.Warn("prometheus server shutdown during startup failed", slog.String("error", err.Error())) + } + return gCtx.Err() } - return nil }) } @@ -383,6 +408,9 @@ func main() { sigChan := make(chan os.Signal, 2) // Buffer for multiple signals signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT) + // AIDEV-NOTE: Signal handling continues during graceful shutdown to prevent SIGABRT panics + shutdownSignalReceived := make(chan struct{}) + // Handle shutdown coordination g.Go(func() error { select { @@ -390,6 +418,7 @@ func main() { logger.Info("received shutdown signal", slog.String("signal", sig.String()), ) + close(shutdownSignalReceived) return fmt.Errorf("shutdown signal received: %s", sig) case <-gCtx.Done(): return gCtx.Err() @@ -403,6 +432,22 @@ func main() { ) } + // Continue handling signals during graceful shutdown to prevent SIGABRT panics + go func() { + for { + select { + case <-shutdownSignalReceived: + // Already shutting down, ignore + return + case sig := <-sigChan: + logger.Warn("received additional signal during shutdown, ignoring", + slog.String("signal", sig.String()), + ) + // Continue listening for more signals + } + } + }() + // Coordinated shutdown with proper ordering performGracefulShutdown(logger, server, promServer, providers, builderService, &shutdownStarted, &shutdownMutex, cfg.Server.ShutdownTimeout) } @@ -544,43 +589,60 @@ func performGracefulShutdown(logger *slog.Logger, server *http.Server, promServe return } + logger.Info("attempting to acquire shutdown mutex") shutdownMutex.Lock() defer 
shutdownMutex.Unlock() - logger.Info("performing graceful shutdown") + logger.Info("acquired shutdown mutex, performing graceful shutdown") // Create shutdown context with configurable timeout - shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) + // AIDEV-NOTE: Use a shorter timeout to avoid systemd SIGABRT + actualTimeout := shutdownTimeout + if actualTimeout > 12*time.Second { + actualTimeout = 12 * time.Second // Leave 3s buffer before systemd timeout + } + shutdownCtx, cancel := context.WithTimeout(context.Background(), actualTimeout) defer cancel() + logger.Info("starting graceful shutdown with timeout", + slog.Duration("timeout", actualTimeout), + ) + // Use errgroup for coordinated shutdown g, gCtx := errgroup.WithContext(shutdownCtx) // AIDEV-NOTE: Shutdown BuilderService first to stop new builds and wait for running ones - g.Go(func() error { - logger.Info("shutting down BuilderService") - if err := builderService.Shutdown(gCtx); err != nil { - return fmt.Errorf("BuilderService shutdown failed: %w", err) - } - logger.Info("BuilderService shutdown complete") - return nil - }) + if builderService != nil { + g.Go(func() error { + logger.Info("starting BuilderService shutdown") + if err := builderService.Shutdown(gCtx); err != nil { + logger.Error("BuilderService shutdown failed", slog.String("error", err.Error())) + return fmt.Errorf("BuilderService shutdown failed: %w", err) + } + logger.Info("BuilderService shutdown complete") + return nil + }) + } // Shutdown HTTP server - g.Go(func() error { - logger.Info("shutting down HTTP server") - if err := server.Shutdown(gCtx); err != nil { - return fmt.Errorf("HTTP server shutdown failed: %w", err) - } - logger.Info("HTTP server shutdown complete") - return nil - }) + if server != nil { + g.Go(func() error { + logger.Info("starting HTTP server shutdown") + if err := server.Shutdown(gCtx); err != nil { + logger.Error("HTTP server shutdown failed", slog.String("error", err.Error())) + 
return fmt.Errorf("HTTP server shutdown failed: %w", err) + } + logger.Info("HTTP server shutdown complete") + return nil + }) + } // Shutdown Prometheus server if running if promServer != nil { g.Go(func() error { - logger.Info("shutting down Prometheus server") + logger.Info("starting Prometheus server shutdown") if err := promServer.Shutdown(gCtx); err != nil { + logger.Error("Prometheus server shutdown failed", slog.String("error", err.Error())) return fmt.Errorf("prometheus server shutdown failed: %w", err) } logger.Info("Prometheus server shutdown complete") @@ -591,8 +653,9 @@ func performGracefulShutdown(logger *slog.Logger, server *http.Server, promServe // Shutdown OpenTelemetry providers if providers != nil { g.Go(func() error { - logger.Info("shutting down OpenTelemetry providers") + logger.Info("starting OpenTelemetry providers shutdown") if err := providers.Shutdown(gCtx); err != nil { + logger.Error("OpenTelemetry shutdown failed", slog.String("error", err.Error())) return fmt.Errorf("OpenTelemetry shutdown failed: %w", err) } logger.Info("OpenTelemetry shutdown complete") @@ -610,119 +673,3 @@ func performGracefulShutdown(logger *slog.Logger, server *http.Server, promServe logger.Info("graceful shutdown completed successfully") } - -// initializeBaseAssets downloads and registers base VM assets if they don't exist -func initializeBaseAssets(ctx context.Context, logger *slog.Logger, cfg *config.Config, assetClient *assetmanager.Client) error { - // AIDEV-NOTE: Inline base asset initialization to avoid import cycles - // This logic should eventually be moved to the assets package - - baseAssets := []struct { - name string - url string - assetType assetv1.AssetType - description string - }{ - { - name: "vmlinux", - url: "https://s3.amazonaws.com/spec.ccfc.min/img/quickstart_guide/x86_64/kernels/vmlinux.bin", - assetType: assetv1.AssetType_ASSET_TYPE_KERNEL, - description: "Firecracker x86_64 kernel", - }, - { - name: "rootfs.ext4", - url: 
"https://s3.amazonaws.com/spec.ccfc.min/img/quickstart_guide/x86_64/rootfs/bionic.rootfs.ext4", - assetType: assetv1.AssetType_ASSET_TYPE_ROOTFS, - description: "Ubuntu Bionic base rootfs", - }, - } - - storageDir := cfg.Builder.RootfsOutputDir - for _, asset := range baseAssets { - logger.InfoContext(ctx, "ensuring base asset is available", - "asset", asset.name, - "type", asset.assetType, - ) - - // Check if asset already exists locally - localPath := filepath.Join(storageDir, "base", asset.name) - if err := os.MkdirAll(filepath.Dir(localPath), 0755); err != nil { - return fmt.Errorf("failed to create directory for %s: %w", asset.name, err) - } - - // Download if not present - if _, err := os.Stat(localPath); os.IsNotExist(err) { - logger.InfoContext(ctx, "downloading base asset", - "asset", asset.name, - "url", asset.url, - ) - - if err := downloadAsset(ctx, asset.url, localPath); err != nil { - return fmt.Errorf("failed to download %s: %w", asset.name, err) - } - - logger.InfoContext(ctx, "asset downloaded successfully", - "asset", asset.name, - "path", localPath, - ) - } - - // Register with assetmanagerd - labels := map[string]string{ - "created_by": "builderd", - "customer_id": "system", - "tenant_id": "system", - "source": "firecracker-quickstart", - "asset_type": asset.name, - "architecture": "x86_64", - } - - assetID, err := assetClient.RegisterBuildArtifact(ctx, "base-assets", localPath, asset.assetType, labels) - if err != nil { - // Log warning but don't fail - asset might already be registered - logger.WarnContext(ctx, "failed to register asset, might already exist", - "asset", asset.name, - "error", err, - ) - } else { - logger.InfoContext(ctx, "asset registered successfully", - "asset", asset.name, - "asset_id", assetID, - ) - } - } - - return nil -} - -// downloadAsset downloads a file from URL to local path -func downloadAsset(ctx context.Context, url, localPath string) error { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, 
nil) - if err != nil { - return fmt.Errorf("failed to create request: %w", err) - } - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return fmt.Errorf("failed to download: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("HTTP %d", resp.StatusCode) - } - - tmpPath := localPath + ".tmp" - tmpFile, err := os.Create(tmpPath) - if err != nil { - return fmt.Errorf("failed to create temp file: %w", err) - } - defer os.Remove(tmpPath) - - _, err = io.Copy(tmpFile, resp.Body) - tmpFile.Close() - if err != nil { - return fmt.Errorf("failed to write file: %w", err) - } - - return os.Rename(tmpPath, localPath) -} diff --git a/go/deploy/builderd/cmd/builderd/shutdown_test.go b/go/deploy/builderd/cmd/builderd/shutdown_test.go new file mode 100644 index 0000000000..e1543da5a4 --- /dev/null +++ b/go/deploy/builderd/cmd/builderd/shutdown_test.go @@ -0,0 +1,213 @@ +package main + +import ( + "log/slog" + "net/http" + "os" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" + "github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" + "github.com/unkeyed/unkey/go/deploy/builderd/internal/service" +) + +// TestShutdownRace tests the graceful shutdown sequence for race conditions +func TestShutdownRace(t *testing.T) { + // Create a minimal logger for testing + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelDebug, + })) + + // Create minimal config + cfg := &config.Config{ + Server: config.ServerConfig{ + ShutdownTimeout: 5 * time.Second, + }, + } + + // Create mock servers + server := &http.Server{ + Addr: ":0", // Use any available port + } + promServer := &http.Server{ + Addr: ":0", // Use any available port + } + + // Create minimal providers (can be nil for this test) + var providers *observability.Providers + + // Create minimal builder service (can be nil for this test) + var builderService 
*service.BuilderService + + // Shutdown coordination variables + var shutdownStarted int64 + var shutdownMutex sync.Mutex + + // Test concurrent shutdown attempts to detect race conditions + const numGoroutines = 10 + var wg sync.WaitGroup + + for i := 0; i < numGoroutines; i++ { + wg.Add(1) + go func(id int) { + defer wg.Done() + + // Reset shutdown state before each attempt + atomic.StoreInt64(&shutdownStarted, 0) + + // Call performGracefulShutdown concurrently + performGracefulShutdown( + logger.With("goroutine_id", id), + server, + promServer, + providers, + builderService, + &shutdownStarted, + &shutdownMutex, + cfg.Server.ShutdownTimeout, + ) + }(i) + } + + // Wait for all goroutines to complete + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + // Set a reasonable timeout for the test + select { + case <-done: + t.Log("All shutdown goroutines completed successfully") + case <-time.After(10 * time.Second): + t.Fatal("Test timed out - likely a deadlock or race condition") + } +} + +// TestShutdownSequence tests the shutdown sequence with realistic components +func TestShutdownSequence(t *testing.T) { + // Create a logger for testing + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + + // Create config + cfg := &config.Config{ + Server: config.ServerConfig{ + ShutdownTimeout: 2 * time.Second, + }, + OpenTelemetry: config.OpenTelemetryConfig{ + Enabled: false, // Disable OTel to avoid external dependencies + }, + } + + // Create HTTP servers + mux := http.NewServeMux() + mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("OK")) + }) + + server := &http.Server{ + Addr: "127.0.0.1:0", // Use any available port + Handler: mux, + } + + promMux := http.NewServeMux() + promMux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("# 
Metrics")) + }) + + promServer := &http.Server{ + Addr: "127.0.0.1:0", // Use any available port + Handler: promMux, + } + + // Start servers in background + go func() { + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + t.Logf("HTTP server error: %v", err) + } + }() + + go func() { + if err := promServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + t.Logf("Prometheus server error: %v", err) + } + }() + + // Give servers time to start + time.Sleep(100 * time.Millisecond) + + // Test graceful shutdown + var shutdownStarted int64 + var shutdownMutex sync.Mutex + + start := time.Now() + performGracefulShutdown( + logger, + server, + promServer, + nil, // No OTel providers + nil, // No builder service + &shutdownStarted, + &shutdownMutex, + cfg.Server.ShutdownTimeout, + ) + duration := time.Since(start) + + t.Logf("Shutdown completed in %v", duration) + + // Verify shutdown completed within reasonable time + if duration > 5*time.Second { + t.Errorf("Shutdown took too long: %v", duration) + } +} + +// TestShutdownTimeout tests that shutdown respects the configured timeout +func TestShutdownTimeout(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + + // Create a server that will hang during shutdown + hangingServer := &http.Server{ + Addr: "127.0.0.1:0", + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Simulate long-running request + time.Sleep(10 * time.Second) + w.WriteHeader(http.StatusOK) + }), + } + + // Very short timeout to test timeout behavior + shortTimeout := 100 * time.Millisecond + + var shutdownStarted int64 + var shutdownMutex sync.Mutex + + start := time.Now() + performGracefulShutdown( + logger, + hangingServer, + nil, // No prom server + nil, // No OTel providers + nil, // No builder service + &shutdownStarted, + &shutdownMutex, + shortTimeout, + ) + duration := time.Since(start) + + t.Logf("Shutdown 
with timeout completed in %v", duration) + + // Should complete within a reasonable time even with hanging components + if duration > 5*time.Second { + t.Errorf("Shutdown took too long even with timeout: %v", duration) + } +} diff --git a/go/deploy/builderd/contrib/systemd/builderd.service b/go/deploy/builderd/contrib/systemd/builderd.service index 4bffe822ee..3f2cf6e5af 100644 --- a/go/deploy/builderd/contrib/systemd/builderd.service +++ b/go/deploy/builderd/contrib/systemd/builderd.service @@ -1,8 +1,9 @@ [Unit] Description=Builderd Multi-Tenant Build Service Documentation=https://github.com/unkeyed/unkey/go/deploy/builderd -After=network.target +After=network.target spire-agent.service assetmanagerd.service Wants=network.target +Requires=spire-agent.service assetmanagerd.service [Service] Type=simple @@ -34,6 +35,7 @@ Environment=UNKEY_BUILDERD_BUILD_TIMEOUT=15m Environment=UNKEY_BUILDERD_SCRATCH_DIR=/opt/builderd/scratch Environment=UNKEY_BUILDERD_ROOTFS_OUTPUT_DIR=/opt/builderd/rootfs Environment=UNKEY_BUILDERD_WORKSPACE_DIR=/opt/builderd/workspace +Environment=UNKEY_BUILDERD_USE_PIPELINE_EXECUTOR=false # Storage configuration Environment=UNKEY_BUILDERD_STORAGE_BACKEND=local diff --git a/go/deploy/builderd/environment.example b/go/deploy/builderd/environment.example deleted file mode 100644 index f372074370..0000000000 --- a/go/deploy/builderd/environment.example +++ /dev/null @@ -1,84 +0,0 @@ -# Builderd Environment Variables Template -# NOTE: This service does NOT load .env files automatically -# Set these variables in your system environment or process manager -# -# Usage examples: -# systemd: EnvironmentFile=/etc/builderd/environment -# Docker: docker run --env-file environment builderd -# Shell: set -a; source environment; set +a; ./builderd - -# Service Configuration -UNKEY_BUILDERD_PORT=8082 -UNKEY_BUILDERD_ADDRESS=0.0.0.0 -UNKEY_BUILDERD_SHUTDOWN_TIMEOUT=15s -UNKEY_BUILDERD_RATE_LIMIT=100 - -# Build Configuration -UNKEY_BUILDERD_MAX_CONCURRENT_BUILDS=5 
-UNKEY_BUILDERD_BUILD_TIMEOUT=15m -UNKEY_BUILDERD_SCRATCH_DIR=/tmp/builderd -UNKEY_BUILDERD_ROOTFS_OUTPUT_DIR=/opt/builderd/rootfs -UNKEY_BUILDERD_WORKSPACE_DIR=/opt/builderd/workspace -UNKEY_BUILDERD_CLEANUP_INTERVAL=1h - -# Storage Configuration -UNKEY_BUILDERD_STORAGE_BACKEND=local -UNKEY_BUILDERD_STORAGE_RETENTION_DAYS=30 -UNKEY_BUILDERD_STORAGE_MAX_SIZE_GB=100 -UNKEY_BUILDERD_STORAGE_CACHE_ENABLED=true -UNKEY_BUILDERD_STORAGE_CACHE_MAX_SIZE_GB=50 - -# Docker Configuration -UNKEY_BUILDERD_DOCKER_REGISTRY_AUTH=true -UNKEY_BUILDERD_DOCKER_MAX_IMAGE_SIZE_GB=5 -UNKEY_BUILDERD_DOCKER_ALLOWED_REGISTRIES= -UNKEY_BUILDERD_DOCKER_PULL_TIMEOUT=10m -UNKEY_BUILDERD_DOCKER_REGISTRY_MIRROR= -UNKEY_BUILDERD_DOCKER_INSECURE_REGISTRIES= - -# Tenant Management -UNKEY_BUILDERD_TENANT_DEFAULT_TIER=free -UNKEY_BUILDERD_TENANT_ISOLATION_ENABLED=true -UNKEY_BUILDERD_TENANT_QUOTA_CHECK_INTERVAL=5m - -# Tenant Resource Limits -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_MEMORY_BYTES=2147483648 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_CPU_CORES=2 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_DISK_BYTES=10737418240 -UNKEY_BUILDERD_TENANT_DEFAULT_TIMEOUT_SECONDS=900 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_CONCURRENT_BUILDS=3 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_DAILY_BUILDS=100 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_STORAGE_BYTES=53687091200 -UNKEY_BUILDERD_TENANT_DEFAULT_MAX_BUILD_TIME_MINUTES=30 - -# Database Configuration -UNKEY_BUILDERD_DATABASE_DATA_DIR=/opt/builderd/data -UNKEY_BUILDERD_DATABASE_TYPE=sqlite -UNKEY_BUILDERD_DATABASE_HOST=localhost -UNKEY_BUILDERD_DATABASE_PORT=5432 -UNKEY_BUILDERD_DATABASE_NAME=builderd -UNKEY_BUILDERD_DATABASE_USERNAME=builderd -UNKEY_BUILDERD_DATABASE_PASSWORD= -UNKEY_BUILDERD_DATABASE_SSL_MODE=disable - -# Asset Manager Integration -UNKEY_BUILDERD_ASSETMANAGER_ENABLED=true -UNKEY_BUILDERD_ASSETMANAGER_ENDPOINT=https://localhost:8083 - -# TLS Configuration -UNKEY_BUILDERD_TLS_MODE=spiffe -UNKEY_BUILDERD_SPIFFE_SOCKET=/var/lib/spire/agent/agent.sock -UNKEY_BUILDERD_TLS_CERT_FILE= 
-UNKEY_BUILDERD_TLS_KEY_FILE= -UNKEY_BUILDERD_TLS_CA_FILE= - -# OpenTelemetry Configuration -UNKEY_BUILDERD_OTEL_ENABLED=false -UNKEY_BUILDERD_OTEL_SERVICE_NAME=builderd -UNKEY_BUILDERD_OTEL_SERVICE_VERSION=0.1.0 -UNKEY_BUILDERD_OTEL_SAMPLING_RATE=1.0 -UNKEY_BUILDERD_OTEL_ENDPOINT=localhost:4318 -UNKEY_BUILDERD_OTEL_PROMETHEUS_ENABLED=true -UNKEY_BUILDERD_OTEL_PROMETHEUS_PORT=9466 -UNKEY_BUILDERD_OTEL_PROMETHEUS_INTERFACE=127.0.0.1 -UNKEY_BUILDERD_OTEL_HIGH_CARDINALITY_ENABLED=false \ No newline at end of file diff --git a/go/deploy/builderd/go.mod b/go/deploy/builderd/go.mod index a96b72c559..8ff9300fa8 100644 --- a/go/deploy/builderd/go.mod +++ b/go/deploy/builderd/go.mod @@ -22,7 +22,7 @@ require ( golang.org/x/net v0.42.0 golang.org/x/sync v0.16.0 golang.org/x/time v0.12.0 - google.golang.org/protobuf v1.36.6 + google.golang.org/protobuf v1.36.8 ) require ( diff --git a/go/deploy/builderd/go.sum b/go/deploy/builderd/go.sum index 4bcdb57ff9..ca414c06a0 100644 --- a/go/deploy/builderd/go.sum +++ b/go/deploy/builderd/go.sum @@ -93,7 +93,7 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git 
a/go/deploy/builderd/internal/assetmanager/client.go b/go/deploy/builderd/internal/assetmanager/client.go index ecf122467b..c574713af7 100644 --- a/go/deploy/builderd/internal/assetmanager/client.go +++ b/go/deploy/builderd/internal/assetmanager/client.go @@ -9,10 +9,10 @@ import ( "os" "path/filepath" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1/assetmanagerdv1connect" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" "github.com/unkeyed/unkey/go/deploy/pkg/tls" + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" + "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1/assetmanagerdv1connect" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) diff --git a/go/deploy/builderd/internal/assets/base.go b/go/deploy/builderd/internal/assets/base.go index e7ebf29ad1..d18f8fc76e 100644 --- a/go/deploy/builderd/internal/assets/base.go +++ b/go/deploy/builderd/internal/assets/base.go @@ -6,13 +6,16 @@ import ( "fmt" "io" "log/slog" + "maps" "net/http" "os" "path/filepath" + "strings" + "time" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" "github.com/unkeyed/unkey/go/deploy/builderd/internal/assetmanager" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" ) // BaseAssetManager handles initialization and registration of base VM assets @@ -21,6 +24,13 @@ type BaseAssetManager struct { config *config.Config assetClient *assetmanager.Client storageDir string + metrics MetricsRecorder +} + +// MetricsRecorder interface for recording asset initialization metrics +type MetricsRecorder interface { + RecordBaseAssetInitRetry(ctx context.Context, attempt int, reason string) + RecordBaseAssetInitFailure(ctx context.Context, totalAttempts int, finalError string) } // BaseAsset represents a base asset that needs to be downloaded 
and registered @@ -39,9 +49,71 @@ func NewBaseAssetManager(logger *slog.Logger, cfg *config.Config, assetClient *a config: cfg, assetClient: assetClient, storageDir: cfg.Builder.RootfsOutputDir, + metrics: nil, // No metrics by default } } +// WithMetrics adds metrics recording to the asset manager +func (m *BaseAssetManager) WithMetrics(metrics MetricsRecorder) *BaseAssetManager { + m.metrics = metrics + return m +} + +// InitializeBaseAssetsWithRetry ensures all required base assets are available with retry logic +func (m *BaseAssetManager) InitializeBaseAssetsWithRetry(ctx context.Context) error { + maxRetries := 8 // ~4-ish minutes total with exponential backoff: 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s + + for attempt := range maxRetries { + if attempt > 0 { + delay := time.Duration(1<<(attempt-1)) * time.Second + m.logger.InfoContext(ctx, "retrying base asset initialization", + "attempt", attempt+1, + "delay", delay, + ) + time.Sleep(delay) + } + + err := m.InitializeBaseAssets(ctx) + if err == nil { + if attempt > 0 { + m.logger.InfoContext(ctx, "base asset initialization succeeded after retries", + "successful_attempt", attempt+1, + ) + } + return nil // Success + } + + // Log the error + m.logger.WarnContext(ctx, "base asset initialization attempt failed", + "attempt", attempt+1, + "max_retries", maxRetries, + "error", err, + ) + + // Record retry metric if metrics are available + if m.metrics != nil && attempt > 0 { + m.metrics.RecordBaseAssetInitRetry(ctx, attempt, err.Error()) + } + + // Don't retry if it's the last attempt + if attempt == maxRetries-1 { + // Record final failure metric if metrics are available + if m.metrics != nil { + m.metrics.RecordBaseAssetInitFailure(ctx, maxRetries, err.Error()) + } + return fmt.Errorf("failed to initialize base assets after %d attempts: %w", maxRetries, err) + } + } + + return nil // Should never reach here +} + // InitializeBaseAssets ensures all required base assets are available func (m *BaseAssetManager) InitializeBaseAssets(ctx context.Context) error { // AIDEV-NOTE: Base assets required for VM creation @@ -152,6 +224,7 @@ func (m *BaseAssetManager) downloadAsset(ctx context.Context, asset BaseAsset, l return fmt.Errorf("failed to create
request: %w", err) } + // TODO: replace with shared configured http client resp, err := http.DefaultClient.Do(req) if err != nil { return fmt.Errorf("failed to download asset: %w", err) @@ -170,7 +243,6 @@ func (m *BaseAssetManager) downloadAsset(ctx context.Context, asset BaseAsset, l } defer os.Remove(tmpPath) - // Copy with progress written, err := io.Copy(tmpFile, resp.Body) if err != nil { tmpFile.Close() @@ -208,9 +280,8 @@ func (m *BaseAssetManager) registerAsset(ctx context.Context, asset BaseAsset, l // Prepare labels labels := make(map[string]string) - for k, v := range asset.Labels { - labels[k] = v - } + maps.Copy(labels, asset.Labels) + labels["created_by"] = "builderd" labels["customer_id"] = "system" labels["tenant_id"] = "system" @@ -224,7 +295,19 @@ func (m *BaseAssetManager) registerAsset(ctx context.Context, asset BaseAsset, l // Register via assetmanager client assetID, err := m.assetClient.RegisterBuildArtifact(ctx, "base-assets", localPath, asset.Type, labels) if err != nil { - return fmt.Errorf("failed to register asset: %w", err) + // Already exists errors are fine, connection errors should cause retry + errStr := strings.ToLower(err.Error()) + if strings.Contains(errStr, "already exists") || strings.Contains(errStr, "duplicate") || + strings.Contains(errStr, "conflict") { + m.logger.InfoContext(ctx, "base asset already registered, skipping", + "asset", asset.Name, + "error", err, + ) + return nil // Success - asset already registered + } else { + // This is likely a connection/service unavailable error - should trigger retry + return fmt.Errorf("failed to register base asset %s (service may not be ready): %w", asset.Name, err) + } } m.logger.InfoContext(ctx, "asset registered successfully", diff --git a/go/deploy/builderd/internal/config/config.go b/go/deploy/builderd/internal/config/config.go index 20d62519e1..ecbec4540f 100644 --- a/go/deploy/builderd/internal/config/config.go +++ b/go/deploy/builderd/internal/config/config.go @@ -37,6 
+37,7 @@ type BuilderConfig struct { RootfsOutputDir string `yaml:"rootfs_output_dir"` WorkspaceDir string `yaml:"workspace_dir"` CleanupInterval time.Duration `yaml:"cleanup_interval"` + UsePipelineExecutor bool `yaml:"use_pipeline_executor"` // Feature flag for step-based execution } // StorageConfig holds storage backend configuration @@ -165,6 +166,7 @@ func LoadConfigWithLogger(logger *slog.Logger) (*Config, error) { RootfsOutputDir: getEnvOrDefault("UNKEY_BUILDERD_ROOTFS_OUTPUT_DIR", "/opt/builderd/rootfs"), WorkspaceDir: getEnvOrDefault("UNKEY_BUILDERD_WORKSPACE_DIR", "/opt/builderd/workspace"), CleanupInterval: getEnvDurationOrDefault("UNKEY_BUILDERD_CLEANUP_INTERVAL", 1*time.Hour), + UsePipelineExecutor: getEnvBoolOrDefault("UNKEY_BUILDERD_USE_PIPELINE_EXECUTOR", false), }, Storage: StorageConfig{ //nolint:exhaustruct // S3Config and GCSConfig are optional backend-specific configs Backend: getEnvOrDefault("UNKEY_BUILDERD_STORAGE_BACKEND", "local"), diff --git a/go/deploy/builderd/internal/executor/docker.go b/go/deploy/builderd/internal/executor/docker.go index 7a55f71a3c..43057da626 100644 --- a/go/deploy/builderd/internal/executor/docker.go +++ b/go/deploy/builderd/internal/executor/docker.go @@ -16,10 +16,9 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" - builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" "github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" - "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" ) // DockerExecutor handles Docker image extraction to rootfs @@ -51,14 +50,7 @@ func (d *DockerExecutor) ExtractDockerImage(ctx context.Context, request *builde func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request *builderv1.CreateBuildRequest, buildID string) (*BuildResult, error) { start := time.Now() - 
// Get tenant context for logging and metrics - tenantID := "unknown" - if auth, ok := interceptors.TenantFromContext(ctx); ok { - tenantID = auth.TenantID - } - logger := d.logger.With( - slog.String("tenant_id", tenantID), slog.String("image_uri", request.GetConfig().GetSource().GetDockerImage().GetImageUri()), ) @@ -66,7 +58,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * // Record build start metrics if d.buildMetrics != nil { - d.buildMetrics.RecordBuildStart(ctx, "docker", "docker", tenantID) + d.buildMetrics.RecordBuildStart(ctx, "docker", "docker", "") } defer func() { @@ -115,7 +107,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * slog.String("image", fullImageName), ) if d.buildMetrics != nil { - d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", tenantID, time.Since(start), false) + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) } return nil, fmt.Errorf("failed to pull Docker image: %w", err) } @@ -128,7 +120,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * slog.String("image", fullImageName), ) if d.buildMetrics != nil { - d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", tenantID, time.Since(start), false) + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) } return nil, fmt.Errorf("failed to create container: %w", err) } @@ -148,7 +140,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * slog.String("image", fullImageName), ) if d.buildMetrics != nil { - d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", tenantID, time.Since(start), false) + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) } return nil, fmt.Errorf("failed to extract container metadata: %w", err) } @@ -161,7 +153,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * 
slog.String("rootfs_dir", rootfsDir), ) if d.buildMetrics != nil { - d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", tenantID, time.Since(start), false) + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) } return nil, fmt.Errorf("failed to extract filesystem: %w", err) } @@ -199,7 +191,6 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * SourceImage: fullImageName, RootfsPath: ext4Path, // Use the ext4 image path instead of directory WorkspaceDir: workspaceDir, - TenantID: tenantID, StartTime: start, EndTime: time.Now(), Status: "completed", @@ -208,7 +199,7 @@ func (d *DockerExecutor) ExtractDockerImageWithID(ctx context.Context, request * // Record successful build if d.buildMetrics != nil { - d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", tenantID, time.Since(start), true) + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), true) } logger.InfoContext(ctx, "Docker image extraction successful", diff --git a/go/deploy/builderd/internal/executor/docker_pipeline.go b/go/deploy/builderd/internal/executor/docker_pipeline.go new file mode 100644 index 0000000000..3a6ce631a0 --- /dev/null +++ b/go/deploy/builderd/internal/executor/docker_pipeline.go @@ -0,0 +1,200 @@ +package executor + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + "time" + + "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" + "github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" + "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" +) + +// DockerPipelineExecutor wraps the existing DockerExecutor with step-based execution +type DockerPipelineExecutor struct { + dockerExecutor *DockerExecutor + pipeline *BuildPipeline + logger *slog.Logger + config *config.Config + buildMetrics *observability.BuildMetrics +} + +// NewDockerPipelineExecutor creates 
a new pipeline-based Docker executor +func NewDockerPipelineExecutor(logger *slog.Logger, cfg *config.Config, metrics *observability.BuildMetrics) *DockerPipelineExecutor { + dockerExecutor := NewDockerExecutor(logger, cfg, metrics) + pipeline := NewDockerBuildPipeline(dockerExecutor) + + return &DockerPipelineExecutor{ + dockerExecutor: dockerExecutor, + pipeline: pipeline, + logger: logger, + config: cfg, + buildMetrics: metrics, + } +} + +// ExtractDockerImageWithID executes the full Docker build pipeline +func (d *DockerPipelineExecutor) ExtractDockerImageWithID(ctx context.Context, request *builderv1.CreateBuildRequest, buildID string) (*BuildResult, error) { + start := time.Now() + + // Get tenant context for logging and metrics + tenantID := "unknown" + if auth, ok := interceptors.TenantFromContext(ctx); ok { + tenantID = auth.TenantID + } + + logger := d.logger.With( + slog.String("tenant_id", tenantID), + slog.String("build_id", buildID), + slog.String("image_uri", request.GetConfig().GetSource().GetDockerImage().GetImageUri()), + ) + + logger.InfoContext(ctx, "starting Docker pipeline build") + + // Record build start metrics + if d.buildMetrics != nil { + d.buildMetrics.RecordBuildStart(ctx, "docker", "docker", tenantID) + } + + defer func() { + duration := time.Since(start) + logger.InfoContext(ctx, "Docker pipeline build completed", slog.Duration("duration", duration)) + }() + + dockerSource := request.GetConfig().GetSource().GetDockerImage() + if dockerSource == nil { + return nil, fmt.Errorf("docker image source is required") + } + + // Setup directories + workspaceDir := filepath.Join(d.config.Builder.WorkspaceDir, buildID) + rootfsDir := filepath.Join(d.config.Builder.RootfsOutputDir, buildID) + + logger = logger.With( + slog.String("workspace_dir", workspaceDir), + slog.String("rootfs_dir", rootfsDir), + ) + + // Create directories + if err := os.MkdirAll(workspaceDir, 0755); err != nil { + logger.ErrorContext(ctx, "failed to create workspace 
directory", slog.String("error", err.Error())) + return nil, fmt.Errorf("failed to create workspace directory: %w", err) + } + + if err := os.MkdirAll(rootfsDir, 0755); err != nil { + logger.ErrorContext(ctx, "failed to create rootfs directory", slog.String("error", err.Error())) + return nil, fmt.Errorf("failed to create rootfs directory: %w", err) + } + + // Prepare initial step input + input := StepInput{ + BuildID: buildID, + Config: request.GetConfig(), + WorkspaceDir: workspaceDir, + RootfsDir: rootfsDir, + Logger: logger, + } + + // Execute the pipeline + result, err := d.pipeline.Execute(ctx, input) + if err != nil { + logger.ErrorContext(ctx, "pipeline execution failed", slog.String("error", err.Error())) + if d.buildMetrics != nil { + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) + } + return nil, err + } + + // Record success metrics + if d.buildMetrics != nil { + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), true) + } + + return result, nil +} + +// ResumeBuild resumes a build from a specific step +func (d *DockerPipelineExecutor) ResumeBuild(ctx context.Context, request *builderv1.CreateBuildRequest, buildID string, lastCompletedStep int) (*BuildResult, error) { + start := time.Now() + + // Get tenant context for logging and metrics + tenantID := "unknown" + if auth, ok := interceptors.TenantFromContext(ctx); ok { + tenantID = auth.TenantID + } + + logger := d.logger.With( + slog.String("tenant_id", tenantID), + slog.String("build_id", buildID), + slog.String("image_uri", request.GetConfig().GetSource().GetDockerImage().GetImageUri()), + slog.Int("resume_from_step", lastCompletedStep+1), + ) + + logger.InfoContext(ctx, "resuming Docker pipeline build") + + // Setup directories (should already exist) + workspaceDir := filepath.Join(d.config.Builder.WorkspaceDir, buildID) + rootfsDir := filepath.Join(d.config.Builder.RootfsOutputDir, buildID) + + // Prepare step input - we'd need to 
reconstruct state from previous steps here + // For simplicity, we're starting fresh but skipping completed steps + input := StepInput{ + BuildID: buildID, + Config: request.GetConfig(), + WorkspaceDir: workspaceDir, + RootfsDir: rootfsDir, + Logger: logger, + // TODO: Restore ImageName, ContainerID, Metadata from previous execution + } + + // Resume from the next step after the last completed one + result, err := d.pipeline.Resume(ctx, input, lastCompletedStep+1) + if err != nil { + logger.ErrorContext(ctx, "pipeline resumption failed", slog.String("error", err.Error())) + if d.buildMetrics != nil { + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), false) + } + return nil, err + } + + // Record success metrics + if d.buildMetrics != nil { + d.buildMetrics.RecordBuildComplete(ctx, "docker", "docker", time.Since(start), true) + } + + return result, nil +} + +// GetStepNames returns the names of all steps in the pipeline +func (d *DockerPipelineExecutor) GetStepNames() []string { + steps := make([]string, len(d.pipeline.steps)) + for i, step := range d.pipeline.steps { + steps[i] = step.Name() + } + return steps +} + +// Execute implements the Executor interface (generates build ID) +func (d *DockerPipelineExecutor) Execute(ctx context.Context, request *builderv1.CreateBuildRequest) (*BuildResult, error) { + // Generate build ID for backward compatibility + return d.ExtractDockerImageWithID(ctx, request, generateBuildID()) +} + +// ExecuteWithID implements the Executor interface (uses provided build ID) +func (d *DockerPipelineExecutor) ExecuteWithID(ctx context.Context, request *builderv1.CreateBuildRequest, buildID string) (*BuildResult, error) { + return d.ExtractDockerImageWithID(ctx, request, buildID) +} + +// GetSupportedSources implements the Executor interface +func (d *DockerPipelineExecutor) GetSupportedSources() []string { + return []string{"docker"} +} + +// Cleanup implements the Executor interface - delegates to underlying 
DockerExecutor +func (d *DockerPipelineExecutor) Cleanup(ctx context.Context, buildID string) error { + return d.dockerExecutor.Cleanup(ctx, buildID) +} diff --git a/go/deploy/builderd/internal/executor/docker_steps.go b/go/deploy/builderd/internal/executor/docker_steps.go new file mode 100644 index 0000000000..69c3be1992 --- /dev/null +++ b/go/deploy/builderd/internal/executor/docker_steps.go @@ -0,0 +1,175 @@ +package executor + +import ( + "context" + "log/slog" +) + +// PullImageStep pulls the Docker image +type PullImageStep struct { + executor *DockerExecutor +} + +func (s *PullImageStep) Name() string { + return "pull_image" +} + +func (s *PullImageStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + imageName := input.Config.GetSource().GetDockerImage().GetImageUri() + + input.Logger.InfoContext(ctx, "pulling Docker image", slog.String("image", imageName)) + + err := s.executor.pullDockerImage(ctx, input.Logger, imageName) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + return StepOutput{ + ImageName: imageName, + Success: true, + }, nil +} + +// CreateContainerStep creates a container from the image +type CreateContainerStep struct { + executor *DockerExecutor +} + +func (s *CreateContainerStep) Name() string { + return "create_container" +} + +func (s *CreateContainerStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + input.Logger.InfoContext(ctx, "creating container", slog.String("image", input.ImageName)) + + containerID, err := s.executor.createContainer(ctx, input.Logger, input.ImageName) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + return StepOutput{ + ImageName: input.ImageName, + ContainerID: containerID, + Success: true, + }, nil +} + +// ExtractMetadataStep extracts container metadata +type ExtractMetadataStep struct { + executor *DockerExecutor +} + +func (s *ExtractMetadataStep) Name() string { + return "extract_metadata" +} + +func 
(s *ExtractMetadataStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + input.Logger.InfoContext(ctx, "extracting container metadata", slog.String("image", input.ImageName)) + + metadata, err := s.executor.extractContainerMetadata(ctx, input.Logger, input.ImageName) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + return StepOutput{ + ImageName: input.ImageName, + ContainerID: input.ContainerID, + Metadata: metadata, + Success: true, + }, nil +} + +// ExtractFilesystemStep extracts the container filesystem +type ExtractFilesystemStep struct { + executor *DockerExecutor +} + +func (s *ExtractFilesystemStep) Name() string { + return "extract_filesystem" +} + +func (s *ExtractFilesystemStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + input.Logger.InfoContext(ctx, "extracting container filesystem", + slog.String("container_id", input.ContainerID), + slog.String("rootfs_dir", input.RootfsDir)) + + err := s.executor.extractFilesystem(ctx, input.Logger, input.ContainerID, input.RootfsDir, input.Metadata) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + return StepOutput{ + ImageName: input.ImageName, + ContainerID: input.ContainerID, + Metadata: input.Metadata, + RootfsPath: input.RootfsDir, + Success: true, + }, nil +} + +// OptimizeRootfsStep optimizes the extracted rootfs +type OptimizeRootfsStep struct { + executor *DockerExecutor +} + +func (s *OptimizeRootfsStep) Name() string { + return "optimize_rootfs" +} + +func (s *OptimizeRootfsStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + input.Logger.InfoContext(ctx, "optimizing rootfs", slog.String("rootfs_dir", input.RootfsDir)) + + // Call existing optimization logic from DockerExecutor + // This would include: creating metald-init, container command/env files, etc. 
+ err := s.executor.injectMetaldInit(ctx, input.Logger, input.RootfsDir) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + err = s.executor.createContainerCmd(ctx, input.Logger, input.RootfsDir, input.Metadata) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + err = s.executor.createContainerEnv(ctx, input.Logger, input.RootfsDir, input.Metadata) + if err != nil { + return StepOutput{Success: false, Error: err}, err + } + + return StepOutput{ + ImageName: input.ImageName, + ContainerID: input.ContainerID, + Metadata: input.Metadata, + RootfsPath: input.RootfsDir, + Success: true, + }, nil +} + +// CleanupStep cleans up temporary resources +type CleanupStep struct { + executor *DockerExecutor +} + +func (s *CleanupStep) Name() string { + return "cleanup" +} + +func (s *CleanupStep) Execute(ctx context.Context, input StepInput) (StepOutput, error) { + input.Logger.InfoContext(ctx, "cleaning up container", slog.String("container_id", input.ContainerID)) + + if input.ContainerID != "" { + err := s.executor.removeContainer(ctx, input.Logger, input.ContainerID) + if err != nil { + // Log warning but don't fail the build for cleanup errors + input.Logger.WarnContext(ctx, "failed to cleanup container", slog.String("error", err.Error())) + } + } + + return StepOutput{ + ImageName: input.ImageName, + ContainerID: "", // Container is now removed + Metadata: input.Metadata, + RootfsPath: input.RootfsDir, + Success: true, + }, nil +} diff --git a/go/deploy/builderd/internal/executor/registry.go b/go/deploy/builderd/internal/executor/registry.go index be308ab10f..9b66f6ff59 100644 --- a/go/deploy/builderd/internal/executor/registry.go +++ b/go/deploy/builderd/internal/executor/registry.go @@ -6,9 +6,9 @@ import ( "log/slog" "sync" - builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" 
"github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" ) // Registry manages different build executors @@ -35,9 +35,16 @@ func NewRegistry(logger *slog.Logger, cfg *config.Config, buildMetrics *observab // registerBuiltinExecutors registers the standard executors func (r *Registry) registerBuiltinExecutors(buildMetrics *observability.BuildMetrics) { - // Register Docker executor - dockerExecutor := NewDockerExecutor(r.logger, r.config, buildMetrics) - r.RegisterExecutor("docker", dockerExecutor) + // Register Docker executor based on feature flag + if r.config.Builder.UsePipelineExecutor { + r.logger.InfoContext(context.Background(), "using step-based pipeline executor for Docker builds") + pipelineExecutor := NewDockerPipelineExecutor(r.logger, r.config, buildMetrics) + r.RegisterExecutor("docker", pipelineExecutor) + } else { + r.logger.InfoContext(context.Background(), "using monolithic executor for Docker builds") + dockerExecutor := NewDockerExecutor(r.logger, r.config, buildMetrics) + r.RegisterExecutor("docker", dockerExecutor) + } // TODO: Register other executors // gitExecutor := NewGitExecutor(r.logger, r.config, buildMetrics) @@ -48,6 +55,7 @@ func (r *Registry) registerBuiltinExecutors(buildMetrics *observability.BuildMet r.logger.InfoContext(context.Background(), "registered built-in executors", slog.Int("executor_count", len(r.executors)), + slog.Bool("pipeline_mode", r.config.Builder.UsePipelineExecutor), ) } diff --git a/go/deploy/builderd/internal/executor/steps.go b/go/deploy/builderd/internal/executor/steps.go new file mode 100644 index 0000000000..4d6f287630 --- /dev/null +++ b/go/deploy/builderd/internal/executor/steps.go @@ -0,0 +1,121 @@ +package executor + +import ( + "context" + "log/slog" + + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" +) + +// StepInput contains the input data for a build step +type StepInput struct { + BuildID 
string + Config *builderv1.BuildConfig + WorkspaceDir string + RootfsDir string + Logger *slog.Logger + + // Output from previous steps + ImageName string + ContainerID string + Metadata *builderv1.ImageMetadata +} + +// StepOutput contains the output data from a build step +type StepOutput struct { + // Data to pass to next step + ImageName string + ContainerID string + Metadata *builderv1.ImageMetadata + RootfsPath string + + // Step completion info + Success bool + Error error +} + +// StepExecutor represents a single build step +type StepExecutor interface { + Execute(ctx context.Context, input StepInput) (StepOutput, error) + Name() string +} + +// BuildPipeline represents a sequence of build steps +type BuildPipeline struct { + steps []StepExecutor +} + +// NewDockerBuildPipeline creates a pipeline for Docker image builds +func NewDockerBuildPipeline(dockerExecutor *DockerExecutor) *BuildPipeline { + return &BuildPipeline{ + steps: []StepExecutor{ + &PullImageStep{executor: dockerExecutor}, + &CreateContainerStep{executor: dockerExecutor}, + &ExtractMetadataStep{executor: dockerExecutor}, + &ExtractFilesystemStep{executor: dockerExecutor}, + &OptimizeRootfsStep{executor: dockerExecutor}, + &CleanupStep{executor: dockerExecutor}, + }, + } +} + +// Execute runs the entire pipeline +func (p *BuildPipeline) Execute(ctx context.Context, initialInput StepInput) (*BuildResult, error) { + input := initialInput + + for i, step := range p.steps { + initialInput.Logger.InfoContext(ctx, "executing build step", + slog.String("step", step.Name()), + slog.Int("step_index", i), + slog.Int("total_steps", len(p.steps)), + ) + + output, err := step.Execute(ctx, input) + if err != nil { + return nil, err + } + + // Prepare input for next step + input.ImageName = output.ImageName + input.ContainerID = output.ContainerID + input.Metadata = output.Metadata + } + + // Return final build result + return &BuildResult{ + RootfsPath: input.RootfsDir, + ImageMetadata: input.Metadata, + 
}, nil +} + +// Resume executes the pipeline starting from a specific step +func (p *BuildPipeline) Resume(ctx context.Context, input StepInput, startStepIndex int) (*BuildResult, error) { + input.Logger.InfoContext(ctx, "resuming build pipeline", + slog.Int("start_step", startStepIndex), + slog.Int("total_steps", len(p.steps)), + ) + + for i := startStepIndex; i < len(p.steps); i++ { + step := p.steps[i] + + input.Logger.InfoContext(ctx, "executing build step (resumed)", + slog.String("step", step.Name()), + slog.Int("step_index", i), + ) + + output, err := step.Execute(ctx, input) + if err != nil { + return nil, err + } + + // Prepare input for next step + input.ImageName = output.ImageName + input.ContainerID = output.ContainerID + input.Metadata = output.Metadata + } + + return &BuildResult{ + RootfsPath: input.RootfsDir, + ImageMetadata: input.Metadata, + }, nil +} diff --git a/go/deploy/builderd/internal/observability/metrics.go b/go/deploy/builderd/internal/observability/metrics.go index 0714a7a756..859ce3e616 100644 --- a/go/deploy/builderd/internal/observability/metrics.go +++ b/go/deploy/builderd/internal/observability/metrics.go @@ -42,9 +42,9 @@ type BuildMetrics struct { buildStepErrors metric.Int64Counter buildStepDuration metric.Float64Histogram - // Tenant metrics (if high cardinality enabled) - tenantBuildsTotal metric.Int64Counter - tenantQuotaViolations metric.Int64Counter + // Base asset initialization metrics + baseAssetInitRetries metric.Int64Counter + baseAssetInitFailures metric.Int64Counter highCardinalityEnabled bool logger *slog.Logger @@ -266,25 +266,23 @@ func NewBuildMetrics(logger *slog.Logger, highCardinalityEnabled bool) (*BuildMe return nil, err } - // Tenant metrics (if enabled) - if highCardinalityEnabled { - metrics.tenantBuildsTotal, err = meter.Int64Counter( - "builderd_tenant_builds_total", - metric.WithDescription("Total builds per tenant"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } + // Base asset 
initialization metrics + metrics.baseAssetInitRetries, err = meter.Int64Counter( + "builderd_base_asset_init_retries_total", + metric.WithDescription("Total number of base asset initialization retries"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, err + } - metrics.tenantQuotaViolations, err = meter.Int64Counter( - "builderd_tenant_quota_violations_total", - metric.WithDescription("Total quota violations per tenant"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } + metrics.baseAssetInitFailures, err = meter.Int64Counter( + "builderd_base_asset_init_failures_total", + metric.WithDescription("Total number of base asset initialization final failures"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, err } logger.Info("build metrics initialized", @@ -307,11 +305,10 @@ func (m *BuildMetrics) RecordBuildStart(ctx context.Context, buildType, sourceTy } // RecordBuildComplete records the completion of a build -func (m *BuildMetrics) RecordBuildComplete(ctx context.Context, buildType, sourceType, tenantTier string, duration time.Duration, success bool) { +func (m *BuildMetrics) RecordBuildComplete(ctx context.Context, buildType, sourceType string, duration time.Duration, success bool) { attrs := []attribute.KeyValue{ attribute.String("build_type", buildType), attribute.String("source_type", sourceType), - attribute.String("tenant_tier", tenantTier), attribute.String("status", func() string { if success { return "success" @@ -329,11 +326,10 @@ func (m *BuildMetrics) RecordBuildComplete(ctx context.Context, buildType, sourc } // RecordBuildCancellation records a build cancellation -func (m *BuildMetrics) RecordBuildCancellation(ctx context.Context, buildType, sourceType, tenantTier string) { +func (m *BuildMetrics) RecordBuildCancellation(ctx context.Context, buildType, sourceType string) { attrs := []attribute.KeyValue{ attribute.String("build_type", buildType), attribute.String("source_type", sourceType), - 
attribute.String("tenant_tier", tenantTier), } m.buildCancellations.Add(ctx, 1, metric.WithAttributes(attrs...)) @@ -399,34 +395,6 @@ func (m *BuildMetrics) RecordDequeuedBuild(ctx context.Context) { m.queuedBuilds.Add(ctx, -1) } -// RecordTenantBuild records a build for a specific tenant (if high cardinality enabled) -func (m *BuildMetrics) RecordTenantBuild(ctx context.Context, tenantID, buildType string) { - if !m.highCardinalityEnabled || m.tenantBuildsTotal == nil { - return - } - - attrs := []attribute.KeyValue{ - attribute.String("tenant_id", tenantID), - attribute.String("build_type", buildType), - } - - m.tenantBuildsTotal.Add(ctx, 1, metric.WithAttributes(attrs...)) -} - -// RecordTenantQuotaViolation records a quota violation for a tenant -func (m *BuildMetrics) RecordTenantQuotaViolation(ctx context.Context, tenantID, quotaType string) { - if !m.highCardinalityEnabled || m.tenantQuotaViolations == nil { - return - } - - attrs := []attribute.KeyValue{ - attribute.String("tenant_id", tenantID), - attribute.String("quota_type", quotaType), - } - - m.tenantQuotaViolations.Add(ctx, 1, metric.WithAttributes(attrs...)) -} - // RecordBuildStepStart records the start of a build step func (m *BuildMetrics) RecordBuildStepStart(ctx context.Context, stepName, sourceType string) { attrs := []attribute.KeyValue{ @@ -456,3 +424,23 @@ func (m *BuildMetrics) RecordBuildStepComplete(ctx context.Context, stepName, so m.buildStepErrors.Add(ctx, 1, metric.WithAttributes(attrs...)) } } + +// RecordBaseAssetInitRetry records a retry attempt for base asset initialization +func (m *BuildMetrics) RecordBaseAssetInitRetry(ctx context.Context, attempt int, reason string) { + attrs := []attribute.KeyValue{ + attribute.Int("attempt", attempt), + attribute.String("reason", reason), + } + + m.baseAssetInitRetries.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +// RecordBaseAssetInitFailure records a final failure of base asset initialization after all retries +func (m 
*BuildMetrics) RecordBaseAssetInitFailure(ctx context.Context, totalAttempts int, finalError string) { + attrs := []attribute.KeyValue{ + attribute.Int("total_attempts", totalAttempts), + attribute.String("final_error", finalError), + } + + m.baseAssetInitFailures.Add(ctx, 1, metric.WithAttributes(attrs...)) +} diff --git a/go/deploy/builderd/internal/observability/otel.go b/go/deploy/builderd/internal/observability/otel.go index 4fe2a1eae0..5855fa5f66 100644 --- a/go/deploy/builderd/internal/observability/otel.go +++ b/go/deploy/builderd/internal/observability/otel.go @@ -113,11 +113,13 @@ func InitProviders(ctx context.Context, cfg *config.Config, version string) (*Pr // initTracerProvider initializes the tracer provider func initTracerProvider(ctx context.Context, cfg *config.Config, res *resource.Resource) (trace.TracerProvider, func(context.Context) error, error) { // Create OTLP trace exporter + // AIDEV-NOTE: Export timeout must be less than shutdown timeout to prevent races + exportTimeout := 10 * time.Second // Well under the 15s shutdown timeout traceExporter, err := otlptrace.New(ctx, otlptracehttp.NewClient( otlptracehttp.WithEndpoint(cfg.OpenTelemetry.OTLPEndpoint), otlptracehttp.WithInsecure(), // For local development - otlptracehttp.WithTimeout(30*time.Second), + otlptracehttp.WithTimeout(exportTimeout), ), ) if err != nil { @@ -143,10 +145,12 @@ func initMeterProvider(ctx context.Context, cfg *config.Config, res *resource.Re var readers []sdkmetric.Reader // OTLP metric exporter + // AIDEV-NOTE: Export timeout must be less than shutdown timeout to prevent races + exportTimeout := 10 * time.Second // Well under the 15s shutdown timeout metricExporter, err := otlpmetrichttp.New(ctx, otlpmetrichttp.WithEndpoint(cfg.OpenTelemetry.OTLPEndpoint), otlpmetrichttp.WithInsecure(), // For local development - otlpmetrichttp.WithTimeout(30*time.Second), + otlpmetrichttp.WithTimeout(exportTimeout), ) if err != nil { return nil, nil, nil, fmt.Errorf("failed to 
create metric exporter: %w", err) diff --git a/go/deploy/builderd/internal/service/builder.go b/go/deploy/builderd/internal/service/builder.go index cf1826f4a6..665fe0efb4 100644 --- a/go/deploy/builderd/internal/service/builder.go +++ b/go/deploy/builderd/internal/service/builder.go @@ -11,12 +11,12 @@ import ( "time" "connectrpc.com/connect" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "github.com/unkeyed/unkey/go/deploy/builderd/internal/assetmanager" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" "github.com/unkeyed/unkey/go/deploy/builderd/internal/executor" "github.com/unkeyed/unkey/go/deploy/builderd/internal/observability" + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "google.golang.org/protobuf/types/known/timestamppb" ) @@ -28,17 +28,9 @@ type BuilderService struct { executors *executor.Registry assetClient *assetmanager.Client - // TODO: Add these when implemented - // db *database.DB - // storage storage.Backend - // docker *docker.Client - // tenantMgr *tenant.Manager - - // AIDEV-NOTE: Temporary in-memory storage for build jobs until database is implemented builds map[string]*builderv1.BuildJob buildsMutex sync.RWMutex - // AIDEV-NOTE: Shutdown coordination to prevent races shutdownCtx context.Context shutdownCancel context.CancelFunc buildWg sync.WaitGroup @@ -54,7 +46,6 @@ func NewBuilderService( // Create executor registry executors := executor.NewRegistry(logger, cfg, buildMetrics) - // AIDEV-NOTE: Create shutdown context for coordinated service shutdown shutdownCtx, shutdownCancel := context.WithCancel(context.Background()) return &BuilderService{ @@ -434,7 +425,7 @@ func (s *BuilderService) CancelBuild( // Record cancellation metrics if s.buildMetrics != nil { - s.buildMetrics.RecordBuildCancellation(ctx, "unknown", "unknown", 
"unknown") + s.buildMetrics.RecordBuildCancellation(ctx, "unknown", "unknown") } resp := &builderv1.CancelBuildResponse{ diff --git a/go/deploy/builderd/internal/tenant/manager.go b/go/deploy/builderd/internal/tenant/manager.go index 485807faf2..3ed13a7e0e 100644 --- a/go/deploy/builderd/internal/tenant/manager.go +++ b/go/deploy/builderd/internal/tenant/manager.go @@ -7,8 +7,8 @@ import ( "sync" "time" - builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" "github.com/unkeyed/unkey/go/deploy/builderd/internal/config" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" ) // Manager handles tenant isolation, quotas, and resource management diff --git a/go/deploy/cleanup-unkey-deploy.sh b/go/deploy/cleanup-unkey-deploy.sh new file mode 100755 index 0000000000..a39c78b44f --- /dev/null +++ b/go/deploy/cleanup-unkey-deploy.sh @@ -0,0 +1,277 @@ +#!/bin/bash +# Cleanup script for Unkey Deploy services and components +# This script removes all installed services, configurations, and data + +set -euo pipefail + +# Color codes for output +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo "=============================================" +echo "Unkey Deploy Complete Cleanup Script" +echo "=============================================" +echo "" +echo -e "${YELLOW}WARNING: This will remove all Unkey Deploy services and data!${NC}" +echo "Services to be removed:" +echo " - metald" +echo " - builderd" +echo " - assetmanagerd" +echo " - SPIRE Server and Agent" +echo " - All VM bridges and network configurations" +echo " - All data directories" +echo "" +read -p "Are you sure you want to continue? [y/N] " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Cleanup cancelled." + exit 0 +fi + +# Check if running as root +if [ "$EUID" -ne 0 ]; then + echo -e "${RED}Error: This script must be run as root${NC}" + exit 1 +fi + +echo "" +echo "Starting cleanup process..." 
+ +# Function to safely stop and disable a service +stop_and_disable_service() { + local service=$1 + if systemctl list-unit-files | grep -q "^${service}.service"; then + echo "Stopping and disabling ${service}..." + systemctl stop "${service}" 2>/dev/null || true + systemctl disable "${service}" 2>/dev/null || true + fi +} + +# Function to remove systemd service files +remove_service_files() { + local service=$1 + echo "Removing ${service} service files..." + rm -f "/etc/systemd/system/${service}.service" + rm -f "/etc/systemd/system/${service}@.service" + rm -f "/usr/lib/systemd/system/${service}.service" + rm -f "/usr/lib/systemd/system/${service}@.service" +} + +# 1. Stop all services +echo "" +echo "=== Stopping Services ===" +stop_and_disable_service "metald" +stop_and_disable_service "metald-bridge-8" +stop_and_disable_service "metald-bridge-32" +stop_and_disable_service "builderd" +stop_and_disable_service "assetmanagerd" +stop_and_disable_service "spire-agent" +stop_and_disable_service "spire-server" + +# 2. Kill any remaining Firecracker processes +echo "" +echo "=== Cleaning up Firecracker VMs ===" +pkill -9 firecracker 2>/dev/null || true +pkill -9 jailer 2>/dev/null || true + +# Clean up any remaining VM tap interfaces +for tap in $(ip link show | grep -o 'tap[0-9a-f_-]*' | sort -u); do + echo "Removing tap interface: $tap" + ip link delete "$tap" 2>/dev/null || true +done + +# Clean up veth interfaces +for veth in $(ip link show | grep -o 'vh_[0-9a-f]*' | sort -u); do + echo "Removing veth interface: $veth" + ip link delete "$veth" 2>/dev/null || true +done + +# 3. Remove network bridges +echo "" +echo "=== Removing Network Bridges ===" +for i in {0..31}; do + bridge="br-tenant-$i" + if ip link show "$bridge" &>/dev/null; then + echo "Removing bridge: $bridge" + ip link set "$bridge" down 2>/dev/null || true + ip link delete "$bridge" 2>/dev/null || true + fi +done + +# Remove systemd-networkd configurations +echo "Removing network configurations..." 
+rm -rf /etc/systemd/network/10-br-tenant-*.net{dev,work} +rm -rf /run/systemd/network/10-br-tenant-*.net{dev,work} + +# 4. Remove binaries +echo "" +echo "=== Removing Binaries ===" +binaries=( + "/usr/local/bin/metald" + "/usr/local/bin/metald-cli" + "/usr/local/bin/metald-init" + "/usr/local/bin/builderd" + "/usr/local/bin/builderd-cli" + "/usr/local/bin/assetmanagerd" + "/usr/local/bin/assetmanagerd-cli" + "/usr/local/bin/firecracker" + "/usr/local/bin/jailer" + "/opt/spire/bin/spire-server" + "/opt/spire/bin/spire-agent" + "/opt/spire/bin/spire" +) + +for binary in "${binaries[@]}"; do + if [ -f "$binary" ]; then + echo "Removing: $binary" + rm -f "$binary" + fi +done + +# 5. Remove service files +echo "" +echo "=== Removing Service Files ===" +remove_service_files "metald" +remove_service_files "metald-bridge-8" +remove_service_files "metald-bridge-32" +remove_service_files "builderd" +remove_service_files "assetmanagerd" +remove_service_files "spire-server" +remove_service_files "spire-agent" + +# 6. Remove configuration files +echo "" +echo "=== Removing Configuration Files ===" +rm -rf /etc/metald +rm -rf /etc/builderd +rm -rf /etc/assetmanagerd +rm -rf /etc/spire +rm -rf /etc/default/unkey-deploy +rm -f /etc/default/metald +rm -f /etc/default/builderd +rm -f /etc/default/assetmanagerd + +# 7. Remove data directories +echo "" +echo "=== Removing Data Directories ===" +echo -e "${YELLOW}Warning: This will delete all VM images and assets!${NC}" +read -p "Remove all data directories? 
[y/N] " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + # Service data directories + rm -rf /opt/metald + rm -rf /opt/builderd + rm -rf /opt/assetmanagerd + rm -rf /opt/vm-assets + rm -rf /opt/spire + + # Runtime directories + rm -rf /var/lib/metald + rm -rf /var/lib/builderd + rm -rf /var/lib/assetmanagerd + rm -rf /var/lib/spire + rm -rf /var/lib/firecracker + + # Jailer directories + rm -rf /srv/jailer + rm -rf /var/run/firecracker + + # Log directories + rm -rf /var/log/metald + rm -rf /var/log/builderd + rm -rf /var/log/assetmanagerd + rm -rf /var/log/spire + + echo -e "${GREEN}✓${NC} Data directories removed" +else + echo "Skipping data directory removal" +fi + +# 8. Remove users and groups +echo "" +echo "=== Removing Service Users ===" +for user in metald builderd assetmanagerd firecracker spire; do + if id -u "$user" &>/dev/null; then + echo "Removing user: $user" + userdel "$user" 2>/dev/null || true + fi + if getent group "$user" &>/dev/null; then + echo "Removing group: $user" + groupdel "$user" 2>/dev/null || true + fi +done + +# 9. Clean up iptables rules +echo "" +echo "=== Cleaning up iptables rules ===" +# Remove FORWARD rules for VM bridges +for i in {0..31}; do + iptables -D FORWARD -i br-tenant-$i -j ACCEPT 2>/dev/null || true + iptables -D FORWARD -o br-tenant-$i -j ACCEPT 2>/dev/null || true +done + +# Remove NAT rules +iptables -t nat -F 2>/dev/null || true +iptables -t nat -X 2>/dev/null || true + +# 10. Clean up cgroups +echo "" +echo "=== Cleaning up cgroups ===" +if [ -d /sys/fs/cgroup/firecracker ]; then + rmdir /sys/fs/cgroup/firecracker 2>/dev/null || true +fi + +# Clean up any VM-specific cgroups +for cg in $(find /sys/fs/cgroup -name "*firecracker*" -type d 2>/dev/null); do + rmdir "$cg" 2>/dev/null || true +done + +# 11. Reload systemd +echo "" +echo "=== Reloading systemd ===" +systemctl daemon-reload +systemctl restart systemd-networkd + +# 12. 
Clean up any remaining artifacts +echo "" +echo "=== Final cleanup ===" +# Remove any temporary VM files +rm -rf /tmp/firecracker-* +rm -rf /tmp/vm-* +rm -f /tmp/*-vm-console.log + +# Remove any socket files +rm -f /var/run/firecracker.sock* +rm -f /var/run/metald.sock +rm -f /var/run/builderd.sock +rm -f /var/run/assetmanagerd.sock +rm -f /var/lib/spire/agent/agent.sock + +# Clean up any remaining systemd runtime directories +rm -rf /run/systemd/system/metald.service.d +rm -rf /run/systemd/system/builderd.service.d +rm -rf /run/systemd/system/assetmanagerd.service.d + +echo "" +echo "=============================================" +echo -e "${GREEN}✓ Cleanup completed successfully!${NC}" +echo "=============================================" +echo "" +echo "The following have been removed:" +echo " - All Unkey Deploy services and binaries" +echo " - All network bridges and configurations" +echo " - All service users and groups" +echo " - All configuration files" +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo " - All data directories and VM assets" +fi +echo "" +echo "System has been restored to pre-installation state." +echo "" +echo "Note: If you want to reinstall, you'll need to:" +echo " 1. Reinstall SPIRE Server and Agent" +echo " 2. Reinstall and configure all services" +echo " 3. Re-run network bridge setup" +echo " 4. 
Re-download base VM assets" diff --git a/go/deploy/metald/.gitignore b/go/deploy/metald/.gitignore index 81b5230d36..f40086b700 100644 --- a/go/deploy/metald/.gitignore +++ b/go/deploy/metald/.gitignore @@ -85,3 +85,5 @@ storage/ scratch/ rootfs/ workspace/ + +sqlc/networks-seed.sql diff --git a/go/deploy/metald/Makefile b/go/deploy/metald/Makefile index 353adb65ec..421330390c 100644 --- a/go/deploy/metald/Makefile +++ b/go/deploy/metald/Makefile @@ -8,18 +8,18 @@ BUILD_DIR := build VERSION ?= 0.5.2 GOOS ?= $(shell go env GOOS) GOARCH ?= $(shell go env GOARCH) -LDFLAGS := -ldflags "-s -w -X main.version=$(VERSION)" +LDFLAGS := -ldflags "-s=false -w=false -X main.version=$(VERSION)" # Colors for output CYAN := \033[36m RESET := \033[0m # Targets (alphabetically ordered) -.PHONY: build build-linux check ci clean debug deps dev fmt health help install install-tools lint metrics release run service-logs service-logs-full service-restart service-start service-status service-stop setup test test-coverage test-short uninstall version vet +.PHONY: build build-linux check ci clean deps dev fmt help install install-bridge-8 install-bridge-32 lint release run service-logs service-logs-full service-restart service-start service-status service-stop test test-coverage test-short uninstall uninstall-bridge-systemd version vet apply-bridge-8-config apply-bridge-32-config build: deps ## Build the binary @mkdir -p $(BUILD_DIR) - @go build $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/metald + go build $(LDFLAGS) -gcflags="all=-N -l" -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/metald build-linux: ## Build Linux binary for deployment @mkdir -p $(BUILD_DIR) @@ -33,49 +33,30 @@ clean: ## Clean build artifacts @rm -rf $(BUILD_DIR) @rm -f coverage.out coverage.html - -debug: build ## Run with debug logging - @UNKEY_METALD_OTEL_ENABLED=true ./$(BUILD_DIR)/$(BINARY_NAME) - deps: ## Download and tidy dependencies @go mod download @go mod tidy -dev: ## Run the service in development mode - @go run 
./cmd/metald - fmt: ## Format Go code @goimports -w . - - -health: ## Check service health - @curl -s http://localhost:8080/_/health | jq . || echo "Health check failed" + @gofumpt -w . help: ## Display this help message - @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make $(CYAN)$(RESET)\n"} /^[a-zA-Z_-]+:.*?##/ { printf " $(CYAN)%-20s$(RESET) %s\n", $$1, $$2 } /^##@/ { printf "\n%s\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make $(CYAN)$(RESET)\n"} /^[a-zA-Z0-9_-]+:.*##/ { printf " $(CYAN)%-20s$(RESET) %s\n", $$1, $$2 } /^##@/ { printf "\n%s\n", substr($$0, 5) } ' $(MAKEFILE_LIST) -# AIDEV-NOTE: The install target sets up environment configuration by copying metald.env.example -# to /etc/metald/metald.env on first install. The systemd service uses EnvironmentFile to load -# these settings, allowing easy configuration management without modifying the service file. install: build ## Install metald binary and systemd service @sudo systemctl stop metald 2>/dev/null || true @sudo cp $(BUILD_DIR)/$(BINARY_NAME) /usr/local/bin/$(BINARY_NAME) @sudo chmod +x /usr/local/bin/$(BINARY_NAME) @sudo cp contrib/systemd/metald.service /etc/systemd/system/metald.service - @echo "d /run/netns 0755 root root -" | sudo tee /etc/tmpfiles.d/metald-netns.conf >/dev/null @sudo systemctl daemon-reload @sudo systemctl start metald 2>/dev/null || true @echo "✓ metald installed and started" - lint: ## Run linting tools @which golangci-lint >/dev/null || (echo "golangci-lint not found, install from https://golangci-lint.run/usage/install/" && exit 1) @golangci-lint run --disable=godox -metrics: ## Check Prometheus metrics - @curl -s http://localhost:9464/metrics | grep -E "^(vm_|process_|jailer_)" || echo "No VM metrics found" - - release: clean ci build-linux ## Prepare release build @echo "✓ Release build: $(BUILD_DIR)/$(BINARY_NAME)-linux" @@ -103,10 +84,8 @@ service-stop: ## Stop metald service @sudo systemctl stop metald @echo "✓ metald 
stopped" -setup: deps ## Complete development setup - test: ## Run all tests - @go test ./... -v + @go test -json -failfast -v ./... | tparse test-coverage: ## Run tests with coverage report @go test ./... -coverprofile=coverage.out @@ -129,4 +108,4 @@ version: ## Show version information @echo "$(BINARY_NAME) version: $(VERSION)" vet: ## Run go vet - @go vet ./... \ No newline at end of file + @go vet ./... diff --git a/go/deploy/metald/README.md b/go/deploy/metald/README.md index aed48c9b68..5e267b45e1 100644 --- a/go/deploy/metald/README.md +++ b/go/deploy/metald/README.md @@ -77,13 +77,9 @@ Metald integrates with other Unkey Deploy services: ## Security -Metald uses an integrated jailer approach with specific capabilities: -- `CAP_SYS_ADMIN` - Namespace operations -- `CAP_NET_ADMIN` - Network device creation -- `CAP_SYS_CHROOT` - Jail creation -- Additional capabilities for privilege dropping +Metald runs as root to manage network namespaces, interfaces, and iptables operations. This is acceptable as metald is designed to be the sole application on dedicated VM hosts. The integrated jailer still drops privileges to specified UID/GID for individual VM processes, ensuring proper isolation. -The `make install` command configures these automatically. +The `make install` command configures the service with appropriate permissions automatically. 
## Contributing diff --git a/go/deploy/metald/client/client.go b/go/deploy/metald/client/client.go index 0d371a3a09..030a10475e 100644 --- a/go/deploy/metald/client/client.go +++ b/go/deploy/metald/client/client.go @@ -12,7 +12,6 @@ import ( "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1/vmprovisionerv1connect" ) -// AIDEV-NOTE: Metald client with SPIFFE/SPIRE socket integration and tenant isolation // This client provides a high-level interface for metald VM operations with proper authentication // Config holds the configuration for the metald client @@ -26,6 +25,12 @@ type Config struct { // TenantID is the tenant identifier for data scoping TenantID string + // ProjectID identifies the tenants project + ProjectID string + + // EnvironmentID identifies the environment within the project + EnvironmentID string + // TLS configuration TLSMode string // "disabled", "file", or "spiffe" SPIFFESocketPath string // Path to SPIFFE agent socket @@ -41,11 +46,12 @@ type Config struct { // Client provides a high-level interface to metald services type Client struct { - vmService vmprovisionerv1connect.VmServiceClient - tlsProvider tls.Provider - userID string - tenantID string - serverAddr string + vmService vmprovisionerv1connect.VmServiceClient + tlsProvider tls.Provider + tenantID string + projectID string + environmentID string + serverAddr string } // New creates a new metald client with SPIFFE/SPIRE integration @@ -86,9 +92,10 @@ func New(ctx context.Context, config Config) (*Client, error) { // Add authentication and tenant isolation transport httpClient.Transport = &tenantTransport{ - Base: httpClient.Transport, - UserID: config.UserID, - TenantID: config.TenantID, + Base: httpClient.Transport, + ProjectID: config.ProjectID, + TenantID: config.TenantID, + EnvironmentID: config.EnvironmentID, } // Create ConnectRPC client @@ -98,11 +105,12 @@ func New(ctx context.Context, config Config) (*Client, error) { ) return &Client{ - vmService: vmService, - 
tlsProvider: tlsProvider, - userID: config.UserID, - tenantID: config.TenantID, - serverAddr: config.ServerAddress, + vmService: vmService, + tlsProvider: tlsProvider, + tenantID: config.TenantID, + projectID: config.ProjectID, + environmentID: config.EnvironmentID, + serverAddr: config.ServerAddress, }, nil } @@ -118,9 +126,9 @@ func (c *Client) Close() error { func (c *Client) CreateVM(ctx context.Context, req *CreateVMRequest) (*CreateVMResponse, error) { // Convert to protobuf request pbReq := &vmprovisionerv1.CreateVmRequest{ - VmId: req.VMID, - Config: req.Config, - CustomerId: c.userID, + VmId: req.VMID, + Config: req.Config, + TenantId: c.tenantID, } resp, err := c.vmService.CreateVm(ctx, connect.NewRequest(pbReq)) @@ -290,9 +298,10 @@ func (c *Client) GetServerAddress() string { // tenantTransport adds authentication and tenant isolation headers to all requests type tenantTransport struct { - Base http.RoundTripper - UserID string - TenantID string + Base http.RoundTripper + EnvironmentID string + ProjectID string + TenantID string } func (t *tenantTransport) RoundTrip(req *http.Request) (*http.Response, error) { @@ -305,10 +314,12 @@ func (t *tenantTransport) RoundTrip(req *http.Request) (*http.Response, error) { // Set Authorization header with development token format // AIDEV-BUSINESS_RULE: In development, use "dev_user_" format // TODO: Update to proper JWT tokens in production - req2.Header.Set("Authorization", fmt.Sprintf("Bearer dev_user_%s", t.UserID)) + req2.Header.Set("Authorization", fmt.Sprintf("Bearer dev_user_%s", t.TenantID)) // Also set X-Tenant-ID header for tenant identification req2.Header.Set("X-Tenant-ID", t.TenantID) + req2.Header.Set("X-Project-ID", t.ProjectID) + req2.Header.Set("X-Environment-ID", t.EnvironmentID) // Use the base transport, or default if nil base := t.Base diff --git a/go/deploy/metald/client/cmd/metald-cli/main.go b/go/deploy/metald/client/cmd/metald-cli/main.go index 0fccb27f18..f63895fbd2 100644 --- 
a/go/deploy/metald/client/cmd/metald-cli/main.go +++ b/go/deploy/metald/client/cmd/metald-cli/main.go @@ -13,21 +13,20 @@ import ( vmprovisionerv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" ) -// AIDEV-NOTE: CLI tool demonstrating metald client usage with SPIFFE integration -// This provides a command-line interface for VM operations with proper tenant isolation - func main() { var ( - serverAddr = flag.String("server", getEnvOrDefault("UNKEY_METALD_SERVER_ADDRESS", "https://localhost:8080"), "metald server address") - userID = flag.String("user", getEnvOrDefault("UNKEY_METALD_USER_ID", "cli-user"), "user ID for authentication") - tenantID = flag.String("tenant", getEnvOrDefault("UNKEY_METALD_TENANT_ID", "cli-tenant"), "tenant ID for data scoping") - tlsMode = flag.String("tls-mode", getEnvOrDefault("UNKEY_METALD_TLS_MODE", "spiffe"), "TLS mode: disabled, file, or spiffe") - spiffeSocket = flag.String("spiffe-socket", getEnvOrDefault("UNKEY_METALD_SPIFFE_SOCKET", "/var/lib/spire/agent/agent.sock"), "SPIFFE agent socket path") - tlsCert = flag.String("tls-cert", "", "TLS certificate file (for file mode)") - tlsKey = flag.String("tls-key", "", "TLS key file (for file mode)") - tlsCA = flag.String("tls-ca", "", "TLS CA file (for file mode)") - timeout = flag.Duration("timeout", 30*time.Second, "request timeout") - jsonOutput = flag.Bool("json", false, "output results as JSON") + serverAddr = flag.String("server", getEnvOrDefault("UNKEY_METALD_SERVER_ADDRESS", "https://localhost:8080"), "metald server address") + userID = flag.String("user", getEnvOrDefault("UNKEY_METALD_USER_ID", "cli-user"), "user ID for authentication") + tenantID = flag.String("tenant", getEnvOrDefault("UNKEY_METALD_TENANT_ID", "cli-tenant"), "tenant ID for data scoping") + projectID = flag.String("project-id", getEnvOrDefault("UNKEY_METALD_PROJECT_ID", "metald-cli-test"), "project ID for data scoping") + environmentID = flag.String("environment-id", 
getEnvOrDefault("UNKEY_METALD_ENVIRONMENT_ID", "development"), "environment ID for data scoping") + tlsMode = flag.String("tls-mode", getEnvOrDefault("UNKEY_METALD_TLS_MODE", "spiffe"), "TLS mode: disabled, file, or spiffe") + spiffeSocket = flag.String("spiffe-socket", getEnvOrDefault("UNKEY_METALD_SPIFFE_SOCKET", "/var/lib/spire/agent/agent.sock"), "SPIFFE agent socket path") + tlsCert = flag.String("tls-cert", "", "TLS certificate file (for file mode)") + tlsKey = flag.String("tls-key", "", "TLS key file (for file mode)") + tlsCA = flag.String("tls-ca", "", "TLS CA file (for file mode)") + timeout = flag.Duration("timeout", 30*time.Second, "request timeout") + jsonOutput = flag.Bool("json", false, "output results as JSON") // VM configuration options configFile = flag.String("config", "", "path to VM configuration file (JSON)") @@ -51,6 +50,8 @@ func main() { ServerAddress: *serverAddr, UserID: *userID, TenantID: *tenantID, + ProjectID: *projectID, + EnvironmentID: *environmentID, TLSMode: *tlsMode, SPIFFESocketPath: *spiffeSocket, TLSCertFile: *tlsCert, @@ -192,13 +193,7 @@ func createVMConfig(options VMConfigOptions) (*vmprovisionerv1.VmConfig, error) return configFile.ToVMConfig() } - // Start with template - templateName := options.Template - if templateName == "" { - templateName = "standard" - } - template := client.VMTemplate(templateName) - builder := client.NewVMConfigFromTemplate(template) + builder := client.NewVMConfigBuilder() // Apply Docker image configuration if specified if options.DockerImage != "" { @@ -275,15 +270,32 @@ func handleCreate(ctx context.Context, metaldClient *client.Client, options VMCo log.Fatalf("Failed to create VM: %v", err) } + // Fetch VM info to get IP address + var ipAddress string + vmInfo, err := metaldClient.GetVMInfo(ctx, resp.VMID) + if err != nil { + // Don't fail on network info error, just log it + fmt.Fprintf(os.Stderr, "Warning: Could not fetch IP address: %v\n", err) + } else if vmInfo.NetworkInfo != nil { + 
ipAddress = vmInfo.NetworkInfo.IpAddress + } + if jsonOutput { - outputJSON(map[string]any{ + result := map[string]any{ "vm_id": resp.VMID, "state": resp.State.String(), - }) + } + if ipAddress != "" { + result["ip_address"] = ipAddress + } + outputJSON(result) } else { fmt.Printf("VM created successfully:\n") fmt.Printf(" VM ID: %s\n", resp.VMID) fmt.Printf(" State: %s\n", resp.State.String()) + if ipAddress != "" { + fmt.Printf(" IP Address: %s\n", ipAddress) + } } } @@ -577,19 +589,39 @@ func handleCreateAndBoot(ctx context.Context, metaldClient *client.Client, optio log.Fatalf("Failed to boot VM: %v", err) } + // Wait a moment for VM to boot and get IP address + time.Sleep(3 * time.Second) + + // Fetch VM info to get IP address + var ipAddress string + vmInfo, err := metaldClient.GetVMInfo(ctx, createResp.VMID) + if err != nil { + // Don't fail on network info error, just log it + fmt.Fprintf(os.Stderr, "Warning: Could not fetch IP address: %v\n", err) + } else if vmInfo.NetworkInfo != nil { + ipAddress = vmInfo.NetworkInfo.IpAddress + } + if jsonOutput { - outputJSON(map[string]any{ + result := map[string]any{ "vm_id": createResp.VMID, "create_state": createResp.State.String(), "boot_success": bootResp.Success, "boot_state": bootResp.State.String(), - }) + } + if ipAddress != "" { + result["ip_address"] = ipAddress + } + outputJSON(result) } else { fmt.Printf("VM created and booted successfully:\n") fmt.Printf(" VM ID: %s\n", createResp.VMID) fmt.Printf(" Create State: %s\n", createResp.State.String()) fmt.Printf(" Boot Success: %v\n", bootResp.Success) fmt.Printf(" Boot State: %s\n", bootResp.State.String()) + if ipAddress != "" { + fmt.Printf(" IP Address: %s\n", ipAddress) + } } } diff --git a/go/deploy/metald/client/config.go b/go/deploy/metald/client/config.go index e244007f55..5fdb244c66 100644 --- a/go/deploy/metald/client/config.go +++ b/go/deploy/metald/client/config.go @@ -4,14 +4,10 @@ import ( "encoding/json" "fmt" "os" - "path/filepath" 
vmprovisionerv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" ) -// AIDEV-NOTE: Configuration file support for VM templates and custom configurations -// This allows users to define VM configurations in JSON/YAML files for reuse - // VMConfigFile represents a VM configuration that can be loaded from/saved to a file type VMConfigFile struct { // Name is a human-readable name for this configuration @@ -123,37 +119,11 @@ func LoadVMConfigFromFile(filename string) (*VMConfigFile, error) { return &config, nil } -// SaveVMConfigToFile saves a VM configuration to a JSON file -func SaveVMConfigToFile(config *VMConfigFile, filename string) error { - // Create directory if it doesn't exist - dir := filepath.Dir(filename) - if err := os.MkdirAll(dir, 0755); err != nil { - return fmt.Errorf("failed to create directory %s: %w", dir, err) - } - - data, err := json.MarshalIndent(config, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal config: %w", err) - } - - if err := os.WriteFile(filename, data, 0644); err != nil { - return fmt.Errorf("failed to write config file %s: %w", filename, err) - } - - return nil -} - // ToVMConfig converts a VMConfigFile to a protobuf VmConfig func (c *VMConfigFile) ToVMConfig() (*vmprovisionerv1.VmConfig, error) { var builder *VMConfigBuilder - // Start with template if specified - if c.Template != "" { - template := VMTemplate(c.Template) - builder = NewVMConfigFromTemplate(template) - } else { - builder = NewVMConfigBuilder() - } + builder = NewVMConfigBuilder() // Override with specific configuration builder.WithCPU(c.CPU.VCPUCount, c.CPU.MaxVCPUCount) @@ -323,42 +293,3 @@ func formatNetworkMode(mode vmprovisionerv1.NetworkMode) string { return "dual_stack" } } - -// GetBuiltinConfigPath returns the path to a built-in configuration file -func GetBuiltinConfigPath(name string) string { - return filepath.Join("configs", fmt.Sprintf("%s.json", name)) -} - -// CreateBuiltinConfigs creates built-in configuration files 
for common templates -func CreateBuiltinConfigs(configDir string) error { - templates := map[string]VMTemplate{ - "minimal": TemplateMinimal, - "standard": TemplateStandard, - "high-cpu": TemplateHighCPU, - "high-memory": TemplateHighMemory, - "development": TemplateDevelopment, - } - - descriptions := map[string]string{ - "minimal": "Minimal VM configuration with basic resources for lightweight workloads", - "standard": "Standard VM configuration with balanced CPU and memory for general workloads", - "high-cpu": "High-CPU VM configuration optimized for compute-intensive workloads", - "high-memory": "High-memory VM configuration optimized for memory-intensive workloads", - "development": "Development VM configuration with extra resources and development tools", - } - - for name, template := range templates { - builder := NewVMConfigFromTemplate(template) - config := builder.Build() - - configFile := FromVMConfig(config, name, descriptions[name]) - configFile.Template = string(template) - - filename := filepath.Join(configDir, fmt.Sprintf("%s.json", name)) - if err := SaveVMConfigToFile(configFile, filename); err != nil { - return fmt.Errorf("failed to create config file %s: %w", filename, err) - } - } - - return nil -} diff --git a/go/deploy/metald/client/example_test.go b/go/deploy/metald/client/example_test.go deleted file mode 100644 index 5320e1ec8e..0000000000 --- a/go/deploy/metald/client/example_test.go +++ /dev/null @@ -1,277 +0,0 @@ -package client_test - -import ( - "context" - "fmt" - "log" - "time" - - "github.com/unkeyed/unkey/go/deploy/metald/client" - vmprovisionerv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// AIDEV-NOTE: Example demonstrating metald client usage with SPIFFE authentication -// This shows the complete VM lifecycle using the high-level client interface - -func ExampleClient_CreateAndBootVM() { - ctx := context.Background() - - // Create client with SPIFFE authentication - config := client.Config{ - 
ServerAddress: "https://metald:8080", - TenantID: "example-customer-123", - TLSMode: "spiffe", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - Timeout: 30 * time.Second, - } - - metaldClient, err := client.New(ctx, config) - if err != nil { - log.Fatalf("Failed to create metald client: %v", err) - } - defer metaldClient.Close() - - // Create VM configuration - vmConfig := &vmprovisionerv1.VmConfig{ - Cpu: &vmprovisionerv1.CpuConfig{ - VcpuCount: 2, - MaxVcpuCount: 4, - }, - Memory: &vmprovisionerv1.MemoryConfig{ - SizeBytes: 1 * 1024 * 1024 * 1024, // 1GB - HotplugEnabled: true, - MaxSizeBytes: 4 * 1024 * 1024 * 1024, // 4GB max - }, - Boot: &vmprovisionerv1.BootConfig{ - KernelPath: "/opt/vm-assets/vmlinux", - KernelArgs: "console=ttyS0 reboot=k panic=1 pci=off", - }, - Storage: []*vmprovisionerv1.StorageDevice{ - { - Id: "rootfs", - Path: "/opt/vm-assets/rootfs.ext4", - ReadOnly: false, - IsRootDevice: true, - InterfaceType: "virtio-blk", - }, - }, - Network: []*vmprovisionerv1.NetworkInterface{ - { - Id: "eth0", - InterfaceType: "virtio-net", - Mode: vmprovisionerv1.NetworkMode_NETWORK_MODE_DUAL_STACK, - Ipv4Config: &vmprovisionerv1.IPv4Config{ - Dhcp: true, - }, - Ipv6Config: &vmprovisionerv1.IPv6Config{ - Slaac: true, - PrivacyExtensions: true, - }, - }, - }, - Console: &vmprovisionerv1.ConsoleConfig{ - Enabled: true, - Output: "/tmp/vm-console.log", - ConsoleType: "serial", - }, - Metadata: map[string]string{ - "purpose": "example", - "environment": "development", - "tenant": config.TenantID, - }, - } - - // Create the VM - createReq := &client.CreateVMRequest{ - VMID: "", // Let metald generate a VM ID - Config: vmConfig, - } - - createResp, err := metaldClient.CreateVM(ctx, createReq) - if err != nil { - log.Fatalf("Failed to create VM: %v", err) - } - - fmt.Printf("VM created: %s (state: %s)\n", createResp.VMID, createResp.State) - - // Boot the VM - bootResp, err := metaldClient.BootVM(ctx, createResp.VMID) - if err != nil { - log.Fatalf("Failed 
to boot VM: %v", err) - } - - fmt.Printf("VM booted: success=%v (state: %s)\n", bootResp.Success, bootResp.State) - - // Get VM info - vmInfo, err := metaldClient.GetVMInfo(ctx, createResp.VMID) - if err != nil { - log.Fatalf("Failed to get VM info: %v", err) - } - - fmt.Printf("VM info: %s (state: %s)\n", vmInfo.VMID, vmInfo.State) - if vmInfo.Config != nil { - fmt.Printf(" CPUs: %d, Memory: %d MB\n", - vmInfo.Config.Cpu.VcpuCount, - vmInfo.Config.Memory.SizeBytes/(1024*1024)) - } - - // List all VMs - listReq := &client.ListVMsRequest{ - PageSize: 10, - } - - listResp, err := metaldClient.ListVMs(ctx, listReq) - if err != nil { - log.Fatalf("Failed to list VMs: %v", err) - } - - fmt.Printf("Total VMs for tenant %s: %d\n", config.TenantID, listResp.TotalCount) - - // Shutdown the VM - shutdownReq := &client.ShutdownVMRequest{ - VMID: createResp.VMID, - Force: false, - TimeoutSeconds: 30, - } - - shutdownResp, err := metaldClient.ShutdownVM(ctx, shutdownReq) - if err != nil { - log.Fatalf("Failed to shutdown VM: %v", err) - } - - fmt.Printf("VM shutdown: success=%v (state: %s)\n", shutdownResp.Success, shutdownResp.State) - - // Output: - // VM created: vm-123 (state: VM_STATE_CREATED) - // VM booted: success=true (state: VM_STATE_RUNNING) - // VM info: vm-123 (state: VM_STATE_RUNNING) - // CPUs: 2, Memory: 1024 MB - // Total VMs for customer example-customer-123: 1 - // VM shutdown: success=true (state: VM_STATE_SHUTDOWN) -} - -func ExampleClient_WithTLSModes() { - ctx := context.Background() - - // Example 1: SPIFFE mode (production default) - spiffeConfig := client.Config{ - ServerAddress: "https://metald:8080", - TenantID: "prod-customer", - TLSMode: "spiffe", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - } - - spiffeClient, err := client.New(ctx, spiffeConfig) - if err != nil { - log.Printf("SPIFFE client error: %v", err) - } else { - defer spiffeClient.Close() - fmt.Printf("SPIFFE client created for customer: %s\n", spiffeClient.GetTenantID()) - } 
- - // Example 2: File-based TLS mode - fileConfig := client.Config{ - ServerAddress: "https://metald:8080", - TenantID: "test-customer", - TLSMode: "file", - TLSCertFile: "/etc/ssl/certs/client.crt", - TLSKeyFile: "/etc/ssl/private/client.key", - TLSCAFile: "/etc/ssl/certs/ca.crt", - } - - fileClient, err := client.New(ctx, fileConfig) - if err != nil { - log.Printf("File TLS client error: %v", err) - } else { - defer fileClient.Close() - fmt.Printf("File TLS client created for customer: %s\n", fileClient.GetTenantID()) - } - - // Example 3: Disabled TLS mode (development only) - devConfig := client.Config{ - ServerAddress: "http://localhost:8080", - TenantID: "dev-customer", - TLSMode: "disabled", - } - - devClient, err := client.New(ctx, devConfig) - if err != nil { - log.Printf("Development client error: %v", err) - } else { - defer devClient.Close() - fmt.Printf("Development client created for customer: %s\n", devClient.GetTenantID()) - } - - // Output: - // SPIFFE client created for customer: prod-customer - // File TLS client created for customer: test-customer - // Development client created for customer: dev-customer -} - -func ExampleClient_VMLifecycleOperations() { - ctx := context.Background() - - config := client.Config{ - ServerAddress: "https://metald:8080", - TenantID: "lifecycle-demo", - TLSMode: "spiffe", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - } - - metaldClient, err := client.New(ctx, config) - if err != nil { - log.Fatalf("Failed to create client: %v", err) - } - defer metaldClient.Close() - - // Assume we have a VM ID from previous creation - vmID := "vm-lifecycle-example" - - // Pause VM - pauseResp, err := metaldClient.PauseVM(ctx, vmID) - if err != nil { - log.Printf("Pause failed: %v", err) - } else { - fmt.Printf("VM paused: success=%v (state: %s)\n", pauseResp.Success, pauseResp.State) - } - - // Resume VM - resumeResp, err := metaldClient.ResumeVM(ctx, vmID) - if err != nil { - log.Printf("Resume failed: %v", err) - } 
else { - fmt.Printf("VM resumed: success=%v (state: %s)\n", resumeResp.Success, resumeResp.State) - } - - // Reboot VM - rebootReq := &client.RebootVMRequest{ - VMID: vmID, - Force: false, - } - - rebootResp, err := metaldClient.RebootVM(ctx, rebootReq) - if err != nil { - log.Printf("Reboot failed: %v", err) - } else { - fmt.Printf("VM rebooted: success=%v (state: %s)\n", rebootResp.Success, rebootResp.State) - } - - // Delete VM - deleteReq := &client.DeleteVMRequest{ - VMID: vmID, - Force: false, - } - - deleteResp, err := metaldClient.DeleteVM(ctx, deleteReq) - if err != nil { - log.Printf("Delete failed: %v", err) - } else { - fmt.Printf("VM deleted: success=%v\n", deleteResp.Success) - } - - // Output: - // VM paused: success=true (state: VM_STATE_PAUSED) - // VM resumed: success=true (state: VM_STATE_RUNNING) - // VM rebooted: success=true (state: VM_STATE_RUNNING) - // VM deleted: success=true -} diff --git a/go/deploy/metald/client/examples/README.md b/go/deploy/metald/client/examples/README.md deleted file mode 100644 index dad8fe2e57..0000000000 --- a/go/deploy/metald/client/examples/README.md +++ /dev/null @@ -1,202 +0,0 @@ -# Metald Client Configuration Examples - -This directory contains example VM configurations demonstrating different use cases and scenarios. 
- -## Configuration Files - -### `minimal.json` -- **Purpose**: Lightweight VM for basic tasks -- **Resources**: 1 vCPU, 512MB RAM -- **Use Cases**: Simple services, testing, CI/CD agents - -```bash -# Create minimal VM -metald-cli -config=examples/configs/minimal.json create-and-boot -``` - -### `web-server.json` -- **Purpose**: High-performance web server -- **Resources**: 8 vCPUs, 8GB RAM (scalable to 16 vCPUs, 32GB RAM) -- **Features**: NGINX with Docker support, separate log storage -- **Use Cases**: Production web servers, load balancers, API gateways - -```bash -# Create web server VM -metald-cli -config=examples/configs/web-server.json create-and-boot web-01 -``` - -### `database.json` -- **Purpose**: High-memory database server -- **Resources**: 8 vCPUs, 32GB RAM (scalable to 16 vCPUs, 128GB RAM) -- **Features**: PostgreSQL with separate data, log, and backup storage -- **Use Cases**: Primary databases, data warehouses, analytics engines - -```bash -# Create database server -metald-cli -config=examples/configs/database.json create-and-boot db-primary -``` - -### `development.json` -- **Purpose**: Development environment with tools -- **Resources**: 6 vCPUs, 16GB RAM (scalable to 12 vCPUs, 64GB RAM) -- **Features**: Ubuntu with development tools, Docker, workspace storage -- **Use Cases**: Developer workspaces, build environments, testing - -```bash -# Create development environment -metald-cli -config=examples/configs/development.json create-and-boot dev-env -``` - -## Customizing Configurations - -### 1. Template-Based Approach -Start with a built-in template and customize: - -```bash -# Generate base configuration -metald-cli -template=standard config-gen > my-config.json - -# Edit the configuration file -vim my-config.json - -# Use the custom configuration -metald-cli -config=my-config.json create-and-boot -``` - -### 2. 
Override Parameters -Use CLI flags to override specific configuration: - -```bash -# Use config file but override CPU and memory -metald-cli -config=web-server.json -cpu=16 -memory=65536 create-and-boot -``` - -### 3. Docker Image Integration -Configure VMs for specific Docker images: - -```bash -# Create VM for specific Docker image -metald-cli -docker-image=redis:alpine -template=high-memory create-and-boot redis-cache -``` - -## Configuration Validation - -Always validate configurations before use: - -```bash -# Validate configuration file -metald-cli config-validate examples/configs/web-server.json - -# Output validation results as JSON -metald-cli config-validate examples/configs/database.json -json -``` - -## Common Configuration Patterns - -### High Availability Setup -```bash -# Create multiple web servers -for i in {1..3}; do - metald-cli -config=examples/configs/web-server.json create-and-boot web-$i -done - -# Create database primary and replica -metald-cli -config=examples/configs/database.json create-and-boot db-primary -metald-cli -config=examples/configs/database.json create-and-boot db-replica -``` - -### Development Team Setup -```bash -# Create development environments for team -for dev in alice bob charlie; do - metald-cli -config=examples/configs/development.json create-and-boot dev-$dev -done -``` - -### Microservices Deployment -```bash -# Create VMs for different services -metald-cli -docker-image=my-api:latest -template=standard create-and-boot api-service -metald-cli -docker-image=my-worker:latest -template=high-cpu create-and-boot worker-service -metald-cli -config=examples/configs/database.json create-and-boot db-service -metald-cli -config=examples/configs/web-server.json create-and-boot proxy-service -``` - -## Best Practices - -### Resource Planning -1. **Start Small**: Begin with minimal resources and scale up -2. **Enable Hotplug**: Allow memory and CPU scaling without downtime -3. 
**Separate Storage**: Use dedicated storage for data, logs, and backups -4. **Monitor Usage**: Track actual resource utilization - -### Security Configuration -1. **Network Isolation**: Use appropriate network modes (IPv4-only for internal services) -2. **Static IPs**: Configure static IPs for database and backend services -3. **Metadata**: Include environment and team information for auditing -4. **Console Logging**: Enable console output for debugging - -### Storage Configuration -1. **Root Filesystem**: Use appropriate size for OS and applications -2. **Data Storage**: Separate data storage from OS storage -3. **Log Storage**: Dedicated storage for logs to prevent disk space issues -4. **Backup Storage**: Include backup storage for critical services - -### Metadata Best Practices -Include comprehensive metadata for operations: - -```json -{ - "metadata": { - "purpose": "web-server", - "environment": "production", - "team": "platform", - "service": "nginx", - "version": "1.21.6", - "scaling_group": "web-tier", - "backup_enabled": "true", - "monitoring": "enabled", - "created_by": "deployment-system", - "cost_center": "engineering" - } -} -``` - -### Network Configuration -Choose appropriate network modes: -- **dual_stack**: Most services (IPv4 + IPv6) -- **ipv4_only**: Internal services, databases -- **ipv6_only**: IPv6-only environments - -## Troubleshooting - -### Configuration Validation Errors -```bash -# Check for common issues -metald-cli config-validate my-config.json - -# Common problems: -# - Missing root storage device -# - Invalid CPU/memory ratios -# - Incorrect network mode specifications -# - Missing required fields -``` - -### Resource Constraints -```bash -# Monitor VM resource usage -metald-cli info vm-12345 - -# Scale resources if needed -# Edit configuration file and recreate VM -# Or use hotplug for memory scaling -``` - -### Storage Issues -```bash -# Verify storage paths exist -ls -la /opt/vm-assets/ - -# Check storage device configuration 
-metald-cli info vm-12345 -json | jq '.config.storage' -``` \ No newline at end of file diff --git a/go/deploy/metald/client/examples/configs/database.json b/go/deploy/metald/client/examples/configs/database.json deleted file mode 100644 index 7dd9d63083..0000000000 --- a/go/deploy/metald/client/examples/configs/database.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "name": "database-server", - "description": "High-memory database server with persistent storage", - "template": "high-memory", - "cpu": { - "vcpu_count": 8, - "max_vcpu_count": 16 - }, - "memory": { - "size_mb": 32768, - "max_size_mb": 131072, - "hotplug_enabled": true - }, - "boot": { - "kernel_path": "/opt/vm-assets/vmlinux", - "kernel_args": "console=ttyS0 reboot=k panic=1 pci=off" - }, - "storage": [ - { - "id": "rootfs", - "path": "/opt/vm-assets/postgres-rootfs.ext4", - "read_only": false, - "is_root_device": true, - "interface_type": "virtio-blk", - "options": { - "docker_image": "postgres:15", - "auto_build": "true" - } - }, - { - "id": "data", - "path": "/opt/vm-assets/postgres-data.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk", - "options": { - "mount_point": "/var/lib/postgresql/data", - "filesystem": "ext4" - } - }, - { - "id": "logs", - "path": "/opt/vm-assets/postgres-logs.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk", - "options": { - "mount_point": "/var/log/postgresql", - "filesystem": "ext4" - } - }, - { - "id": "backup", - "path": "/opt/vm-assets/postgres-backup.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk", - "options": { - "mount_point": "/backup", - "filesystem": "ext4" - } - } - ], - "network": [ - { - "id": "private", - "interface_type": "virtio-net", - "mode": "ipv4_only", - "ipv4": { - "dhcp": false, - "static_ip": "10.0.1.100/24", - "gateway": "10.0.1.1", - "dns_servers": ["10.0.1.1", "8.8.8.8"] - } - } - ], - "console": { - "enabled": true, - "output": 
"/var/log/database-console.log", - "console_type": "serial" - }, - "metadata": { - "purpose": "database", - "environment": "production", - "team": "data", - "service": "postgresql", - "role": "primary", - "backup_enabled": "true", - "monitoring": "enabled" - } -} diff --git a/go/deploy/metald/client/examples/configs/development.json b/go/deploy/metald/client/examples/configs/development.json deleted file mode 100644 index c822d2093b..0000000000 --- a/go/deploy/metald/client/examples/configs/development.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "name": "development-environment", - "description": "Development VM with tools and workspace storage", - "template": "development", - "cpu": { - "vcpu_count": 6, - "max_vcpu_count": 12 - }, - "memory": { - "size_mb": 16384, - "max_size_mb": 65536, - "hotplug_enabled": true - }, - "boot": { - "kernel_path": "/opt/vm-assets/vmlinux", - "kernel_args": "console=ttyS0 reboot=k panic=1 pci=off" - }, - "storage": [ - { - "id": "rootfs", - "path": "/opt/vm-assets/dev-rootfs.ext4", - "read_only": false, - "is_root_device": true, - "interface_type": "virtio-blk", - "options": { - "docker_image": "ubuntu:22.04", - "auto_build": "true", - "packages": "git,curl,wget,vim,docker.io,build-essential,nodejs,npm,python3,python3-pip" - } - }, - { - "id": "workspace", - "path": "/opt/vm-assets/dev-workspace.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk", - "options": { - "mount_point": "/workspace", - "filesystem": "ext4", - "size": "100GB" - } - }, - { - "id": "docker", - "path": "/opt/vm-assets/dev-docker.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk", - "options": { - "mount_point": "/var/lib/docker", - "filesystem": "ext4", - "size": "50GB" - } - } - ], - "network": [ - { - "id": "eth0", - "interface_type": "virtio-net", - "mode": "dual_stack", - "ipv4": { - "dhcp": true - }, - "ipv6": { - "slaac": true, - "privacy_extensions": true - } - } - ], - "console": { 
- "enabled": true, - "output": "/tmp/dev-vm-console.log", - "console_type": "serial" - }, - "metadata": { - "template": "development", - "purpose": "development", - "environment": "dev", - "tools": "git,docker,nodejs,python", - "user": "developer", - "ssh_enabled": "true", - "code_server": "enabled" - } -} diff --git a/go/deploy/metald/client/examples/configs/minimal.json b/go/deploy/metald/client/examples/configs/minimal.json deleted file mode 100644 index 11f502d12f..0000000000 --- a/go/deploy/metald/client/examples/configs/minimal.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "name": "minimal", - "description": "Minimal VM configuration with basic resources for lightweight workloads", - "template": "minimal", - "cpu": { - "vcpu_count": 1, - "max_vcpu_count": 2 - }, - "memory": { - "size_mb": 512, - "max_size_mb": 1024, - "hotplug_enabled": false - }, - "boot": { - "kernel_path": "/opt/vm-assets/vmlinux", - "kernel_args": "console=ttyS0 reboot=k panic=1 pci=off nomodeset" - }, - "storage": [ - { - "id": "rootfs", - "path": "/opt/vm-assets/minimal-rootfs.ext4", - "read_only": false, - "is_root_device": true, - "interface_type": "virtio-blk" - } - ], - "network": [ - { - "id": "eth0", - "interface_type": "virtio-net", - "mode": "dual_stack", - "ipv4": { - "dhcp": true - }, - "ipv6": { - "slaac": true, - "privacy_extensions": true - } - } - ], - "console": { - "enabled": true, - "output": "/tmp/minimal-vm-console.log", - "console_type": "serial" - }, - "metadata": { - "template": "minimal", - "purpose": "lightweight" - } -} diff --git a/go/deploy/metald/client/examples/configs/web-server.json b/go/deploy/metald/client/examples/configs/web-server.json deleted file mode 100644 index 996815d8b6..0000000000 --- a/go/deploy/metald/client/examples/configs/web-server.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "name": "web-server", - "description": "High-performance web server with load balancing capabilities", - "template": "high-cpu", - "cpu": { - "vcpu_count": 8, - 
"max_vcpu_count": 16 - }, - "memory": { - "size_mb": 8192, - "max_size_mb": 32768, - "hotplug_enabled": true - }, - "boot": { - "kernel_path": "/opt/vm-assets/vmlinux", - "kernel_args": "console=ttyS0 reboot=k panic=1 pci=off" - }, - "storage": [ - { - "id": "rootfs", - "path": "/opt/vm-assets/nginx-rootfs.ext4", - "read_only": false, - "is_root_device": true, - "interface_type": "virtio-blk", - "options": { - "docker_image": "nginx:alpine", - "auto_build": "true" - } - }, - { - "id": "logs", - "path": "/opt/vm-assets/web-logs.ext4", - "read_only": false, - "is_root_device": false, - "interface_type": "virtio-blk" - } - ], - "network": [ - { - "id": "public", - "interface_type": "virtio-net", - "mode": "dual_stack", - "ipv4": { - "dhcp": true - }, - "ipv6": { - "slaac": true, - "privacy_extensions": false - } - } - ], - "console": { - "enabled": true, - "output": "/var/log/web-server-console.log", - "console_type": "serial" - }, - "metadata": { - "purpose": "web-server", - "environment": "production", - "team": "platform", - "service": "nginx", - "scaling_group": "web-tier" - } -} diff --git a/go/deploy/metald/client/go.mod b/go/deploy/metald/client/go.mod index 36a2120332..9c256ce241 100644 --- a/go/deploy/metald/client/go.mod +++ b/go/deploy/metald/client/go.mod @@ -1,6 +1,6 @@ module github.com/unkeyed/unkey/go/deploy/metald/client -go 1.24.4 +go 1.25.0 require ( connectrpc.com/connect v1.18.1 @@ -14,13 +14,13 @@ require ( github.com/spiffe/go-spiffe/v2 v2.5.0 // indirect github.com/unkeyed/unkey/go/deploy/pkg/spiffe v0.0.0-00010101000000-000000000000 // indirect github.com/zeebo/errs v1.4.0 // indirect - golang.org/x/crypto v0.39.0 // indirect + golang.org/x/crypto v0.40.0 // indirect golang.org/x/net v0.41.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.34.0 // indirect + golang.org/x/text v0.27.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 // 
indirect google.golang.org/grpc v1.73.0 // indirect - google.golang.org/protobuf v1.36.6 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) replace github.com/unkeyed/unkey/go/deploy/metald => .. diff --git a/go/deploy/metald/client/go.sum b/go/deploy/metald/client/go.sum index 665693e8b6..ab71545fb2 100644 --- a/go/deploy/metald/client/go.sum +++ b/go/deploy/metald/client/go.sum @@ -36,19 +36,21 @@ go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFh go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= +golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= +golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= +golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 h1:fc6jSaCT0vBduLYZHYrBBNY4dsWuvgyff9noRNDdBeE= 
google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/go/deploy/metald/client/vmconfig.go b/go/deploy/metald/client/vmconfig.go index a843e24f05..29c9a86a63 100644 --- a/go/deploy/metald/client/vmconfig.go +++ b/go/deploy/metald/client/vmconfig.go @@ -186,8 +186,8 @@ func (b *VMConfigBuilder) AddIPv6OnlyNetwork(id string) *VMConfigBuilder { // AddNetworkWithCustomConfig adds a network interface with custom IPv4/IPv6 configuration func (b *VMConfigBuilder) AddNetworkWithCustomConfig(id, interfaceType string, mode vmprovisionerv1.NetworkMode, - ipv4Config *vmprovisionerv1.IPv4Config, ipv6Config *vmprovisionerv1.IPv6Config) *VMConfigBuilder { - + ipv4Config *vmprovisionerv1.IPv4Config, ipv6Config *vmprovisionerv1.IPv6Config, +) *VMConfigBuilder { if interfaceType == "" { interfaceType = "virtio-net" } @@ -335,71 +335,6 @@ const ( TemplateDevelopment VMTemplate = "development" ) -// NewVMConfigFromTemplate creates a VM configuration builder from a predefined template -func NewVMConfigFromTemplate(template VMTemplate) *VMConfigBuilder { - builder := NewVMConfigBuilder() - - switch template { - case TemplateMinimal: - builder.WithCPU(1, 2). - WithMemoryMB(512, 1024, false). 
// 512MB, max 1GB - WithDefaultBoot("console=ttyS0 reboot=k panic=1 pci=off nomodeset"). - AddRootStorage("/opt/vm-assets/minimal-rootfs.ext4"). - AddDefaultNetwork(). - WithDefaultConsole("/tmp/minimal-vm-console.log"). - AddMetadata("template", "minimal"). - AddMetadata("purpose", "lightweight") - - case TemplateStandard: - builder.WithCPU(2, 4). - WithMemoryGB(2, 8, true). // 2GB, max 8GB, hotplug enabled - WithDefaultBoot("console=ttyS0 reboot=k panic=1 pci=off"). - AddRootStorage("/opt/vm-assets/rootfs.ext4"). - AddDefaultNetwork(). - WithDefaultConsole("/tmp/standard-vm-console.log"). - AddMetadata("template", "standard"). - AddMetadata("purpose", "general") - - case TemplateHighCPU: - builder.WithCPU(8, 16). - WithMemoryGB(4, 16, true). // 4GB, max 16GB - WithDefaultBoot("console=ttyS0 reboot=k panic=1 pci=off"). - AddRootStorage("/opt/vm-assets/rootfs.ext4"). - AddDefaultNetwork(). - WithDefaultConsole("/tmp/high-cpu-vm-console.log"). - AddMetadata("template", "high-cpu"). - AddMetadata("purpose", "compute-intensive") - - case TemplateHighMemory: - builder.WithCPU(4, 8). - WithMemoryGB(16, 64, true). // 16GB, max 64GB - WithDefaultBoot("console=ttyS0 reboot=k panic=1 pci=off"). - AddRootStorage("/opt/vm-assets/rootfs.ext4"). - AddDefaultNetwork(). - WithDefaultConsole("/tmp/high-memory-vm-console.log"). - AddMetadata("template", "high-memory"). - AddMetadata("purpose", "memory-intensive") - - case TemplateDevelopment: - builder.WithCPU(4, 8). - WithMemoryGB(8, 32, true). // 8GB, max 32GB - WithDefaultBoot("console=ttyS0 reboot=k panic=1 pci=off"). - AddRootStorage("/opt/vm-assets/dev-rootfs.ext4"). - AddDataStorage("workspace", "/opt/vm-assets/dev-workspace.ext4", false). - AddDefaultNetwork(). - WithDefaultConsole("/tmp/dev-vm-console.log"). - AddMetadata("template", "development"). - AddMetadata("purpose", "development"). 
- AddMetadata("environment", "dev") - - default: - // Return standard template for unknown templates - return NewVMConfigFromTemplate(TemplateStandard) - } - - return builder -} - // ForDockerImage configures the VM for running a specific Docker image func (b *VMConfigBuilder) ForDockerImage(imageName string) *VMConfigBuilder { // Add Docker-specific metadata and storage configuration diff --git a/go/deploy/metald/cmd/metald-init/main.go b/go/deploy/metald/cmd/metald-init/main.go index 09a4610f71..88996a01f1 100644 --- a/go/deploy/metald/cmd/metald-init/main.go +++ b/go/deploy/metald/cmd/metald-init/main.go @@ -44,7 +44,7 @@ func main() { log.SetPrefix("[init] ") // AIDEV-NOTE: Write debug file with secure permissions - os.WriteFile("/init.started", []byte(fmt.Sprintf("Started at %s\n", time.Now())), 0600) + os.WriteFile("/init.started", []byte(fmt.Sprintf("Started at %s\n", time.Now())), 0o600) // AIDEV-NOTE: Mount /proc filesystem so we can read kernel command line if err := syscall.Mount("proc", "/proc", "proc", 0, ""); err != nil { @@ -119,7 +119,7 @@ func main() { // AIDEV-NOTE: Write debug info with secure permissions debugInfo := fmt.Sprintf("Command: %s\nArgs: %v\nEnv count: %d\nWorking dir: %s\n", command, commandArgs, len(os.Environ()), os.Getenv("PWD")) - os.WriteFile("/init.command", []byte(debugInfo), 0600) + os.WriteFile("/init.command", []byte(debugInfo), 0o600) // AIDEV-NOTE: Load container environment configuration for complete runtime replication containerEnv, err := loadContainerEnvironment() @@ -206,7 +206,7 @@ func parseKernelCmdline() map[string]string { } // Parse space-separated key=value pairs - for _, param := range strings.Fields(string(cmdline)) { + for param := range strings.FieldsSeq(string(cmdline)) { if strings.Contains(param, "=") { parts := strings.SplitN(param, "=", 2) params[parts[0]] = parts[1] @@ -445,7 +445,7 @@ func createCommonDirectories() { } for _, dir := range commonDirs { - if err := os.MkdirAll(dir, 0755); err != nil { 
+ if err := os.MkdirAll(dir, 0o755); err != nil { log.Printf("warning: failed to create directory %s: %v", dir, err) } else { log.Printf("ensured directory exists: %s", dir) @@ -598,13 +598,13 @@ Options: Environment: The init process reads kernel command line parameters from /proc/cmdline: - + env.KEY=VALUE Set environment variable KEY to VALUE workdir=/path Change working directory to /path - + Example: %s -- nginx -g "daemon off;" - + With kernel cmdline: env.NGINX_PORT=8080 workdir=/app `, binaryName, binaryName, binaryName) fmt.Print(help) diff --git a/go/deploy/metald/cmd/metald/main.go b/go/deploy/metald/cmd/metald/main.go index 7d51128d3f..464dd45fbd 100644 --- a/go/deploy/metald/cmd/metald/main.go +++ b/go/deploy/metald/cmd/metald/main.go @@ -15,20 +15,17 @@ import ( "time" "github.com/unkeyed/unkey/go/deploy/metald/internal/assetmanager" - "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/docker" "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/firecracker" "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" "github.com/unkeyed/unkey/go/deploy/metald/internal/billing" "github.com/unkeyed/unkey/go/deploy/metald/internal/config" "github.com/unkeyed/unkey/go/deploy/metald/internal/database" - "github.com/unkeyed/unkey/go/deploy/metald/internal/network" "github.com/unkeyed/unkey/go/deploy/metald/internal/observability" - "github.com/unkeyed/unkey/go/deploy/metald/internal/reconciler" "github.com/unkeyed/unkey/go/deploy/metald/internal/service" healthpkg "github.com/unkeyed/unkey/go/deploy/pkg/health" "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" tlspkg "github.com/unkeyed/unkey/go/deploy/pkg/tls" - "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1/vmprovisionerv1connect" + "github.com/unkeyed/unkey/go/gen/proto/metald/v1/metaldv1connect" "connectrpc.com/connect" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -38,10 +35,8 @@ import ( ) // version is set at build time via 
ldflags -var version = "" // AIDEV-NOTE: Version injected at build time via Makefile LDFLAGS +var version = "" -// AIDEV-NOTE: Enhanced version management with debug.ReadBuildInfo fallback -// Handles production builds (ldflags), development builds (git commit), and module builds // getVersion returns the version string, with fallback to debug.ReadBuildInfo func getVersion() string { // If version was set via ldflags (production builds), use it @@ -59,30 +54,25 @@ func getVersion() string { // Try to get version from VCS info for _, setting := range info.Settings { if setting.Key == "vcs.revision" && len(setting.Value) >= 7 { - return "dev-" + setting.Value[:7] // First 7 chars of commit hash + return "dev-" + setting.Value[:7] // First 8 chars of commit hash } } - // Last resort: indicate it's a development build return "dev" } - // Final fallback return version } func main() { - // Track application start time for uptime calculations startTime := time.Now() - // Parse command-line flags var ( showHelp = flag.Bool("help", false, "Show help information") showVersion = flag.Bool("version", false, "Show version information") ) flag.Parse() - // Handle help and version flags if *showHelp { printUsage() os.Exit(0) @@ -96,17 +86,16 @@ func main() { // Initialize structured logger with JSON output //exhaustruct:ignore logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ - Level: slog.LevelInfo, + Level: slog.LevelDebug, })) slog.SetDefault(logger) // Log startup - logger.Info("starting vmm control plane", + logger.Info("starting metald", slog.String("version", getVersion()), slog.String("go_version", runtime.Version()), ) - // Load configuration cfg, err := config.LoadConfig() if err != nil { logger.Error("failed to load configuration", @@ -116,7 +105,7 @@ func main() { } logger.Info("configuration loaded", - slog.String("backend", string(cfg.Backend.Type)), + slog.String("address", cfg.Server.Address), slog.String("port", cfg.Server.Port), 
slog.Bool("otel_enabled", cfg.OpenTelemetry.Enabled), @@ -186,52 +175,18 @@ func main() { "mode", cfg.TLS.Mode, "spiffe_enabled", cfg.TLS.Mode == "spiffe") - // Initialize database - db, err := database.NewWithLogger(cfg.Database.DataDir, logger) - if err != nil { - logger.Error("failed to initialize database", - slog.String("error", err.Error()), - slog.String("data_dir", cfg.Database.DataDir), + db, dbErr := database.NewDatabaseWithLogger(cfg.Database.DataDir, slog.Default()) + if dbErr != nil { + logger.Error("failed to get DB", + slog.String("error", dbErr.Error()), ) os.Exit(1) } - defer db.Close() - - // Create VM repository - vmRepo := database.NewVMRepository(db) - - logger.Info("database initialized", - slog.String("data_dir", cfg.Database.DataDir), - ) // Initialize backend based on configuration var backend types.Backend switch cfg.Backend.Type { case types.BackendTypeFirecracker: - // Use SDK client v4 with integrated jailer - let SDK handle complete lifecycle - // AIDEV-NOTE: SDK manages firecracker process, integrated jailer, and networking - - // Convert main config to network config - networkConfig := &network.Config{ - BridgeName: cfg.Network.BridgeName, - BridgeIP: cfg.Network.BridgeIPv4, - VMSubnet: cfg.Network.VMSubnetIPv4, - EnableIPv6: cfg.Network.EnableIPv6, - DNSServers: cfg.Network.DNSServersIPv4, - EnableRateLimit: cfg.Network.EnableRateLimit, - RateLimitMbps: cfg.Network.RateLimitMbps, - PortRangeMin: 32768, // Default - PortRangeMax: 65535, // Default - } - - networkManager, err := network.NewManager(logger, networkConfig, &cfg.Network) - if err != nil { - logger.Error("failed to create network manager", - slog.String("error", err.Error()), - ) - os.Exit(1) - } - // Base directory for VM data baseDir := "/opt/metald/vms" @@ -256,50 +211,36 @@ func main() { logger.Info("assetmanager disabled, using noop client") } - // Use SDK v4 with integrated jailer - the only supported backend - sdkClient, err := firecracker.NewSDKClientV4(logger, 
networkManager, assetClient, vmRepo, &cfg.Backend.Jailer, baseDir) + sdkClient, err := firecracker.NewClient(logger, assetClient, &cfg.Backend.Jailer, baseDir) if err != nil { - logger.Error("failed to create SDK client v4 with integrated jailer", + logger.Error("failed to create firecracker client", slog.String("error", err.Error()), ) os.Exit(1) } - logger.Info("initialized firecracker SDK v4 backend with integrated jailer", + logger.Info("initialized firecracker backend", slog.String("firecracker_binary", "/usr/local/bin/firecracker"), slog.Uint64("uid", uint64(cfg.Backend.Jailer.UID)), slog.Uint64("gid", uint64(cfg.Backend.Jailer.GID)), slog.String("chroot_base", cfg.Backend.Jailer.ChrootBaseDir), ) - if err := sdkClient.Initialize(); err != nil { - logger.Error("failed to initialize SDK client v4", - slog.String("error", err.Error()), - ) - os.Exit(1) - } backend = sdkClient - - // Note: Network manager is initialized and managed by SDK v4 case types.BackendTypeDocker: - // AIDEV-NOTE: Docker backend for development - creates containers instead of VMs - logger.Info("initializing Docker backend for development") - - dockerClient, err := docker.NewDockerBackend(logger, docker.DefaultDockerBackendConfig()) - if err != nil { - logger.Error("failed to create Docker backend", - slog.String("error", err.Error()), - ) - os.Exit(1) - } - - backend = dockerClient - logger.Info("Docker backend initialized successfully") - case types.BackendTypeCloudHypervisor: - logger.Error("CloudHypervisor backend not implemented", - slog.String("backend", string(cfg.Backend.Type)), - ) - os.Exit(1) + // // AIDEV-NOTE: Docker backend for development - creates containers instead of VMs + // logger.Info("initializing Docker backend for development") + + // dockerClient, err := docker.NewDockerBackend(logger, docker.DefaultDockerBackendConfig()) + // if err != nil { + // logger.Error("failed to create Docker backend", + // slog.String("error", err.Error()), + // ) + // os.Exit(1) + // } + 
+ // backend = dockerClient + // logger.Info("Docker backend initialized successfully") default: logger.Error("unsupported backend type", slog.String("backend", string(cfg.Backend.Type)), @@ -366,20 +307,7 @@ func main() { metricsCollector.StartHeartbeat() // Create VM service - vmService := service.NewVMService(backend, logger, metricsCollector, vmMetrics, vmRepo) - - // Initialize VM reconciler to fix stale VM state issues - // AIDEV-NOTE: Critical fix for state inconsistency where database shows VMs but no processes exist - vmReconciler := reconciler.NewVMReconciler(logger, backend, vmRepo, 5*time.Minute) - - // Start VM reconciler in background - reconcilerCtx, cancelReconciler := context.WithCancel(ctx) - defer cancelReconciler() - - go vmReconciler.Start(reconcilerCtx) - logger.Info("VM reconciler started", - slog.Duration("interval", 5*time.Minute), - ) + vmService := service.NewVMService(backend, logger, metricsCollector, vmMetrics, db.Queries) // Create unified health handler healthHandler := healthpkg.Handler("metald", getVersion(), startTime) @@ -392,7 +320,7 @@ func main() { interceptors.WithServiceName("metald"), interceptors.WithLogger(logger), interceptors.WithActiveRequestsMetric(true), - interceptors.WithRequestDurationMetric(false), // Match existing behavior + interceptors.WithRequestDurationMetric(true), // Match existing behavior interceptors.WithErrorResampling(true), interceptors.WithPanicStackTrace(true), interceptors.WithTenantAuth(true, @@ -418,7 +346,7 @@ func main() { } mux := http.NewServeMux() - path, handler := vmprovisionerv1connect.NewVmServiceHandler(vmService, + path, handler := metaldv1connect.NewVmServiceHandler(vmService, connect.WithInterceptors(interceptorList...), ) mux.Handle(path, handler) @@ -440,13 +368,12 @@ func main() { // Configure server with optional TLS and security timeouts server := &http.Server{ - Addr: addr, - Handler: h2c.NewHandler(httpHandler, &http2.Server{}), //nolint:exhaustruct - // AIDEV-NOTE: Security 
timeouts to prevent slowloris attacks - ReadTimeout: 30 * time.Second, // Time to read request headers - WriteTimeout: 30 * time.Second, // Time to write response - IdleTimeout: 120 * time.Second, // Keep-alive timeout - MaxHeaderBytes: 1 << 20, // 1MB max header size + Addr: addr, + Handler: h2c.NewHandler(httpHandler, &http2.Server{}), //nolint:exhaustruct + ReadTimeout: 30 * time.Second, // Time to read request headers + WriteTimeout: 30 * time.Second, // Time to write response + IdleTimeout: 120 * time.Second, // Keep-alive timeout + MaxHeaderBytes: 1 << 20, // 1MB max header size } // Apply TLS configuration if enabled diff --git a/go/deploy/metald/configs/cni/metald-network.conflist b/go/deploy/metald/configs/cni/metald-network.conflist deleted file mode 100644 index 4d4734ea7e..0000000000 --- a/go/deploy/metald/configs/cni/metald-network.conflist +++ /dev/null @@ -1,18 +0,0 @@ -{ - "cniVersion": "0.4.0", - "name": "metald-network", - "plugins": [ - { - "type": "ptp", - "ipMasq": true, - "ipam": { - "type": "host-local", - "subnet": "10.100.0.0/16", - "resolvConf": "/etc/resolv.conf" - } - }, - { - "type": "tc-redirect-tap" - } - ] -} \ No newline at end of file diff --git a/go/deploy/metald/contrib/systemd/README.md b/go/deploy/metald/contrib/systemd/README.md deleted file mode 100644 index e414d663ec..0000000000 --- a/go/deploy/metald/contrib/systemd/README.md +++ /dev/null @@ -1,88 +0,0 @@ -# Systemd Integration for Metald - -This directory contains systemd service files and deployment scripts for metald. 
- -## Files - -- `metald.service` - Production-ready systemd service unit with security hardening -- `fedora-installation.md` - Complete installation guide for Fedora 42 systems -- `metald.env.example` - Example environment configuration file -- `install.sh` - Automated installation script for systemd-based systems - -## Quick Installation - -```bash -# From the metald root directory -make service-install -``` - -## Manual Installation - -```bash -# Copy service file -sudo cp contrib/systemd/metald.service /etc/systemd/system/ - -# Copy environment file -sudo mkdir -p /etc/metald -sudo cp contrib/systemd/metald.env.example /etc/metald/metald.env - -# Edit configuration as needed -sudo vim /etc/metald/metald.env - -# Install and start service -sudo systemctl daemon-reload -sudo systemctl enable metald -sudo systemctl start metald -``` - -## Service Management - -```bash -# Check status -sudo systemctl status metald - -# View logs -sudo journalctl -u metald -f - -# Restart service -sudo systemctl restart metald - -# Stop service -sudo systemctl stop metald -``` - -## Security Features - -The systemd service includes comprehensive security hardening: - -- Process isolation with dedicated user account -- Filesystem protection and read-only system directories -- Network and namespace restrictions -- System call filtering -- Resource limits (memory, CPU, file descriptors) -- Privilege dropping and capability restrictions - -## Configuration - -The service supports configuration via: - -1. Environment variables in `/etc/metald/metald.env` -2. Command-line arguments (modify `ExecStart` in service file) -3. Configuration files (if implemented in metald) - -## Troubleshooting - -See `fedora-installation.md` for detailed troubleshooting steps and common issues. 
- -For systemd-specific issues: - -```bash -# Check service validation -sudo systemd-analyze verify /etc/systemd/system/metald.service - -# Check security settings -sudo systemd-analyze security metald - -# Debug service startup -sudo systemctl show metald -``` \ No newline at end of file diff --git a/go/deploy/metald/contrib/systemd/environment.example b/go/deploy/metald/contrib/systemd/environment.example deleted file mode 100644 index 1a94d7b416..0000000000 --- a/go/deploy/metald/contrib/systemd/environment.example +++ /dev/null @@ -1,72 +0,0 @@ -# Metald Environment Variables Template -# NOTE: This service does NOT load .env files automatically -# Set these variables in your system environment or process manager -# -# Usage examples: -# systemd: EnvironmentFile=/etc/metald/environment -# Docker: docker run --env-file environment metald -# Shell: set -a; source environment; set +a; ./metald - -# Server Configuration -UNKEY_METALD_BACKEND=firecracker -UNKEY_METALD_PORT=8080 -UNKEY_METALD_ADDRESS=0.0.0.0 - -# Database Configuration -UNKEY_METALD_DATA_DIR=/opt/metald/data - -# Billing Configuration -UNKEY_METALD_BILLING_ENABLED=true -UNKEY_METALD_BILLING_ENDPOINT=http://localhost:8081 -UNKEY_METALD_BILLING_MOCK_MODE=false - -# AssetManager Configuration (VM Asset Management) -UNKEY_METALD_ASSETMANAGER_ENABLED=true -UNKEY_METALD_ASSETMANAGER_ENDPOINT=http://localhost:8083 -UNKEY_METALD_ASSETMANAGER_CACHE_DIR=/opt/metald/assets - -# Network Configuration -UNKEY_METALD_NETWORK_ENABLED=true -UNKEY_METALD_NETWORK_IPV4_ENABLED=true -UNKEY_METALD_NETWORK_BRIDGE_IPV4=172.31.0.1/19 -UNKEY_METALD_NETWORK_VM_SUBNET_IPV4=172.31.0.0/19 -UNKEY_METALD_NETWORK_DNS_IPV4=8.8.8.8,8.8.4.4 -UNKEY_METALD_NETWORK_IPV6_ENABLED=true -UNKEY_METALD_NETWORK_BRIDGE_IPV6=fd00::1/64 -UNKEY_METALD_NETWORK_VM_SUBNET_IPV6=fd00::/64 -UNKEY_METALD_NETWORK_DNS_IPV6=2606:4700:4700::1111,2606:4700:4700::1001 -UNKEY_METALD_NETWORK_IPV6_MODE=dual-stack -UNKEY_METALD_NETWORK_BRIDGE=br-vms 
-UNKEY_METALD_NETWORK_RATE_LIMIT=true -UNKEY_METALD_NETWORK_RATE_LIMIT_MBPS=1000 -UNKEY_METALD_NETWORK_MAX_VMS_PER_BRIDGE=1000 -UNKEY_METALD_NETWORK_MULTI_BRIDGE=true -UNKEY_METALD_NETWORK_BRIDGE_PREFIX=metald-br -UNKEY_METALD_NETWORK_HOST_PROTECTION=true -UNKEY_METALD_NETWORK_PRIMARY_INTERFACE= - -# TLS Configuration -UNKEY_METALD_TLS_MODE=spiffe -UNKEY_METALD_SPIFFE_SOCKET=/var/lib/spire/agent/agent.sock -UNKEY_METALD_TLS_CERT_FILE= -UNKEY_METALD_TLS_KEY_FILE= -UNKEY_METALD_TLS_CA_FILE= -UNKEY_METALD_TLS_ENABLE_CERT_CACHING=true -UNKEY_METALD_TLS_CERT_CACHE_TTL=5s - -# OpenTelemetry Configuration -UNKEY_METALD_OTEL_ENABLED=false -UNKEY_METALD_OTEL_SERVICE_NAME=metald -UNKEY_METALD_OTEL_SERVICE_VERSION=0.1.0 -UNKEY_METALD_OTEL_SAMPLING_RATE=1.0 -UNKEY_METALD_OTEL_ENDPOINT=localhost:4318 -UNKEY_METALD_OTEL_PROMETHEUS_ENABLED=true -UNKEY_METALD_OTEL_PROMETHEUS_PORT=9464 -UNKEY_METALD_OTEL_PROMETHEUS_INTERFACE=127.0.0.1 -UNKEY_METALD_OTEL_HIGH_CARDINALITY_ENABLED=false - -# Integrated Jailer Configuration (Production Security - Firecracker only) -# Note: Metald now includes an integrated jailer implementation -UNKEY_METALD_JAILER_UID=1000 -UNKEY_METALD_JAILER_GID=1000 -UNKEY_METALD_JAILER_CHROOT_DIR=/srv/jailer \ No newline at end of file diff --git a/go/deploy/metald/contrib/systemd/fedora-installation.md b/go/deploy/metald/contrib/systemd/fedora-installation.md deleted file mode 100644 index b24c5519a1..0000000000 --- a/go/deploy/metald/contrib/systemd/fedora-installation.md +++ /dev/null @@ -1,426 +0,0 @@ -# Metald Installation Guide for Fedora 42 - -This guide covers secure installation and configuration of metald on Fedora 42 systems. 
- -## Prerequisites - -### System Requirements - -- Fedora 42 with systemd -- Go 1.21+ (for building from source) -- Root or sudo access for installation -- At least 4GB RAM and 2 CPU cores for VM workloads - -### Required Packages - -```bash -# Update system -sudo dnf update -y - -# Install development tools and dependencies -sudo dnf install -y \ - golang \ - git \ - make \ - curl \ - jq \ - systemd-devel \ - cgroup-tools \ - iptables \ - bridge-utils - -# Install Firecracker (if using Firecracker backend) -# Download latest release from https://github.com/firecracker-microvm/firecracker/releases -sudo curl -L https://github.com/firecracker-microvm/firecracker/releases/latest/download/firecracker-v1.5.1-x86_64.tgz \ - -o /tmp/firecracker.tgz -sudo tar -xzf /tmp/firecracker.tgz -C /tmp -sudo cp /tmp/release-v1.5.1-x86_64/firecracker-v1.5.1-x86_64 /usr/bin/firecracker -sudo chmod +x /usr/bin/firecracker - -# Verify Firecracker installation -firecracker --version -``` - -## Security Setup - -### 1. Create Dedicated System User - -```bash -# Create metald system user with restricted permissions -sudo useradd -r -s /bin/false -d /opt/metald -c "Metald VM Management Service" metald - -# Verify user creation -id metald -# Should show: uid=995(metald) gid=993(metald) groups=993(metald) -``` - -### 2. 
Set Up Directory Structure - -```bash -# Create application directories -sudo mkdir -p /opt/metald -sudo mkdir -p /var/log/metald -sudo mkdir -p /etc/metald - -# Create runtime directories -sudo mkdir -p /tmp/github.com/unkeyed/unkey/go/deploy/metald/sockets -sudo mkdir -p /tmp/github.com/unkeyed/unkey/go/deploy/metald/logs - -# Create jailer chroot directory (for production) -sudo mkdir -p /srv/jailer - -# Set ownership -sudo chown -R metald:metald /opt/metald -sudo chown -R metald:metald /var/log/metald -sudo chown -R metald:metald /etc/metald -sudo chown -R metald:metald /tmp/github.com/unkeyed/unkey/go/deploy/metald -sudo chown -R metald:metald /srv/jailer - -# Set permissions -sudo chmod 755 /opt/metald -sudo chmod 750 /var/log/metald -sudo chmod 750 /etc/metald -sudo chmod 755 /srv/jailer -``` - -### 3. Configure Cgroups (Required for Resource Limits) - -```bash -# Ensure cgroups v1 is available (required by Firecracker jailer) -sudo mkdir -p /sys/fs/cgroup/metald - -# Add metald user to systemd-journal group for logging -sudo usermod -a -G systemd-journal metald -``` - -### 4. 
Configure Firewall - -```bash -# Configure firewalld for metald services -sudo firewall-cmd --permanent --new-service=metald -sudo firewall-cmd --permanent --service=metald --set-description="Metald VM Management Service" -sudo firewall-cmd --permanent --service=metald --set-short="Metald" -sudo firewall-cmd --permanent --service=metald --add-port=8080/tcp -sudo firewall-cmd --permanent --service=metald --add-port=9464/tcp - -# Enable the service -sudo firewall-cmd --permanent --add-service=metald -sudo firewall-cmd --reload - -# Verify firewall configuration -sudo firewall-cmd --list-services | grep metald -sudo firewall-cmd --list-ports -``` - -## Installation Methods - -### Method 1: Using Makefile (Recommended) - -```bash -# Clone the repository -git clone https://github.com/unkeyed/unkey.git -cd unkey/go/deploy/metald - -# Build and install -make install - -# Enable and start the service -make service-install -make service-start - -# Check status -make service-status -``` - -### Method 2: Manual Installation - -```bash -# Build metald -go build -ldflags "-s -w" -o build/metald ./cmd/api - -# Install binary -sudo cp build/metald /usr/local/bin/metald -sudo chmod +x /usr/local/bin/metald - -# Install systemd service -sudo cp metald.service /etc/systemd/system/metald.service -sudo systemctl daemon-reload -sudo systemctl enable metald -sudo systemctl start metald -``` - -## Configuration - -### Environment Variables - -Create a configuration file for environment variables: - -```bash -# Create environment file -sudo tee /etc/metald/metald.env > /dev/null < /dev/null < /dev/null < 0 { - //exhaustruct:ignore - chConfig.Cpus = &cpusConfig{ - BootVcpus: config.GetCpu().GetVcpuCount(), - MaxVcpus: config.GetCpu().GetMaxVcpuCount(), - } - if config.GetCpu().GetMaxVcpuCount() == 0 { - chConfig.Cpus.MaxVcpus = config.GetCpu().GetVcpuCount() - } - } - - // Memory configuration - if config.GetMemory() != nil && config.GetMemory().GetSizeBytes() > 0 { - //exhaustruct:ignore 
- chConfig.Memory = &memoryConfig{ - Size: config.GetMemory().GetSizeBytes(), - } - } - - // Payload configuration - if config.GetBoot() != nil && config.GetBoot().GetKernelPath() != "" { - chConfig.Payload = &payloadConfig{ - Kernel: config.GetBoot().GetKernelPath(), - Initramfs: config.GetBoot().GetInitrdPath(), - Cmdline: config.GetBoot().GetKernelArgs(), - } - } - - // Disk configuration - for _, disk := range config.GetStorage() { - //exhaustruct:ignore - chConfig.Disks = append(chConfig.Disks, diskConfig{ - Path: disk.GetPath(), - Readonly: disk.GetReadOnly(), - }) - } - - // Network configuration - for _, net := range config.GetNetwork() { - //exhaustruct:ignore - chConfig.Net = append(chConfig.Net, netConfig{ - Tap: net.GetTapDevice(), - Mac: net.GetMacAddress(), - }) - } - - // Default RNG configuration - chConfig.Rng = &rngConfig{ - Src: "/dev/urandom", - } - - // Console configuration - if config.GetConsole() != nil && config.GetConsole().GetEnabled() { - //exhaustruct:ignore - chConfig.Console = &consoleConfig{ - Mode: "File", - File: config.GetConsole().GetOutput(), - } - } else { - //exhaustruct:ignore - chConfig.Console = &consoleConfig{ - Mode: "Off", - } - } - - return chConfig -} - -// cloudHypervisorStateToGeneric converts Cloud Hypervisor state to generic VM state -func (c *Client) cloudHypervisorStateToGeneric(state string) metaldv1.VmState { - switch state { - case "Created": - return metaldv1.VmState_VM_STATE_CREATED - case "Running": - return metaldv1.VmState_VM_STATE_RUNNING - case "Shutdown": - return metaldv1.VmState_VM_STATE_SHUTDOWN - case "Paused": - return metaldv1.VmState_VM_STATE_PAUSED - default: - return metaldv1.VmState_VM_STATE_UNSPECIFIED - } -} diff --git a/go/deploy/metald/internal/backend/docker/client.go b/go/deploy/metald/internal/backend/docker/client.go index ba43700fde..a66b93ed1d 100644 --- a/go/deploy/metald/internal/backend/docker/client.go +++ b/go/deploy/metald/internal/backend/docker/client.go @@ -1,913 +1,941 @@ 
package docker -import ( - "context" - "encoding/json" - "fmt" - "io" - "log/slog" - "math/rand" - "strconv" - "strings" - "sync" - "time" - - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/image" - "github.com/docker/docker/client" - "github.com/docker/go-connections/nat" - backendtypes "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/trace" -) - -// DockerBackend implements the Backend interface using Docker containers -type DockerBackend struct { - logger *slog.Logger - dockerClient *client.Client - config *DockerBackendConfig - vmRegistry map[string]*dockerVM - portAllocator *portAllocator - mutex sync.RWMutex - tracer trace.Tracer - meter metric.Meter - vmCreateCounter metric.Int64Counter - vmDeleteCounter metric.Int64Counter - vmBootCounter metric.Int64Counter - vmErrorCounter metric.Int64Counter -} - -// portAllocator manages port allocation for containers -type portAllocator struct { - mutex sync.Mutex - allocated map[int]string // port -> vmID - minPort int - maxPort int -} - -// NewDockerBackend creates a new Docker backend -func NewDockerBackend(logger *slog.Logger, config *DockerBackendConfig) (*DockerBackend, error) { - if config == nil { - config = DefaultDockerBackendConfig() - } - - // Create Docker client - dockerClient, err := client.NewClientWithOpts( - client.FromEnv, - client.WithAPIVersionNegotiation(), - ) - if err != nil { - return nil, fmt.Errorf("failed to create Docker client: %w", err) - } - - // Verify Docker connection - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - if _, err := dockerClient.Ping(ctx); err != nil { - return nil, fmt.Errorf("failed to connect to Docker daemon: %w", err) - } - - // Set up telemetry - tracer := 
otel.Tracer("metald.docker.backend") - meter := otel.Meter("metald.docker.backend") - - vmCreateCounter, err := meter.Int64Counter("vm_create_total", - metric.WithDescription("Total number of VM create operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_create counter: %w", err) - } - - vmDeleteCounter, err := meter.Int64Counter("vm_delete_total", - metric.WithDescription("Total number of VM delete operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_delete counter: %w", err) - } - - vmBootCounter, err := meter.Int64Counter("vm_boot_total", - metric.WithDescription("Total number of VM boot operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_boot counter: %w", err) - } - - vmErrorCounter, err := meter.Int64Counter("vm_error_total", - metric.WithDescription("Total number of VM operation errors"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_error counter: %w", err) - } - - // Create port allocator - portAllocator := &portAllocator{ - allocated: make(map[int]string), - minPort: config.PortRange.Min, - maxPort: config.PortRange.Max, - } - - backend := &DockerBackend{ - logger: logger.With("backend", "docker"), - dockerClient: dockerClient, - config: config, - vmRegistry: make(map[string]*dockerVM), - portAllocator: portAllocator, - tracer: tracer, - meter: meter, - vmCreateCounter: vmCreateCounter, - vmDeleteCounter: vmDeleteCounter, - vmBootCounter: vmBootCounter, - vmErrorCounter: vmErrorCounter, - } - - - return backend, nil -} - -// CreateVM creates a new Docker container representing a VM -func (d *DockerBackend) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { - ctx, span := d.tracer.Start(ctx, "metald.docker.create_vm", - trace.WithAttributes( - attribute.Int("vcpus", int(config.GetCpu().GetVcpuCount())), - 
attribute.Int64("memory_bytes", config.GetMemory().GetSizeBytes()), - ), - ) - defer span.End() - - // Generate VM ID - vmID := d.generateVMID() - span.SetAttributes(attribute.String("vm_id", vmID)) - - d.logger.LogAttrs(ctx, slog.LevelInfo, "creating VM with Docker backend", - slog.String("vm_id", vmID), - slog.Int("vcpus", int(config.GetCpu().GetVcpuCount())), - slog.Int64("memory_bytes", config.GetMemory().GetSizeBytes()), - ) - - // Convert VM config to container spec - containerSpec, err := d.vmConfigToContainerSpec(ctx, vmID, config) - if err != nil { - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "create"), - attribute.String("error", "config_conversion"), - )) - return "", fmt.Errorf("failed to convert VM config to container spec: %w", err) - } - - // Create container - containerID, err := d.createContainer(ctx, containerSpec) - if err != nil { - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "create"), - attribute.String("error", "container_creation"), - )) - return "", fmt.Errorf("failed to create container: %w", err) - } - - // Register VM - d.mutex.Lock() - vm := &dockerVM{ - ID: vmID, - ContainerID: containerID, - Config: config, - State: metaldv1.VmState_VM_STATE_CREATED, - PortMappings: containerSpec.PortMappings, - CreatedAt: time.Now(), - } - d.vmRegistry[vmID] = vm - d.mutex.Unlock() - - d.vmCreateCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM created successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", containerID), - ) - - return vmID, nil -} - -// BootVM starts a Docker container representing a VM -func (d *DockerBackend) BootVM(ctx context.Context, vmID string) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.boot_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - 
defer span.End() - - d.mutex.Lock() - vm, exists := d.vmRegistry[vmID] - d.mutex.Unlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "vm_not_found"), - )) - return err - } - - d.logger.LogAttrs(ctx, slog.LevelInfo, "booting VM with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - // Check if container still exists before starting - _, err := d.dockerClient.ContainerInspect(ctx, vm.ContainerID) - if err != nil { - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "container_not_found"), - )) - return fmt.Errorf("container not found before start: %w", err) - } - - // Start container - if err := d.dockerClient.ContainerStart(ctx, vm.ContainerID, container.StartOptions{}); err != nil { - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "container_start"), - )) - return fmt.Errorf("failed to start container: %w", err) - } - - // Update VM state and network info - d.mutex.Lock() - vm.State = metaldv1.VmState_VM_STATE_RUNNING - - // Get container network info - networkInfo, err := d.getContainerNetworkInfo(ctx, vm.ContainerID) - if err != nil { - d.logger.WarnContext(ctx, "failed to get container network info", - "vm_id", vmID, - "container_id", vm.ContainerID, - "error", err, - ) - } else { - vm.NetworkInfo = networkInfo - } - d.mutex.Unlock() - - d.vmBootCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM booted successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - return nil -} - -// DeleteVM removes a Docker container 
representing a VM -func (d *DockerBackend) DeleteVM(ctx context.Context, vmID string) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.delete_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.mutex.Lock() - vm, exists := d.vmRegistry[vmID] - if !exists { - d.mutex.Unlock() - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "delete"), - attribute.String("error", "vm_not_found"), - )) - return err - } - - // Release allocated ports - for _, mapping := range vm.PortMappings { - d.portAllocator.releasePort(mapping.HostPort, vmID) - } - - delete(d.vmRegistry, vmID) - d.mutex.Unlock() - - d.logger.LogAttrs(ctx, slog.LevelInfo, "deleting VM with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - // Remove container (force remove) - if err := d.dockerClient.ContainerRemove(ctx, vm.ContainerID, container.RemoveOptions{ - Force: true, - }); err != nil { - span.RecordError(err) - d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "delete"), - attribute.String("error", "container_removal"), - )) - return fmt.Errorf("failed to remove container: %w", err) - } - - d.vmDeleteCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM deleted successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - return nil -} - -// ShutdownVM gracefully shuts down a Docker container -func (d *DockerBackend) ShutdownVM(ctx context.Context, vmID string) error { - return d.ShutdownVMWithOptions(ctx, vmID, false, 30) -} - -// ShutdownVMWithOptions shuts down a Docker container with options -func (d *DockerBackend) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeoutSeconds int32) error { - ctx, span := 
d.tracer.Start(ctx, "metald.docker.shutdown_vm", - trace.WithAttributes( - attribute.String("vm_id", vmID), - attribute.Bool("force", force), - attribute.Int("timeout_seconds", int(timeoutSeconds)), - ), - ) - defer span.End() - - d.mutex.Lock() - vm, exists := d.vmRegistry[vmID] - d.mutex.Unlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - d.logger.LogAttrs(ctx, slog.LevelInfo, "shutting down VM with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - slog.Bool("force", force), - slog.Int("timeout_seconds", int(timeoutSeconds)), - ) - - // Stop container - timeoutInt := int(timeoutSeconds) - if err := d.dockerClient.ContainerStop(ctx, vm.ContainerID, container.StopOptions{Timeout: &timeoutInt}); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to stop container: %w", err) - } - - // Update VM state - d.mutex.Lock() - vm.State = metaldv1.VmState_VM_STATE_SHUTDOWN - d.mutex.Unlock() - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM shutdown successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - return nil -} - -// PauseVM pauses a Docker container -func (d *DockerBackend) PauseVM(ctx context.Context, vmID string) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.pause_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.mutex.Lock() - vm, exists := d.vmRegistry[vmID] - d.mutex.Unlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - d.logger.LogAttrs(ctx, slog.LevelInfo, "pausing VM with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - // Pause container - if err := d.dockerClient.ContainerPause(ctx, vm.ContainerID); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to pause container: %w", err) - } - - // Update 
VM state - d.mutex.Lock() - vm.State = metaldv1.VmState_VM_STATE_PAUSED - d.mutex.Unlock() - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM paused successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - return nil -} - -// ResumeVM resumes a paused Docker container -func (d *DockerBackend) ResumeVM(ctx context.Context, vmID string) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.resume_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.mutex.Lock() - vm, exists := d.vmRegistry[vmID] - d.mutex.Unlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - d.logger.LogAttrs(ctx, slog.LevelInfo, "resuming VM with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - // Unpause container - if err := d.dockerClient.ContainerUnpause(ctx, vm.ContainerID); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to unpause container: %w", err) - } - - // Update VM state - d.mutex.Lock() - vm.State = metaldv1.VmState_VM_STATE_RUNNING - d.mutex.Unlock() - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM resumed successfully with Docker backend", - slog.String("vm_id", vmID), - slog.String("container_id", vm.ContainerID), - ) - - return nil -} - -// RebootVM restarts a Docker container -func (d *DockerBackend) RebootVM(ctx context.Context, vmID string) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.reboot_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.logger.LogAttrs(ctx, slog.LevelInfo, "rebooting VM with Docker backend", - slog.String("vm_id", vmID), - ) - - // Shutdown the VM - if err := d.ShutdownVMWithOptions(ctx, vmID, false, 30); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to shutdown VM for reboot: %w", err) - } - - // Wait a moment - time.Sleep(1 * time.Second) - - // Boot the VM 
again - if err := d.BootVM(ctx, vmID); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to boot VM after shutdown: %w", err) - } - - d.logger.LogAttrs(ctx, slog.LevelInfo, "VM rebooted successfully with Docker backend", - slog.String("vm_id", vmID), - ) - - return nil -} - -// GetVMInfo returns information about a VM -func (d *DockerBackend) GetVMInfo(ctx context.Context, vmID string) (*backendtypes.VMInfo, error) { - _, span := d.tracer.Start(ctx, "metald.docker.get_vm_info", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.mutex.RLock() - vm, exists := d.vmRegistry[vmID] - d.mutex.RUnlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return nil, err - } - - info := &backendtypes.VMInfo{ - Config: vm.Config, - State: vm.State, - NetworkInfo: vm.NetworkInfo, - } - - return info, nil -} - -// GetVMMetrics returns metrics for a VM -func (d *DockerBackend) GetVMMetrics(ctx context.Context, vmID string) (*backendtypes.VMMetrics, error) { - ctx, span := d.tracer.Start(ctx, "metald.docker.get_vm_metrics", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - d.mutex.RLock() - vm, exists := d.vmRegistry[vmID] - d.mutex.RUnlock() - - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return nil, err - } - - // Get container stats - stats, err := d.dockerClient.ContainerStats(ctx, vm.ContainerID, false) - if err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to get container stats: %w", err) - } - defer stats.Body.Close() - - // Parse stats - var dockerStats container.StatsResponse - if err := json.NewDecoder(stats.Body).Decode(&dockerStats); err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to decode container stats: %w", err) - } - - // Convert to VM metrics - metrics := &backendtypes.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 
int64(dockerStats.CPUStats.CPUUsage.TotalUsage), - MemoryUsageBytes: int64(dockerStats.MemoryStats.Usage), - DiskReadBytes: 0, // TODO: Calculate from BlkioStats - DiskWriteBytes: 0, // TODO: Calculate from BlkioStats - NetworkRxBytes: 0, // TODO: Calculate from NetworkStats - NetworkTxBytes: 0, // TODO: Calculate from NetworkStats - } - - // Calculate disk I/O - for _, blkio := range dockerStats.BlkioStats.IoServiceBytesRecursive { - if blkio.Op == "Read" { - metrics.DiskReadBytes += int64(blkio.Value) - } else if blkio.Op == "Write" { - metrics.DiskWriteBytes += int64(blkio.Value) - } - } - - // Calculate network I/O - if dockerStats.Networks != nil { - for _, netStats := range dockerStats.Networks { - metrics.NetworkRxBytes += int64(netStats.RxBytes) - metrics.NetworkTxBytes += int64(netStats.TxBytes) - } - } - - return metrics, nil -} - -// Ping checks if the Docker backend is healthy -func (d *DockerBackend) Ping(ctx context.Context) error { - ctx, span := d.tracer.Start(ctx, "metald.docker.ping") - defer span.End() - - d.logger.DebugContext(ctx, "pinging Docker backend") - - // Ping Docker daemon - if _, err := d.dockerClient.Ping(ctx); err != nil { - span.RecordError(err) - return fmt.Errorf("Docker daemon not available: %w", err) - } - - return nil -} - -// Helper methods - -// generateVMID generates a unique VM ID -func (d *DockerBackend) generateVMID() string { - return fmt.Sprintf("vm-%d", time.Now().UnixNano()) -} - -// vmConfigToContainerSpec converts VM configuration to container specification -func (d *DockerBackend) vmConfigToContainerSpec(ctx context.Context, vmID string, config *metaldv1.VmConfig) (*ContainerSpec, error) { - spec := &ContainerSpec{ - Labels: map[string]string{ - "unkey.vm.id": vmID, - "unkey.vm.created_by": "metald", - }, - Memory: config.GetMemory().GetSizeBytes(), - CPUs: float64(config.GetCpu().GetVcpuCount()), - } - - // Docker image must be specified in metadata - dockerImage, ok := config.Metadata["docker_image"] - if !ok || 
dockerImage == "" { - return nil, fmt.Errorf("docker_image must be specified in VM config metadata") - } - spec.Image = dockerImage - - // Extract exposed ports from metadata - if exposedPorts, ok := config.Metadata["exposed_ports"]; ok { - ports := strings.Split(exposedPorts, ",") - for _, port := range ports { - if port = strings.TrimSpace(port); port != "" { - spec.ExposedPorts = append(spec.ExposedPorts, port) - } - } - } - - // Extract environment variables from metadata - if envVars, ok := config.Metadata["env_vars"]; ok { - vars := strings.Split(envVars, ",") - for _, envVar := range vars { - if envVar = strings.TrimSpace(envVar); envVar != "" { - spec.Env = append(spec.Env, envVar) - } - } - } - - // Allocate host ports for exposed ports - for _, exposedPort := range spec.ExposedPorts { - containerPort, err := strconv.Atoi(strings.Split(exposedPort, "/")[0]) - if err != nil { - continue - } - - protocol := "tcp" - if strings.Contains(exposedPort, "/udp") { - protocol = "udp" - } - - // We'll allocate the port during container creation with retry logic - spec.PortMappings = append(spec.PortMappings, PortMapping{ - ContainerPort: containerPort, - HostPort: 0, // Will be allocated during creation - Protocol: protocol, - }) - } - - return spec, nil -} - -// createContainer creates a Docker container from the specification -func (d *DockerBackend) createContainer(ctx context.Context, spec *ContainerSpec) (string, error) { - ctx, span := d.tracer.Start(ctx, "metald.docker.create_container", - trace.WithAttributes(attribute.String("image", spec.Image)), - ) - defer span.End() - - d.logger.Info("checking if image exists locally", "image", spec.Image) - _, err := d.dockerClient.ImageInspect(ctx, spec.Image) - if err != nil { - d.logger.Info("image not found locally, pulling image", "image", spec.Image, "error", err.Error()) - pullResponse, err := d.dockerClient.ImagePull(ctx, spec.Image, image.PullOptions{}) - if err != nil { - return "", fmt.Errorf("failed to pull 
image %s: %w", spec.Image, err) - } - defer pullResponse.Close() - - // Read the pull response to completion to ensure pull finishes - _, err = io.ReadAll(pullResponse) - if err != nil { - return "", fmt.Errorf("failed to read pull response for image %s: %w", spec.Image, err) - } - - d.logger.Info("image pulled successfully", "image", spec.Image) - } else { - d.logger.Info("image found locally, skipping pull", "image", spec.Image) - } - - // Build container configuration - config := &container.Config{ - Image: spec.Image, - Cmd: spec.Cmd, - Env: spec.Env, - Labels: spec.Labels, - ExposedPorts: make(nat.PortSet), - WorkingDir: spec.WorkingDir, - } - - // Log the container command for debugging - d.logger.Info("container configuration", "image", spec.Image, "cmd", config.Cmd, "env", config.Env) - - // Set up exposed ports - for _, mapping := range spec.PortMappings { - port := nat.Port(fmt.Sprintf("%d/%s", mapping.ContainerPort, mapping.Protocol)) - config.ExposedPorts[port] = struct{}{} - } - - // Build host configuration - hostConfig := &container.HostConfig{ - PortBindings: make(nat.PortMap), - AutoRemove: false, // Don't auto-remove containers for debugging - Privileged: d.config.Privileged, - Resources: container.Resources{ - Memory: spec.Memory, - NanoCPUs: int64(spec.CPUs * 1e9), - }, - } - - // Set up port bindings with retry logic - maxRetries := 5 - for retry := 0; retry < maxRetries; retry++ { - // Clear previous port bindings - hostConfig.PortBindings = make(nat.PortMap) - - // Allocate ports for this attempt - var allocatedPorts []int - portAllocationFailed := false - - for i, mapping := range spec.PortMappings { - if mapping.HostPort == 0 { - // Allocate a new port - hostPort, err := d.portAllocator.allocatePort(spec.Labels["unkey.vm.id"]) - if err != nil { - // Release any ports allocated in this attempt - for _, port := range allocatedPorts { - d.portAllocator.releasePort(port, spec.Labels["unkey.vm.id"]) - } - portAllocationFailed = true - break - } 
- spec.PortMappings[i].HostPort = hostPort - allocatedPorts = append(allocatedPorts, hostPort) - } - - containerPort := nat.Port(fmt.Sprintf("%d/%s", mapping.ContainerPort, mapping.Protocol)) - hostConfig.PortBindings[containerPort] = []nat.PortBinding{ - { - HostIP: "0.0.0.0", - HostPort: strconv.Itoa(spec.PortMappings[i].HostPort), - }, - } - } - - if portAllocationFailed { - continue // Try again with new ports - } - - // Create container - containerName := d.config.ContainerPrefix + spec.Labels["unkey.vm.id"] - resp, err := d.dockerClient.ContainerCreate(ctx, config, hostConfig, nil, nil, containerName) - if err != nil { - // If it's a port binding error, release ports and try again - if strings.Contains(err.Error(), "port is already allocated") || strings.Contains(err.Error(), "bind") { - for _, port := range allocatedPorts { - d.portAllocator.releasePort(port, spec.Labels["unkey.vm.id"]) - } - d.logger.Warn("port binding failed, retrying with new ports", "error", err, "retry", retry+1) - continue - } - // Other errors are not retryable - span.RecordError(err) - return "", fmt.Errorf("failed to create container: %w", err) - } - - // Success! 
- span.SetAttributes(attribute.String("container_id", resp.ID)) - return resp.ID, nil - } - - // If we get here, all retries failed - return "", fmt.Errorf("failed to create container after %d retries due to port conflicts", maxRetries) -} - -// getContainerNetworkInfo gets network information for a container -func (d *DockerBackend) getContainerNetworkInfo(ctx context.Context, containerID string) (*metaldv1.VmNetworkInfo, error) { - ctx, span := d.tracer.Start(ctx, "metald.docker.get_network_info", - trace.WithAttributes(attribute.String("container_id", containerID)), - ) - defer span.End() - - // Inspect container - inspect, err := d.dockerClient.ContainerInspect(ctx, containerID) - if err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to inspect container: %w", err) - } - - // Get network info from default network - var networkInfo *metaldv1.VmNetworkInfo - if inspect.NetworkSettings != nil && inspect.NetworkSettings.Networks != nil { - for networkName, network := range inspect.NetworkSettings.Networks { - if network.IPAddress != "" { - networkInfo = &metaldv1.VmNetworkInfo{ - IpAddress: network.IPAddress, - MacAddress: network.MacAddress, - TapDevice: networkName, // Use network name as tap device - } - break - } - } - } - - // Add port mappings from container inspect - if inspect.NetworkSettings != nil && inspect.NetworkSettings.Ports != nil { - var portMappings []*metaldv1.PortMapping - for containerPort, hostBindings := range inspect.NetworkSettings.Ports { - if len(hostBindings) > 0 { - // Parse container port (e.g., "3000/tcp" -> 3000) - portStr := strings.Split(string(containerPort), "/")[0] - containerPortNum, err := strconv.Atoi(portStr) - if err != nil { - continue - } - - // Get protocol (tcp/udp) - protocol := "tcp" - if strings.Contains(string(containerPort), "/udp") { - protocol = "udp" - } - - // Add mapping for each host binding - for _, hostBinding := range hostBindings { - hostPortNum, err := 
strconv.Atoi(hostBinding.HostPort) - if err != nil { - continue - } - - portMappings = append(portMappings, &metaldv1.PortMapping{ - ContainerPort: int32(containerPortNum), - HostPort: int32(hostPortNum), - Protocol: protocol, - }) - } - } - } - - // Initialize networkInfo if it doesn't exist - if networkInfo == nil { - networkInfo = &metaldv1.VmNetworkInfo{} - } - networkInfo.PortMappings = portMappings - } - - return networkInfo, nil -} - -// Port allocator methods - -// allocatePort allocates a port for a VM -func (pa *portAllocator) allocatePort(vmID string) (int, error) { - pa.mutex.Lock() - defer pa.mutex.Unlock() - - // Try random ports to avoid conflicts - maxAttempts := 100 - for attempt := 0; attempt < maxAttempts; attempt++ { - port := rand.Intn(pa.maxPort-pa.minPort+1) + pa.minPort - if _, exists := pa.allocated[port]; !exists { - pa.allocated[port] = vmID - return port, nil - } - } - - return 0, fmt.Errorf("no available ports in range %d-%d after %d attempts", pa.minPort, pa.maxPort, maxAttempts) -} - -// releasePort releases a port from a VM -func (pa *portAllocator) releasePort(port int, vmID string) { - pa.mutex.Lock() - defer pa.mutex.Unlock() - - if allocated, exists := pa.allocated[port]; exists && allocated == vmID { - delete(pa.allocated, port) - } -} - - -// Ensure DockerBackend implements Backend interface -var _ backendtypes.Backend = (*DockerBackend)(nil) - -// AIDEV-NOTE: Docker backend implementation provides a complete replacement -// for the Firecracker backend, using Docker containers instead of VMs. -// Key advantages: -// 1. Simplified networking - Docker handles port mapping automatically -// 2. No privileged operations - Docker daemon handles isolation -// 3. Familiar container semantics - easier debugging and monitoring -// 4. Fast startup times - containers start instantly vs VM boot time -// 5. 
Resource efficiency - shared kernel, no VM overhead +// import ( +// "context" +// "crypto/rand" +// "encoding/json" +// "fmt" +// "io" +// "log/slog" +// "math/big" +// "strconv" +// "strings" +// "sync" +// "time" + +// "github.com/docker/docker/api/types/container" +// "github.com/docker/docker/api/types/image" +// "github.com/docker/docker/client" +// "github.com/docker/go-connections/nat" +// backendtypes "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" +// metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" +// "go.opentelemetry.io/otel" +// "go.opentelemetry.io/otel/attribute" +// "go.opentelemetry.io/otel/metric" +// "go.opentelemetry.io/otel/trace" +// ) + +// // safeUint64ToInt64 safely converts uint64 to int64, clamping to max int64 on overflow +// func safeUint64ToInt64(val uint64) int64 { +// const maxInt64 = int64(^uint64(0) >> 1) // 2^63 - 1 +// if val > uint64(maxInt64) { +// return maxInt64 // Clamp to max int64 instead of overflowing +// } +// return int64(val) +// } + +// // safeIntToInt32 safely converts int to int32, clamping to int32 bounds on overflow +// func safeIntToInt32(val int) int32 { +// const maxInt32 = 2147483647 // 2^31 - 1 +// const minInt32 = -2147483648 // -2^31 +// if val > maxInt32 { +// return maxInt32 +// } +// if val < minInt32 { +// return minInt32 +// } +// return int32(val) +// } + +// // DockerBackend implements the Backend interface using Docker containers +// type DockerBackend struct { +// logger *slog.Logger +// dockerClient *client.Client +// config *DockerBackendConfig +// vmRegistry map[string]*dockerVM +// portAllocator *portAllocator +// mutex sync.RWMutex +// tracer trace.Tracer +// meter metric.Meter +// vmCreateCounter metric.Int64Counter +// vmDeleteCounter metric.Int64Counter +// vmBootCounter metric.Int64Counter +// vmErrorCounter metric.Int64Counter +// } + +// // portAllocator manages port allocation for containers +// type portAllocator struct { +// mutex sync.Mutex +// 
allocated map[int]string // port -> vmID +// minPort int +// maxPort int +// } + +// // NewDockerBackend creates a new Docker backend +// func NewDockerBackend(logger *slog.Logger, config *DockerBackendConfig) (*DockerBackend, error) { +// if config == nil { +// config = DefaultDockerBackendConfig() +// } + +// // Create Docker client +// dockerClient, err := client.NewClientWithOpts( +// client.FromEnv, +// client.WithAPIVersionNegotiation(), +// ) +// if err != nil { +// return nil, fmt.Errorf("failed to create Docker client: %w", err) +// } + +// // Verify Docker connection +// ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) +// defer cancel() + +// if _, pingErr := dockerClient.Ping(ctx); pingErr != nil { +// return nil, fmt.Errorf("failed to connect to Docker daemon: %w", pingErr) +// } + +// // Set up telemetry +// tracer := otel.Tracer("metald.docker.backend") +// meter := otel.Meter("metald.docker.backend") + +// vmCreateCounter, err := meter.Int64Counter("vm_create_total", +// metric.WithDescription("Total number of VM create operations"), +// metric.WithUnit("1"), +// ) +// if err != nil { +// return nil, fmt.Errorf("failed to create vm_create counter: %w", err) +// } + +// vmDeleteCounter, err := meter.Int64Counter("vm_delete_total", +// metric.WithDescription("Total number of VM delete operations"), +// metric.WithUnit("1"), +// ) +// if err != nil { +// return nil, fmt.Errorf("failed to create vm_delete counter: %w", err) +// } + +// vmBootCounter, err := meter.Int64Counter("vm_boot_total", +// metric.WithDescription("Total number of VM boot operations"), +// metric.WithUnit("1"), +// ) +// if err != nil { +// return nil, fmt.Errorf("failed to create vm_boot counter: %w", err) +// } + +// vmErrorCounter, err := meter.Int64Counter("vm_error_total", +// metric.WithDescription("Total number of VM operation errors"), +// metric.WithUnit("1"), +// ) +// if err != nil { +// return nil, fmt.Errorf("failed to create vm_error counter: 
%w", err) +// } + +// // Create port allocator +// portAllocator := &portAllocator{ +// allocated: make(map[int]string), +// minPort: config.PortRange.Min, +// maxPort: config.PortRange.Max, +// } + +// backend := &DockerBackend{ +// logger: logger.With("backend", "docker"), +// dockerClient: dockerClient, +// config: config, +// vmRegistry: make(map[string]*dockerVM), +// portAllocator: portAllocator, +// tracer: tracer, +// meter: meter, +// vmCreateCounter: vmCreateCounter, +// vmDeleteCounter: vmDeleteCounter, +// vmBootCounter: vmBootCounter, +// vmErrorCounter: vmErrorCounter, +// } + +// return backend, nil +// } + +// // CreateVM creates a new Docker container representing a VM +// func (d *DockerBackend) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { +// ctx, span := d.tracer.Start(ctx, "metald.docker.create_vm", +// trace.WithAttributes( +// attribute.Int("vcpus", int(config.GetVcpuCount())), +// attribute.Int64("memory_bytes", int64(config.GetMemorySizeMib())), +// ), +// ) +// defer span.End() + +// // Generate VM ID +// vmID := d.generateVMID() +// span.SetAttributes(attribute.String("vm_id", vmID)) + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "creating VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.Int("vcpus", int(config.GetVcpuCount())), +// slog.Int64("memory_bytes", int64(config.GetMemorySizeMib())), +// ) + +// // Convert VM config to container spec +// containerSpec, err := d.vmConfigToContainerSpec(ctx, vmID, config) +// if err != nil { +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "create"), +// attribute.String("error", "config_conversion"), +// )) +// return "", fmt.Errorf("failed to convert VM config to container spec: %w", err) +// } + +// // Create container +// containerID, err := d.createContainer(ctx, containerSpec) +// if err != nil { +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// 
attribute.String("operation", "create"), +// attribute.String("error", "container_creation"), +// )) +// return "", fmt.Errorf("failed to create container: %w", err) +// } + +// // Register VM +// d.mutex.Lock() +// vm := &dockerVM{ +// ID: vmID, +// ContainerID: containerID, +// Config: config, +// State: metaldv1.VmState_VM_STATE_CREATED, +// PortMappings: containerSpec.PortMappings, +// CreatedAt: time.Now(), +// } +// d.vmRegistry[vmID] = vm +// d.mutex.Unlock() + +// d.vmCreateCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("status", "success"), +// )) + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM created successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", containerID), +// ) + +// return vmID, nil +// } + +// // BootVM starts a Docker container representing a VM +// func (d *DockerBackend) BootVM(ctx context.Context, vmID string) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.boot_vm", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.Lock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.Unlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "boot"), +// attribute.String("error", "vm_not_found"), +// )) +// return err +// } + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "booting VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// // Check if container still exists before starting +// _, err := d.dockerClient.ContainerInspect(ctx, vm.ContainerID) +// if err != nil { +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "boot"), +// attribute.String("error", "container_not_found"), +// )) +// return fmt.Errorf("container not found before start: %w", err) +// } + +// // Start 
container +// if startErr := d.dockerClient.ContainerStart(ctx, vm.ContainerID, container.StartOptions{}); startErr != nil { +// span.RecordError(startErr) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "boot"), +// attribute.String("error", "container_start"), +// )) +// return fmt.Errorf("failed to start container: %w", startErr) +// } + +// // Update VM state and network info +// d.mutex.Lock() +// vm.State = metaldv1.VmState_VM_STATE_RUNNING + +// // Get container network info +// // networkInfo, err := d.getContainerNetworkInfo(ctx, vm.ContainerID) +// if err != nil { +// d.logger.WarnContext(ctx, "failed to get container network info", +// "vm_id", vmID, +// "container_id", vm.ContainerID, +// "error", err, +// ) +// } else { +// // vm.NetworkInfo = networkInfo +// } +// d.mutex.Unlock() + +// d.vmBootCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("status", "success"), +// )) + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM booted successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// return nil +// } + +// // DeleteVM removes a Docker container representing a VM +// func (d *DockerBackend) DeleteVM(ctx context.Context, vmID string) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.delete_vm", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.Lock() +// vm, exists := d.vmRegistry[vmID] +// if !exists { +// d.mutex.Unlock() +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "delete"), +// attribute.String("error", "vm_not_found"), +// )) +// return err +// } + +// // Release allocated ports +// for _, mapping := range vm.PortMappings { +// d.portAllocator.releasePort(mapping.HostPort, vmID) +// } + +// delete(d.vmRegistry, vmID) +// d.mutex.Unlock() + +// 
d.logger.LogAttrs(ctx, slog.LevelInfo, "deleting VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// // Remove container (force remove) +// if err := d.dockerClient.ContainerRemove(ctx, vm.ContainerID, container.RemoveOptions{ +// Force: true, +// }); err != nil { +// span.RecordError(err) +// d.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("operation", "delete"), +// attribute.String("error", "container_removal"), +// )) +// return fmt.Errorf("failed to remove container: %w", err) +// } + +// d.vmDeleteCounter.Add(ctx, 1, metric.WithAttributes( +// attribute.String("status", "success"), +// )) + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM deleted successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// return nil +// } + +// // ShutdownVM gracefully shuts down a Docker container +// func (d *DockerBackend) ShutdownVM(ctx context.Context, vmID string) error { +// return d.ShutdownVMWithOptions(ctx, vmID, false, 30) +// } + +// // ShutdownVMWithOptions shuts down a Docker container with options +// func (d *DockerBackend) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeoutSeconds int32) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.shutdown_vm", +// trace.WithAttributes( +// attribute.String("vm_id", vmID), +// attribute.Bool("force", force), +// attribute.Int("timeout_seconds", int(timeoutSeconds)), +// ), +// ) +// defer span.End() + +// d.mutex.Lock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.Unlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// return err +// } + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "shutting down VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// slog.Bool("force", force), +// slog.Int("timeout_seconds", 
int(timeoutSeconds)), +// ) + +// // Stop container +// timeoutInt := int(timeoutSeconds) +// if err := d.dockerClient.ContainerStop(ctx, vm.ContainerID, container.StopOptions{Timeout: &timeoutInt}); err != nil { +// span.RecordError(err) +// return fmt.Errorf("failed to stop container: %w", err) +// } + +// // Update VM state +// d.mutex.Lock() +// vm.State = metaldv1.VmState_VM_STATE_SHUTDOWN +// d.mutex.Unlock() + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM shutdown successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// return nil +// } + +// // PauseVM pauses a Docker container +// func (d *DockerBackend) PauseVM(ctx context.Context, vmID string) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.pause_vm", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.Lock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.Unlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// return err +// } + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "pausing VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// // Pause container +// if err := d.dockerClient.ContainerPause(ctx, vm.ContainerID); err != nil { +// span.RecordError(err) +// return fmt.Errorf("failed to pause container: %w", err) +// } + +// // Update VM state +// d.mutex.Lock() +// vm.State = metaldv1.VmState_VM_STATE_PAUSED +// d.mutex.Unlock() + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM paused successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// return nil +// } + +// // ResumeVM resumes a paused Docker container +// func (d *DockerBackend) ResumeVM(ctx context.Context, vmID string) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.resume_vm", +// 
trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.Lock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.Unlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// return err +// } + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "resuming VM with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// // Unpause container +// if err := d.dockerClient.ContainerUnpause(ctx, vm.ContainerID); err != nil { +// span.RecordError(err) +// return fmt.Errorf("failed to unpause container: %w", err) +// } + +// // Update VM state +// d.mutex.Lock() +// vm.State = metaldv1.VmState_VM_STATE_RUNNING +// d.mutex.Unlock() + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM resumed successfully with Docker backend", +// slog.String("vm_id", vmID), +// slog.String("container_id", vm.ContainerID), +// ) + +// return nil +// } + +// // RebootVM restarts a Docker container +// func (d *DockerBackend) RebootVM(ctx context.Context, vmID string) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.reboot_vm", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "rebooting VM with Docker backend", +// slog.String("vm_id", vmID), +// ) + +// // Shutdown the VM +// if err := d.ShutdownVMWithOptions(ctx, vmID, false, 30); err != nil { +// span.RecordError(err) +// return fmt.Errorf("failed to shutdown VM for reboot: %w", err) +// } + +// // Wait a moment +// time.Sleep(1 * time.Second) + +// // Boot the VM again +// if err := d.BootVM(ctx, vmID); err != nil { +// span.RecordError(err) +// return fmt.Errorf("failed to boot VM after shutdown: %w", err) +// } + +// d.logger.LogAttrs(ctx, slog.LevelInfo, "VM rebooted successfully with Docker backend", +// slog.String("vm_id", vmID), +// ) + +// return nil +// } + +// // GetVMInfo returns information about a VM +// func (d 
*DockerBackend) GetVMInfo(ctx context.Context, vmID string) (*backendtypes.VMInfo, error) { +// _, span := d.tracer.Start(ctx, "metald.docker.get_vm_info", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.RLock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.RUnlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// return nil, err +// } + +// info := &backendtypes.VMInfo{ +// Config: vm.Config, +// State: vm.State, +// // NetworkInfo: vm.NetworkInfo, +// } + +// return info, nil +// } + +// // GetVMMetrics returns metrics for a VM +// func (d *DockerBackend) GetVMMetrics(ctx context.Context, vmID string) (*backendtypes.VMMetrics, error) { +// ctx, span := d.tracer.Start(ctx, "metald.docker.get_vm_metrics", +// trace.WithAttributes(attribute.String("vm_id", vmID)), +// ) +// defer span.End() + +// d.mutex.RLock() +// vm, exists := d.vmRegistry[vmID] +// d.mutex.RUnlock() + +// if !exists { +// err := fmt.Errorf("vm %s not found", vmID) +// span.RecordError(err) +// return nil, err +// } + +// // Get container stats +// stats, err := d.dockerClient.ContainerStats(ctx, vm.ContainerID, false) +// if err != nil { +// span.RecordError(err) +// return nil, fmt.Errorf("failed to get container stats: %w", err) +// } +// defer stats.Body.Close() + +// // Parse stats +// var dockerStats container.StatsResponse +// if err := json.NewDecoder(stats.Body).Decode(&dockerStats); err != nil { +// span.RecordError(err) +// return nil, fmt.Errorf("failed to decode container stats: %w", err) +// } + +// // Convert to VM metrics +// metrics := &backendtypes.VMMetrics{ +// Timestamp: time.Now(), +// CpuTimeNanos: safeUint64ToInt64(dockerStats.CPUStats.CPUUsage.TotalUsage), +// MemoryUsageBytes: safeUint64ToInt64(dockerStats.MemoryStats.Usage), +// DiskReadBytes: 0, // TODO: Calculate from BlkioStats +// DiskWriteBytes: 0, // TODO: Calculate from BlkioStats +// NetworkRxBytes: 0, // TODO: 
Calculate from NetworkStats +// NetworkTxBytes: 0, // TODO: Calculate from NetworkStats +// } + +// // Calculate disk I/O +// for _, blkio := range dockerStats.BlkioStats.IoServiceBytesRecursive { +// if blkio.Op == "Read" { +// metrics.DiskReadBytes += safeUint64ToInt64(blkio.Value) +// } else if blkio.Op == "Write" { +// metrics.DiskWriteBytes += safeUint64ToInt64(blkio.Value) +// } +// } + +// // Calculate network I/O +// if dockerStats.Networks != nil { +// for _, netStats := range dockerStats.Networks { +// metrics.NetworkRxBytes += safeUint64ToInt64(netStats.RxBytes) +// metrics.NetworkTxBytes += safeUint64ToInt64(netStats.TxBytes) +// } +// } + +// return metrics, nil +// } + +// // Ping checks if the Docker backend is healthy +// func (d *DockerBackend) Ping(ctx context.Context) error { +// ctx, span := d.tracer.Start(ctx, "metald.docker.ping") +// defer span.End() + +// d.logger.DebugContext(ctx, "pinging Docker backend") + +// // Ping Docker daemon +// if _, err := d.dockerClient.Ping(ctx); err != nil { +// span.RecordError(err) +// return fmt.Errorf("Docker daemon not available: %w", err) +// } + +// return nil +// } + +// // Helper methods + +// // generateVMID generates a unique VM ID +// func (d *DockerBackend) generateVMID() string { +// return fmt.Sprintf("vm-%d", time.Now().UnixNano()) +// } + +// // vmConfigToContainerSpec converts VM configuration to container specification +// func (d *DockerBackend) vmConfigToContainerSpec(ctx context.Context, vmID string, config *metaldv1.VmConfig) (*ContainerSpec, error) { +// d.logger.DebugContext(ctx, "converting VM config to container spec", "vm_id", vmID) + +// spec := &ContainerSpec{ +// Labels: map[string]string{ +// "unkey.vm.id": vmID, +// "unkey.vm.created_by": "metald", +// }, +// Memory: int64(config.GetMemorySizeMib()), +// CPUs: float64(config.GetVcpuCount()), +// } + +// // Docker image must be specified in metadata +// dockerImage, ok := config.GetMetadata()["docker_image"] +// if !ok || 
dockerImage == "" { +// return nil, fmt.Errorf("docker_image must be specified in VM config metadata") +// } +// spec.Image = dockerImage + +// // Extract exposed ports from metadata +// if exposedPorts, ok := config.GetMetadata()["exposed_ports"]; ok { +// ports := strings.Split(exposedPorts, ",") +// for _, port := range ports { +// if port = strings.TrimSpace(port); port != "" { +// spec.ExposedPorts = append(spec.ExposedPorts, port) +// } +// } +// } + +// // Extract environment variables from metadata +// if envVars, ok := config.GetMetadata()["env_vars"]; ok { +// vars := strings.Split(envVars, ",") +// for _, envVar := range vars { +// if envVar = strings.TrimSpace(envVar); envVar != "" { +// spec.Env = append(spec.Env, envVar) +// } +// } +// } + +// // Allocate host ports for exposed ports +// for _, exposedPort := range spec.ExposedPorts { +// containerPort, err := strconv.Atoi(strings.Split(exposedPort, "/")[0]) +// if err != nil { +// continue +// } + +// protocol := "tcp" +// if strings.Contains(exposedPort, "/udp") { +// protocol = "udp" +// } + +// // We'll allocate the port during container creation with retry logic +// spec.PortMappings = append(spec.PortMappings, PortMapping{ +// ContainerPort: containerPort, +// HostPort: 0, // Will be allocated during creation +// Protocol: protocol, +// }) +// } + +// return spec, nil +// } + +// // createContainer creates a Docker container from the specification +// func (d *DockerBackend) createContainer(ctx context.Context, spec *ContainerSpec) (string, error) { +// ctx, span := d.tracer.Start(ctx, "metald.docker.create_container", +// trace.WithAttributes(attribute.String("image", spec.Image)), +// ) +// defer span.End() + +// d.logger.InfoContext(ctx, "checking if image exists locally", "image", spec.Image) +// _, err := d.dockerClient.ImageInspect(ctx, spec.Image) +// if err != nil { +// d.logger.InfoContext(ctx, "image not found locally, pulling image", "image", spec.Image, "error", err.Error()) +// 
pullResponse, err := d.dockerClient.ImagePull(ctx, spec.Image, image.PullOptions{}) +// if err != nil { +// return "", fmt.Errorf("failed to pull image %s: %w", spec.Image, err) +// } +// defer pullResponse.Close() + +// // Read the pull response to completion to ensure pull finishes +// _, err = io.ReadAll(pullResponse) +// if err != nil { +// return "", fmt.Errorf("failed to read pull response for image %s: %w", spec.Image, err) +// } + +// d.logger.InfoContext(ctx, "image pulled successfully", "image", spec.Image) +// } else { +// d.logger.InfoContext(ctx, "image found locally, skipping pull", "image", spec.Image) +// } + +// // Build container configuration +// config := &container.Config{ +// Image: spec.Image, +// Cmd: spec.Cmd, +// Env: spec.Env, +// Labels: spec.Labels, +// ExposedPorts: make(nat.PortSet), +// WorkingDir: spec.WorkingDir, +// } + +// // Log the container command for debugging +// d.logger.InfoContext(ctx, "container configuration", "image", spec.Image, "cmd", config.Cmd, "env", config.Env) + +// // Set up exposed ports +// for _, mapping := range spec.PortMappings { +// port := nat.Port(fmt.Sprintf("%d/%s", mapping.ContainerPort, mapping.Protocol)) +// config.ExposedPorts[port] = struct{}{} +// } + +// // Build host configuration +// hostConfig := &container.HostConfig{ +// PortBindings: make(nat.PortMap), +// AutoRemove: false, // Don't auto-remove containers for debugging +// Privileged: d.config.Privileged, +// Resources: container.Resources{ +// Memory: spec.Memory, +// NanoCPUs: int64(spec.CPUs * 1e9), +// }, +// } + +// // Set up port bindings with retry logic +// maxRetries := 5 +// for retry := 0; retry < maxRetries; retry++ { +// // Clear previous port bindings +// hostConfig.PortBindings = make(nat.PortMap) + +// // Allocate ports for this attempt +// var allocatedPorts []int +// portAllocationFailed := false + +// for i, mapping := range spec.PortMappings { +// if mapping.HostPort == 0 { +// // Allocate a new port +// hostPort, 
err := d.portAllocator.allocatePort(spec.Labels["unkey.vm.id"]) +// if err != nil { +// // Release any ports allocated in this attempt +// for _, port := range allocatedPorts { +// d.portAllocator.releasePort(port, spec.Labels["unkey.vm.id"]) +// } +// portAllocationFailed = true +// break +// } +// spec.PortMappings[i].HostPort = hostPort +// allocatedPorts = append(allocatedPorts, hostPort) +// } + +// containerPort := nat.Port(fmt.Sprintf("%d/%s", mapping.ContainerPort, mapping.Protocol)) +// hostConfig.PortBindings[containerPort] = []nat.PortBinding{ +// { +// HostIP: "0.0.0.0", +// HostPort: strconv.Itoa(spec.PortMappings[i].HostPort), +// }, +// } +// } + +// if portAllocationFailed { +// continue // Try again with new ports +// } + +// // Create container +// containerName := d.config.ContainerPrefix + spec.Labels["unkey.vm.id"] +// resp, err := d.dockerClient.ContainerCreate(ctx, config, hostConfig, nil, nil, containerName) +// if err != nil { +// // If it's a port binding error, release ports and try again +// if strings.Contains(err.Error(), "port is already allocated") || strings.Contains(err.Error(), "bind") { +// for _, port := range allocatedPorts { +// d.portAllocator.releasePort(port, spec.Labels["unkey.vm.id"]) +// } +// d.logger.WarnContext(ctx, "port binding failed, retrying with new ports", "error", err, "retry", retry+1) +// continue +// } +// // Other errors are not retryable +// span.RecordError(err) +// return "", fmt.Errorf("failed to create container: %w", err) +// } + +// // Success! 
+// span.SetAttributes(attribute.String("container_id", resp.ID)) +// return resp.ID, nil +// } + +// // If we get here, all retries failed +// return "", fmt.Errorf("failed to create container after %d retries due to port conflicts", maxRetries) +// } + +// // getContainerNetworkInfo gets network information for a container +// func (d *DockerBackend) getContainerNetworkInfo(ctx context.Context, containerID string) (*metaldv1.VmNetworkInfo, error) { +// ctx, span := d.tracer.Start(ctx, "metald.docker.get_network_info", +// trace.WithAttributes(attribute.String("container_id", containerID)), +// ) +// defer span.End() + +// // Inspect container +// inspect, err := d.dockerClient.ContainerInspect(ctx, containerID) +// if err != nil { +// span.RecordError(err) +// return nil, fmt.Errorf("failed to inspect container: %w", err) +// } + +// // Get network info from default network +// var networkInfo *metaldv1.VmNetworkInfo +// if inspect.NetworkSettings != nil && inspect.NetworkSettings.Networks != nil { +// for networkName, network := range inspect.NetworkSettings.Networks { +// if network.IPAddress != "" { +// networkInfo = &metaldv1.VmNetworkInfo{ +// IpAddress: network.IPAddress, +// MacAddress: network.MacAddress, +// TapDevice: networkName, // Use network name as tap device +// } +// break +// } +// } +// } + +// // Add port mappings from container inspect +// if inspect.NetworkSettings != nil && inspect.NetworkSettings.Ports != nil { +// var portMappings []*metaldv1.PortMapping +// for containerPort, hostBindings := range inspect.NetworkSettings.Ports { +// if len(hostBindings) > 0 { +// // Parse container port (e.g., "3000/tcp" -> 3000) +// portStr := strings.Split(string(containerPort), "/")[0] +// containerPortNum, err := strconv.Atoi(portStr) +// if err != nil { +// continue +// } + +// // Get protocol (tcp/udp) +// protocol := "tcp" +// if strings.Contains(string(containerPort), "/udp") { +// protocol = "udp" +// } + +// // Add mapping for each host binding 
+// for _, hostBinding := range hostBindings { +// hostPortNum, err := strconv.Atoi(hostBinding.HostPort) +// if err != nil { +// continue +// } + +// portMappings = append(portMappings, &metaldv1.PortMapping{ +// ContainerPort: safeIntToInt32(containerPortNum), +// HostPort: safeIntToInt32(hostPortNum), +// Protocol: protocol, +// }) +// } +// } +// } + +// // Initialize networkInfo if it doesn't exist +// if networkInfo == nil { +// networkInfo = &metaldv1.VmNetworkInfo{} +// } +// networkInfo.PortMappings = portMappings +// } + +// return networkInfo, nil +// } + +// // Port allocator methods + +// // allocatePort allocates a port for a VM +// func (pa *portAllocator) allocatePort(vmID string) (int, error) { +// pa.mutex.Lock() +// defer pa.mutex.Unlock() + +// // Try random ports to avoid conflicts using crypto/rand for security +// maxAttempts := 100 +// for attempt := 0; attempt < maxAttempts; attempt++ { +// // Generate cryptographically secure random number +// portRange := int64(pa.maxPort - pa.minPort + 1) +// randomOffset, err := rand.Int(rand.Reader, big.NewInt(portRange)) +// if err != nil { +// // Fallback to sequential allocation if crypto/rand fails +// for port := pa.minPort; port <= pa.maxPort; port++ { +// if _, exists := pa.allocated[port]; !exists { +// pa.allocated[port] = vmID +// return port, nil +// } +// } +// return 0, fmt.Errorf("failed to generate random port and no sequential ports available: %w", err) +// } + +// port := int(randomOffset.Int64()) + pa.minPort +// if _, exists := pa.allocated[port]; !exists { +// pa.allocated[port] = vmID +// return port, nil +// } +// } + +// return 0, fmt.Errorf("no available ports in range %d-%d after %d attempts", pa.minPort, pa.maxPort, maxAttempts) +// } + +// // releasePort releases a port from a VM +// func (pa *portAllocator) releasePort(port int, vmID string) { +// pa.mutex.Lock() +// defer pa.mutex.Unlock() + +// if allocated, exists := pa.allocated[port]; exists && allocated == vmID { +// 
delete(pa.allocated, port) +// } +// } + +// // Ensure DockerBackend implements Backend interface +// var _ backendtypes.Backend = (*DockerBackend)(nil) diff --git a/go/deploy/metald/internal/backend/docker/metrics.go b/go/deploy/metald/internal/backend/docker/metrics.go index 24f81cd937..eca06953e2 100644 --- a/go/deploy/metald/internal/backend/docker/metrics.go +++ b/go/deploy/metald/internal/backend/docker/metrics.go @@ -1,324 +1,324 @@ package docker -import ( - "context" - "encoding/json" - "fmt" - "log/slog" - "time" - - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/client" - backendtypes "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" -) - -// MetricsCollector collects metrics from Docker containers -type MetricsCollector struct { - logger *slog.Logger - dockerClient *client.Client - tracer trace.Tracer -} - -// NewMetricsCollector creates a new metrics collector -func NewMetricsCollector(logger *slog.Logger, dockerClient *client.Client, tracer trace.Tracer) *MetricsCollector { - return &MetricsCollector{ - logger: logger.With("component", "docker-metrics"), - dockerClient: dockerClient, - tracer: tracer, - } -} - -// CollectMetrics collects metrics for a specific container -func (mc *MetricsCollector) CollectMetrics(ctx context.Context, containerID string) (*backendtypes.VMMetrics, error) { - ctx, span := mc.tracer.Start(ctx, "metald.docker.collect_metrics", - trace.WithAttributes(attribute.String("container_id", containerID)), - ) - defer span.End() - - // Get container stats (single read, not streaming) - stats, err := mc.dockerClient.ContainerStats(ctx, containerID, false) - if err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to get container stats: %w", err) - } - defer stats.Body.Close() - - // Parse stats JSON - var dockerStats container.StatsResponse - if err := 
json.NewDecoder(stats.Body).Decode(&dockerStats); err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to decode container stats: %w", err) - } - - // Convert to VM metrics - metrics := mc.convertDockerStatsToVMMetrics(&dockerStats) - - mc.logger.DebugContext(ctx, "collected container metrics", - slog.String("container_id", containerID), - slog.Int64("cpu_time_nanos", metrics.CpuTimeNanos), - slog.Int64("memory_usage_bytes", metrics.MemoryUsageBytes), - slog.Int64("disk_read_bytes", metrics.DiskReadBytes), - slog.Int64("disk_write_bytes", metrics.DiskWriteBytes), - slog.Int64("network_rx_bytes", metrics.NetworkRxBytes), - slog.Int64("network_tx_bytes", metrics.NetworkTxBytes), - ) - - return metrics, nil -} - -// convertDockerStatsToVMMetrics converts Docker stats to VM metrics format -func (mc *MetricsCollector) convertDockerStatsToVMMetrics(stats *container.StatsResponse) *backendtypes.VMMetrics { - metrics := &backendtypes.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 0, - MemoryUsageBytes: 0, - DiskReadBytes: 0, - DiskWriteBytes: 0, - NetworkRxBytes: 0, - NetworkTxBytes: 0, - } - - // CPU metrics - if stats.CPUStats.CPUUsage.TotalUsage > 0 { - metrics.CpuTimeNanos = int64(stats.CPUStats.CPUUsage.TotalUsage) - } - - // Memory metrics - if stats.MemoryStats.Usage > 0 { - metrics.MemoryUsageBytes = int64(stats.MemoryStats.Usage) - } - - // Disk I/O metrics - if stats.BlkioStats.IoServiceBytesRecursive != nil { - for _, blkio := range stats.BlkioStats.IoServiceBytesRecursive { - switch blkio.Op { - case "Read": - metrics.DiskReadBytes += int64(blkio.Value) - case "Write": - metrics.DiskWriteBytes += int64(blkio.Value) - } - } - } - - // Network I/O metrics - if stats.Networks != nil { - for _, netStats := range stats.Networks { - metrics.NetworkRxBytes += int64(netStats.RxBytes) - metrics.NetworkTxBytes += int64(netStats.TxBytes) - } - } - - return metrics -} - -// CollectBulkMetrics collects metrics for multiple containers -func (mc 
*MetricsCollector) CollectBulkMetrics(ctx context.Context, containerIDs []string) (map[string]*backendtypes.VMMetrics, error) { - ctx, span := mc.tracer.Start(ctx, "metald.docker.collect_bulk_metrics", - trace.WithAttributes(attribute.Int("container_count", len(containerIDs))), - ) - defer span.End() - - results := make(map[string]*backendtypes.VMMetrics) - - for _, containerID := range containerIDs { - metrics, err := mc.CollectMetrics(ctx, containerID) - if err != nil { - mc.logger.WarnContext(ctx, "failed to collect metrics for container", - slog.String("container_id", containerID), - slog.String("error", err.Error()), - ) - continue - } - results[containerID] = metrics - } - - span.SetAttributes(attribute.Int("successful_collections", len(results))) - return results, nil -} - -// StreamMetrics streams metrics for a container (for real-time monitoring) -func (mc *MetricsCollector) StreamMetrics(ctx context.Context, containerID string, interval time.Duration) (<-chan *backendtypes.VMMetrics, <-chan error) { - metricsChan := make(chan *backendtypes.VMMetrics, 1) - errorChan := make(chan error, 1) - - go func() { - defer close(metricsChan) - defer close(errorChan) - - ticker := time.NewTicker(interval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - metrics, err := mc.CollectMetrics(ctx, containerID) - if err != nil { - select { - case errorChan <- err: - case <-ctx.Done(): - return - } - continue - } - - select { - case metricsChan <- metrics: - case <-ctx.Done(): - return - } - } - } - }() - - return metricsChan, errorChan -} - -// CalculateCPUPercent calculates CPU percentage from Docker stats -func (mc *MetricsCollector) CalculateCPUPercent(current, previous *container.StatsResponse) float64 { - if current == nil || previous == nil { - return 0.0 - } - - // Calculate CPU delta - cpuDelta := float64(current.CPUStats.CPUUsage.TotalUsage - previous.CPUStats.CPUUsage.TotalUsage) - - // Calculate system CPU delta - 
systemDelta := float64(current.CPUStats.SystemUsage - previous.CPUStats.SystemUsage) - - // Calculate number of CPUs - onlineCPUs := float64(current.CPUStats.OnlineCPUs) - if onlineCPUs == 0 { - onlineCPUs = float64(len(current.CPUStats.CPUUsage.PercpuUsage)) - } - - if systemDelta > 0 && cpuDelta > 0 { - return (cpuDelta / systemDelta) * onlineCPUs * 100.0 - } - - return 0.0 -} - -// CalculateMemoryPercent calculates memory percentage from Docker stats -func (mc *MetricsCollector) CalculateMemoryPercent(stats *container.StatsResponse) float64 { - if stats == nil || stats.MemoryStats.Limit == 0 { - return 0.0 - } - - // Calculate memory usage percentage - usage := float64(stats.MemoryStats.Usage) - limit := float64(stats.MemoryStats.Limit) - - return (usage / limit) * 100.0 -} - -// CalculateNetworkIORate calculates network I/O rate from Docker stats -func (mc *MetricsCollector) CalculateNetworkIORate(current, previous *container.StatsResponse, timeDelta time.Duration) (rxRate, txRate float64) { - if current == nil || previous == nil || timeDelta == 0 { - return 0.0, 0.0 - } - - var currentRx, currentTx, previousRx, previousTx uint64 - - // Sum network stats from all interfaces - for _, netStats := range current.Networks { - currentRx += netStats.RxBytes - currentTx += netStats.TxBytes - } - - for _, netStats := range previous.Networks { - previousRx += netStats.RxBytes - previousTx += netStats.TxBytes - } - - // Calculate rates (bytes per second) - seconds := timeDelta.Seconds() - rxRate = float64(currentRx-previousRx) / seconds - txRate = float64(currentTx-previousTx) / seconds - - return rxRate, txRate -} - -// CalculateBlockIORate calculates block I/O rate from Docker stats -func (mc *MetricsCollector) CalculateBlockIORate(current, previous *container.StatsResponse, timeDelta time.Duration) (readRate, writeRate float64) { - if current == nil || previous == nil || timeDelta == 0 { - return 0.0, 0.0 - } - - var currentRead, currentWrite, previousRead, 
previousWrite uint64 - - // Sum block I/O stats - for _, blkio := range current.BlkioStats.IoServiceBytesRecursive { - switch blkio.Op { - case "Read": - currentRead += blkio.Value - case "Write": - currentWrite += blkio.Value - } - } - - for _, blkio := range previous.BlkioStats.IoServiceBytesRecursive { - switch blkio.Op { - case "Read": - previousRead += blkio.Value - case "Write": - previousWrite += blkio.Value - } - } - - // Calculate rates (bytes per second) - seconds := timeDelta.Seconds() - readRate = float64(currentRead-previousRead) / seconds - writeRate = float64(currentWrite-previousWrite) / seconds - - return readRate, writeRate -} - -// GetContainerResourceLimits gets resource limits for a container -func (mc *MetricsCollector) GetContainerResourceLimits(ctx context.Context, containerID string) (*ResourceLimits, error) { - ctx, span := mc.tracer.Start(ctx, "metald.docker.get_resource_limits", - trace.WithAttributes(attribute.String("container_id", containerID)), - ) - defer span.End() - - // Inspect container to get resource limits - inspect, err := mc.dockerClient.ContainerInspect(ctx, containerID) - if err != nil { - span.RecordError(err) - return nil, fmt.Errorf("failed to inspect container: %w", err) - } - - limits := &ResourceLimits{ - Memory: inspect.HostConfig.Memory, - NanoCPUs: inspect.HostConfig.NanoCPUs, - } - - return limits, nil -} - -// ResourceLimits represents resource limits for a container -type ResourceLimits struct { - Memory int64 // Memory limit in bytes - NanoCPUs int64 // CPU limit in nano CPUs -} - -// GetCPULimit returns CPU limit as number of CPUs -func (rl *ResourceLimits) GetCPULimit() float64 { - return float64(rl.NanoCPUs) / 1e9 -} - -// GetMemoryLimit returns memory limit in bytes -func (rl *ResourceLimits) GetMemoryLimit() int64 { - return rl.Memory -} - -// AIDEV-NOTE: Docker metrics collection provides comprehensive monitoring -// capabilities for containers treated as VMs. Key features: -// 1. 
Real-time metrics collection via Docker stats API -// 2. CPU, memory, disk I/O, and network I/O monitoring -// 3. Streaming metrics for continuous monitoring -// 4. Resource limit awareness for accurate percentage calculations -// 5. Bulk metrics collection for efficient monitoring of multiple containers \ No newline at end of file +// import ( +// "context" +// "encoding/json" +// "fmt" +// "log/slog" +// "time" + +// "github.com/docker/docker/api/types/container" +// "github.com/docker/docker/client" +// backendtypes "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" +// "go.opentelemetry.io/otel/attribute" +// "go.opentelemetry.io/otel/trace" +// ) + +// // MetricsCollector collects metrics from Docker containers +// type MetricsCollector struct { +// logger *slog.Logger +// dockerClient *client.Client +// tracer trace.Tracer +// } + +// // NewMetricsCollector creates a new metrics collector +// func NewMetricsCollector(logger *slog.Logger, dockerClient *client.Client, tracer trace.Tracer) *MetricsCollector { +// return &MetricsCollector{ +// logger: logger.With("component", "docker-metrics"), +// dockerClient: dockerClient, +// tracer: tracer, +// } +// } + +// // CollectMetrics collects metrics for a specific container +// func (mc *MetricsCollector) CollectMetrics(ctx context.Context, containerID string) (*backendtypes.VMMetrics, error) { +// ctx, span := mc.tracer.Start(ctx, "metald.docker.collect_metrics", +// trace.WithAttributes(attribute.String("container_id", containerID)), +// ) +// defer span.End() + +// // Get container stats (single read, not streaming) +// stats, err := mc.dockerClient.ContainerStats(ctx, containerID, false) +// if err != nil { +// span.RecordError(err) +// return nil, fmt.Errorf("failed to get container stats: %w", err) +// } +// defer stats.Body.Close() + +// // Parse stats JSON +// var dockerStats container.StatsResponse +// if err := json.NewDecoder(stats.Body).Decode(&dockerStats); err != nil { +// 
span.RecordError(err) +// return nil, fmt.Errorf("failed to decode container stats: %w", err) +// } + +// // Convert to VM metrics +// metrics := mc.convertDockerStatsToVMMetrics(&dockerStats) + +// mc.logger.DebugContext(ctx, "collected container metrics", +// slog.String("container_id", containerID), +// slog.Int64("cpu_time_nanos", metrics.CpuTimeNanos), +// slog.Int64("memory_usage_bytes", metrics.MemoryUsageBytes), +// slog.Int64("disk_read_bytes", metrics.DiskReadBytes), +// slog.Int64("disk_write_bytes", metrics.DiskWriteBytes), +// slog.Int64("network_rx_bytes", metrics.NetworkRxBytes), +// slog.Int64("network_tx_bytes", metrics.NetworkTxBytes), +// ) + +// return metrics, nil +// } + +// // convertDockerStatsToVMMetrics converts Docker stats to VM metrics format +// func (mc *MetricsCollector) convertDockerStatsToVMMetrics(stats *container.StatsResponse) *backendtypes.VMMetrics { +// metrics := &backendtypes.VMMetrics{ +// Timestamp: time.Now(), +// CpuTimeNanos: 0, +// MemoryUsageBytes: 0, +// DiskReadBytes: 0, +// DiskWriteBytes: 0, +// NetworkRxBytes: 0, +// NetworkTxBytes: 0, +// } + +// // CPU metrics +// if stats.CPUStats.CPUUsage.TotalUsage > 0 { +// metrics.CpuTimeNanos = safeUint64ToInt64(stats.CPUStats.CPUUsage.TotalUsage) +// } + +// // Memory metrics +// if stats.MemoryStats.Usage > 0 { +// metrics.MemoryUsageBytes = safeUint64ToInt64(stats.MemoryStats.Usage) +// } + +// // Disk I/O metrics +// if stats.BlkioStats.IoServiceBytesRecursive != nil { +// for _, blkio := range stats.BlkioStats.IoServiceBytesRecursive { +// switch blkio.Op { +// case "Read": +// metrics.DiskReadBytes += safeUint64ToInt64(blkio.Value) +// case "Write": +// metrics.DiskWriteBytes += safeUint64ToInt64(blkio.Value) +// } +// } +// } + +// // Network I/O metrics +// if stats.Networks != nil { +// for _, netStats := range stats.Networks { +// metrics.NetworkRxBytes += safeUint64ToInt64(netStats.RxBytes) +// metrics.NetworkTxBytes += safeUint64ToInt64(netStats.TxBytes) +// 
} +// } + +// return metrics +// } + +// // CollectBulkMetrics collects metrics for multiple containers +// func (mc *MetricsCollector) CollectBulkMetrics(ctx context.Context, containerIDs []string) (map[string]*backendtypes.VMMetrics, error) { +// ctx, span := mc.tracer.Start(ctx, "metald.docker.collect_bulk_metrics", +// trace.WithAttributes(attribute.Int("container_count", len(containerIDs))), +// ) +// defer span.End() + +// results := make(map[string]*backendtypes.VMMetrics) + +// for _, containerID := range containerIDs { +// metrics, err := mc.CollectMetrics(ctx, containerID) +// if err != nil { +// mc.logger.WarnContext(ctx, "failed to collect metrics for container", +// slog.String("container_id", containerID), +// slog.String("error", err.Error()), +// ) +// continue +// } +// results[containerID] = metrics +// } + +// span.SetAttributes(attribute.Int("successful_collections", len(results))) +// return results, nil +// } + +// // StreamMetrics streams metrics for a container (for real-time monitoring) +// func (mc *MetricsCollector) StreamMetrics(ctx context.Context, containerID string, interval time.Duration) (<-chan *backendtypes.VMMetrics, <-chan error) { +// metricsChan := make(chan *backendtypes.VMMetrics, 1) +// errorChan := make(chan error, 1) + +// go func() { +// defer close(metricsChan) +// defer close(errorChan) + +// ticker := time.NewTicker(interval) +// defer ticker.Stop() + +// for { +// select { +// case <-ctx.Done(): +// return +// case <-ticker.C: +// metrics, err := mc.CollectMetrics(ctx, containerID) +// if err != nil { +// select { +// case errorChan <- err: +// case <-ctx.Done(): +// return +// } +// continue +// } + +// select { +// case metricsChan <- metrics: +// case <-ctx.Done(): +// return +// } +// } +// } +// }() + +// return metricsChan, errorChan +// } + +// // CalculateCPUPercent calculates CPU percentage from Docker stats +// func (mc *MetricsCollector) CalculateCPUPercent(current, previous *container.StatsResponse) 
float64 { +// if current == nil || previous == nil { +// return 0.0 +// } + +// // Calculate CPU delta +// cpuDelta := float64(current.CPUStats.CPUUsage.TotalUsage - previous.CPUStats.CPUUsage.TotalUsage) + +// // Calculate system CPU delta +// systemDelta := float64(current.CPUStats.SystemUsage - previous.CPUStats.SystemUsage) + +// // Calculate number of CPUs +// onlineCPUs := float64(current.CPUStats.OnlineCPUs) +// if onlineCPUs == 0 { +// onlineCPUs = float64(len(current.CPUStats.CPUUsage.PercpuUsage)) +// } + +// if systemDelta > 0 && cpuDelta > 0 { +// return (cpuDelta / systemDelta) * onlineCPUs * 100.0 +// } + +// return 0.0 +// } + +// // CalculateMemoryPercent calculates memory percentage from Docker stats +// func (mc *MetricsCollector) CalculateMemoryPercent(stats *container.StatsResponse) float64 { +// if stats == nil || stats.MemoryStats.Limit == 0 { +// return 0.0 +// } + +// // Calculate memory usage percentage +// usage := float64(stats.MemoryStats.Usage) +// limit := float64(stats.MemoryStats.Limit) + +// return (usage / limit) * 100.0 +// } + +// // CalculateNetworkIORate calculates network I/O rate from Docker stats +// func (mc *MetricsCollector) CalculateNetworkIORate(current, previous *container.StatsResponse, timeDelta time.Duration) (rxRate, txRate float64) { +// if current == nil || previous == nil || timeDelta == 0 { +// return 0.0, 0.0 +// } + +// var currentRx, currentTx, previousRx, previousTx uint64 + +// // Sum network stats from all interfaces +// for _, netStats := range current.Networks { +// currentRx += netStats.RxBytes +// currentTx += netStats.TxBytes +// } + +// for _, netStats := range previous.Networks { +// previousRx += netStats.RxBytes +// previousTx += netStats.TxBytes +// } + +// // Calculate rates (bytes per second) +// seconds := timeDelta.Seconds() +// rxRate = float64(currentRx-previousRx) / seconds +// txRate = float64(currentTx-previousTx) / seconds + +// return rxRate, txRate +// } + +// // CalculateBlockIORate 
calculates block I/O rate from Docker stats +// func (mc *MetricsCollector) CalculateBlockIORate(current, previous *container.StatsResponse, timeDelta time.Duration) (readRate, writeRate float64) { +// if current == nil || previous == nil || timeDelta == 0 { +// return 0.0, 0.0 +// } + +// var currentRead, currentWrite, previousRead, previousWrite uint64 + +// // Sum block I/O stats +// for _, blkio := range current.BlkioStats.IoServiceBytesRecursive { +// switch blkio.Op { +// case "Read": +// currentRead += blkio.Value +// case "Write": +// currentWrite += blkio.Value +// } +// } + +// for _, blkio := range previous.BlkioStats.IoServiceBytesRecursive { +// switch blkio.Op { +// case "Read": +// previousRead += blkio.Value +// case "Write": +// previousWrite += blkio.Value +// } +// } + +// // Calculate rates (bytes per second) +// seconds := timeDelta.Seconds() +// readRate = float64(currentRead-previousRead) / seconds +// writeRate = float64(currentWrite-previousWrite) / seconds + +// return readRate, writeRate +// } + +// // GetContainerResourceLimits gets resource limits for a container +// func (mc *MetricsCollector) GetContainerResourceLimits(ctx context.Context, containerID string) (*ResourceLimits, error) { +// ctx, span := mc.tracer.Start(ctx, "metald.docker.get_resource_limits", +// trace.WithAttributes(attribute.String("container_id", containerID)), +// ) +// defer span.End() + +// // Inspect container to get resource limits +// inspect, err := mc.dockerClient.ContainerInspect(ctx, containerID) +// if err != nil { +// span.RecordError(err) +// return nil, fmt.Errorf("failed to inspect container: %w", err) +// } + +// limits := &ResourceLimits{ +// Memory: inspect.HostConfig.Memory, +// NanoCPUs: inspect.HostConfig.NanoCPUs, +// } + +// return limits, nil +// } + +// // ResourceLimits represents resource limits for a container +// type ResourceLimits struct { +// Memory int64 // Memory limit in bytes +// NanoCPUs int64 // CPU limit in nano CPUs +// } + 
+// // GetCPULimit returns CPU limit as number of CPUs +// func (rl *ResourceLimits) GetCPULimit() float64 { +// return float64(rl.NanoCPUs) / 1e9 +// } + +// // GetMemoryLimit returns memory limit in bytes +// func (rl *ResourceLimits) GetMemoryLimit() int64 { +// return rl.Memory +// } + +// // AIDEV-NOTE: Docker metrics collection provides comprehensive monitoring +// // capabilities for containers treated as VMs. Key features: +// // 1. Real-time metrics collection via Docker stats API +// // 2. CPU, memory, disk I/O, and network I/O monitoring +// // 3. Streaming metrics for continuous monitoring +// // 4. Resource limit awareness for accurate percentage calculations +// // 5. Bulk metrics collection for efficient monitoring of multiple containers diff --git a/go/deploy/metald/internal/backend/docker/types.go b/go/deploy/metald/internal/backend/docker/types.go index 87d26876ed..42e4e4c358 100644 --- a/go/deploy/metald/internal/backend/docker/types.go +++ b/go/deploy/metald/internal/backend/docker/types.go @@ -1,139 +1,137 @@ package docker -import ( - "time" - - "github.com/docker/go-connections/nat" - "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// dockerVM represents a "VM" managed as a Docker container -type dockerVM struct { - ID string - ContainerID string - Config *metaldv1.VmConfig - State metaldv1.VmState - NetworkInfo *metaldv1.VmNetworkInfo - PortMappings []PortMapping - CreatedAt time.Time -} - -// PortMapping represents a port mapping between container and host -type PortMapping struct { - ContainerPort int - HostPort int - Protocol string -} - -// ContainerSpec represents the Docker container specification -type ContainerSpec struct { - Image string - Cmd []string - Env []string - ExposedPorts []string - PortMappings []PortMapping - Labels map[string]string - Memory int64 - CPUs float64 - WorkingDir string -} - -// DockerBackendConfig represents 
configuration for the Docker backend -type DockerBackendConfig struct { - // DockerHost is the Docker daemon socket (defaults to unix:///var/run/docker.sock) - DockerHost string `json:"docker_host,omitempty"` - - // NetworkName is the Docker network to use for containers (defaults to bridge) - NetworkName string `json:"network_name,omitempty"` - - // ContainerPrefix is the prefix for container names (defaults to unkey-vm-) - ContainerPrefix string `json:"container_prefix,omitempty"` - - - // PortRange defines the range of host ports to allocate - PortRange struct { - Min int `json:"min"` - Max int `json:"max"` - } `json:"port_range,omitempty"` - - // AutoRemove determines if containers should be automatically removed on exit - AutoRemove bool `json:"auto_remove,omitempty"` - - // Privileged determines if containers run in privileged mode - Privileged bool `json:"privileged,omitempty"` -} - -// DefaultDockerBackendConfig returns default configuration for Docker backend -func DefaultDockerBackendConfig() *DockerBackendConfig { - return &DockerBackendConfig{ - DockerHost: "", // Use default Docker socket - NetworkName: "bridge", - ContainerPrefix: "unkey-vm-", - PortRange: struct { - Min int `json:"min"` - Max int `json:"max"` - }{ - Min: 30000, - Max: 40000, - }, - AutoRemove: true, - Privileged: false, - } -} - -// DockerMetrics represents metrics collected from Docker stats API -type DockerMetrics struct { - Timestamp time.Time - CPUUsagePercent float64 - MemoryUsageBytes int64 - MemoryLimitBytes int64 - NetworkRxBytes int64 - NetworkTxBytes int64 - BlockReadBytes int64 - BlockWriteBytes int64 - PIDs int64 -} - -// ToVMMetrics converts DockerMetrics to types.VMMetrics -func (dm *DockerMetrics) ToVMMetrics() *types.VMMetrics { - return &types.VMMetrics{ - Timestamp: dm.Timestamp, - CpuTimeNanos: int64(dm.CPUUsagePercent * 1e9), // Convert percentage to nanoseconds approximation - MemoryUsageBytes: dm.MemoryUsageBytes, - DiskReadBytes: dm.BlockReadBytes, - 
DiskWriteBytes: dm.BlockWriteBytes, - NetworkRxBytes: dm.NetworkRxBytes, - NetworkTxBytes: dm.NetworkTxBytes, - } -} - -// ContainerCreateOptions represents options for creating a Docker container -type ContainerCreateOptions struct { - Name string - Image string - Cmd []string - Env []string - Labels map[string]string - ExposedPorts map[string]struct{} - PortBindings map[string][]nat.PortBinding - Memory int64 - CPUs float64 - WorkingDir string - AutoRemove bool - Privileged bool - NetworkMode string -} - -// NetworkCreateOptions represents options for creating a Docker network -type NetworkCreateOptions struct { - Name string - Driver string - Internal bool - Labels map[string]string -} - -// AIDEV-NOTE: Docker backend types provide a clean abstraction layer -// between the VM concepts used by metald and Docker container operations. -// This allows metald to treat Docker containers as VMs while maintaining -// the same interface contract as the Firecracker backend. \ No newline at end of file +// import ( +// "time" + +// "github.com/docker/go-connections/nat" +// "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" +// metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" +// ) + +// // dockerVM represents a "VM" managed as a Docker container +// type dockerVM struct { +// ID string +// ContainerID string +// Config *metaldv1.VmConfig +// State metaldv1.VmState +// PortMappings []PortMapping +// CreatedAt time.Time +// } + +// // PortMapping represents a port mapping between container and host +// type PortMapping struct { +// ContainerPort int +// HostPort int +// Protocol string +// } + +// // ContainerSpec represents the Docker container specification +// type ContainerSpec struct { +// Image string +// Cmd []string +// Env []string +// ExposedPorts []string +// PortMappings []PortMapping +// Labels map[string]string +// Memory int64 +// CPUs float64 +// WorkingDir string +// } + +// // DockerBackendConfig represents configuration for the 
Docker backend +// type DockerBackendConfig struct { +// // DockerHost is the Docker daemon socket (defaults to unix:///var/run/docker.sock) +// DockerHost string `json:"docker_host,omitempty"` + +// // NetworkName is the Docker network to use for containers (defaults to bridge) +// NetworkName string `json:"network_name,omitempty"` + +// // ContainerPrefix is the prefix for container names (defaults to unkey-vm-) +// ContainerPrefix string `json:"container_prefix,omitempty"` + +// // PortRange defines the range of host ports to allocate +// PortRange struct { +// Min int `json:"min"` +// Max int `json:"max"` +// } `json:"port_range,omitempty"` + +// // AutoRemove determines if containers should be automatically removed on exit +// AutoRemove bool `json:"auto_remove,omitempty"` + +// // Privileged determines if containers run in privileged mode +// Privileged bool `json:"privileged,omitempty"` +// } + +// // DefaultDockerBackendConfig returns default configuration for Docker backend +// func DefaultDockerBackendConfig() *DockerBackendConfig { +// return &DockerBackendConfig{ +// DockerHost: "", // Use default Docker socket +// NetworkName: "bridge", +// ContainerPrefix: "unkey-vm-", +// PortRange: struct { +// Min int `json:"min"` +// Max int `json:"max"` +// }{ +// Min: 30000, +// Max: 40000, +// }, +// AutoRemove: true, +// Privileged: false, +// } +// } + +// // DockerMetrics represents metrics collected from Docker stats API +// type DockerMetrics struct { +// Timestamp time.Time +// CPUUsagePercent float64 +// MemoryUsageBytes int64 +// MemoryLimitBytes int64 +// NetworkRxBytes int64 +// NetworkTxBytes int64 +// BlockReadBytes int64 +// BlockWriteBytes int64 +// PIDs int64 +// } + +// // ToVMMetrics converts DockerMetrics to types.VMMetrics +// func (dm *DockerMetrics) ToVMMetrics() *types.VMMetrics { +// return &types.VMMetrics{ +// Timestamp: dm.Timestamp, +// CpuTimeNanos: int64(dm.CPUUsagePercent * 1e9), // Convert percentage to nanoseconds approximation 
+// MemoryUsageBytes: dm.MemoryUsageBytes, +// DiskReadBytes: dm.BlockReadBytes, +// DiskWriteBytes: dm.BlockWriteBytes, +// NetworkRxBytes: dm.NetworkRxBytes, +// NetworkTxBytes: dm.NetworkTxBytes, +// } +// } + +// // ContainerCreateOptions represents options for creating a Docker container +// type ContainerCreateOptions struct { +// Name string +// Image string +// Cmd []string +// Env []string +// Labels map[string]string +// ExposedPorts map[string]struct{} +// PortBindings map[string][]nat.PortBinding +// Memory int64 +// CPUs float64 +// WorkingDir string +// AutoRemove bool +// Privileged bool +// NetworkMode string +// } + +// // NetworkCreateOptions represents options for creating a Docker network +// type NetworkCreateOptions struct { +// Name string +// Driver string +// Internal bool +// Labels map[string]string +// } + +// // AIDEV-NOTE: Docker backend types provide a clean abstraction layer +// // between the VM concepts used by metald and Docker container operations. +// // This allows metald to treat Docker containers as VMs while maintaining +// // the same interface contract as the Firecracker backend. 
diff --git a/go/deploy/metald/internal/backend/firecracker/asset_manager.go b/go/deploy/metald/internal/backend/firecracker/asset_manager.go new file mode 100644 index 0000000000..630d7edd04 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/asset_manager.go @@ -0,0 +1,511 @@ +package firecracker + +import ( + "context" + "crypto/sha256" + "fmt" + "log/slog" + "os" + "path/filepath" + "sort" + "strings" + + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" +) + +// releaseAssetLeases releases all asset leases for a VM +func (c *Client) releaseAssetLeases(ctx context.Context, vmID string) { + if leaseIDs, ok := c.vmAssetLeases[vmID]; ok { + c.logger.LogAttrs(ctx, slog.LevelInfo, "releasing asset leases", + slog.String("vm_id", vmID), + slog.Int("lease_count", len(leaseIDs)), + ) + + for _, leaseID := range leaseIDs { + releaseCtx, releaseSpan := c.tracer.Start(ctx, "metald.firecracker.release_asset", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.String("lease.id", leaseID), + ), + ) + err := c.assetClient.ReleaseAsset(releaseCtx, leaseID) + if err != nil { + releaseSpan.RecordError(err) + releaseSpan.SetStatus(codes.Error, err.Error()) + c.logger.ErrorContext(ctx, "failed to release asset lease", + "vm_id", vmID, + "lease_id", leaseID, + "error", err, + ) + // Continue with other leases even if one fails + } + releaseSpan.End() + } + delete(c.vmAssetLeases, vmID) + } +} + +// acquireAssetLeases acquires leases for VM assets after successful boot +func (c *Client) acquireAssetLeases(ctx context.Context, vmID string, assetMapping *assetMapping) { + if assetMapping == nil || len(assetMapping.AssetIDs()) == 0 { + return + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "acquiring asset leases for VM", + slog.String("vm_id", vmID), + 
slog.Int("asset_count", len(assetMapping.AssetIDs())), + ) + + leaseIDs := []string{} + for _, assetID := range assetMapping.AssetIDs() { + acquireCtx, acquireSpan := c.tracer.Start(ctx, "metald.firecracker.acquire_asset", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.String("asset.id", assetID), + ), + ) + + leaseID, err := c.assetClient.AcquireAsset(acquireCtx, assetID, vmID) + if err != nil { + acquireSpan.RecordError(err) + acquireSpan.SetStatus(codes.Error, err.Error()) + c.logger.ErrorContext(ctx, "failed to acquire asset lease", + "vm_id", vmID, + "asset_id", assetID, + "error", err, + ) + // Continue trying to acquire other leases even if one fails + } else { + acquireSpan.SetAttributes(attribute.String("lease.id", leaseID)) + leaseIDs = append(leaseIDs, leaseID) + } + acquireSpan.End() + } + + // Store lease IDs for cleanup during VM deletion + if len(leaseIDs) > 0 { + c.vmAssetLeases[vmID] = leaseIDs + c.logger.LogAttrs(ctx, slog.LevelInfo, "acquired asset leases", + slog.String("vm_id", vmID), + slog.Int("lease_count", len(leaseIDs)), + ) + } +} + +// generateAssetID generates a deterministic asset ID based on type and labels +func (c *Client) generateAssetID(assetType assetv1.AssetType, labels map[string]string) string { + // Create a deterministic string from sorted labels + var parts []string + parts = append(parts, fmt.Sprintf("type=%s", assetType.String())) + + // Sort label keys for deterministic ordering + var keys []string + for k := range labels { + keys = append(keys, k) + } + sort.Strings(keys) + + // Add sorted labels + for _, k := range keys { + parts = append(parts, fmt.Sprintf("%s=%s", k, labels[k])) + } + + // Create a hash of the combined string + combined := strings.Join(parts, ",") + hash := sha256.Sum256([]byte(combined)) + + // Return a readable asset ID + return fmt.Sprintf("asset-%x", hash[:8]) +} + +// prepareVMAssets prepares kernel and rootfs assets for the VM in the jailer chroot +// Returns the asset 
mapping for lease acquisition after successful boot +func (c *Client) prepareVMAssets(ctx context.Context, vmID string, config *metaldv1.VmConfig) (*assetMapping, map[string]string, error) { + // Calculate the jailer chroot path + jailerRoot := filepath.Join( + c.jailerConfig.ChrootBaseDir, + "firecracker", + vmID, + "root", + ) + + c.logger.LogAttrs(ctx, slog.LevelInfo, "preparing VM assets using assetmanager", + slog.String("vm_id", vmID), + slog.String("target_path", jailerRoot), + ) + + // Ensure the jailer root directory exists + if err := os.MkdirAll(jailerRoot, 0o755); err != nil { + return nil, nil, fmt.Errorf("failed to create jailer root directory: %w", err) + } + + // Check if assetmanager is available, fallback to static if not + // TODO: implement a check with backoff/deadline + + // Build asset requirements from VM configuration + requiredAssets := c.buildAssetRequirements(config) + c.logger.LogAttrs(ctx, slog.LevelDebug, "determined asset requirements", + slog.String("vm_id", vmID), + slog.Int("required_count", len(requiredAssets)), + ) + + // Query and build assets as needed + allAssets, err := c.queryAndBuildAssets(ctx, vmID, config, requiredAssets) + if err != nil { + return nil, nil, fmt.Errorf("failed to query/build assets: %w", err) + } + + // Match required assets with available ones + assetMapping, err := c.matchAssets(requiredAssets, allAssets) + if err != nil { + c.logger.LogAttrs(ctx, slog.LevelError, "failed to match assets", + slog.String("vm_id", vmID), + slog.String("error", err.Error()), + ) + return nil, nil, fmt.Errorf("asset matching failed: %w", err) + } + + // Prepare assets in target location + preparedPaths, err := c.prepareAssetsInLocation(ctx, vmID, assetMapping, jailerRoot) + if err != nil { + return nil, nil, fmt.Errorf("failed to prepare assets: %w", err) + } + + // Copy metadata files alongside rootfs assets if they exist + if err := c.copyMetadataFilesForAssets(ctx, vmID, config, preparedPaths, jailerRoot); err != nil { 
+ c.logger.WarnContext(ctx, "failed to copy metadata files", + slog.String("vm_id", vmID), + slog.String("error", err.Error()), + ) + // Don't fail asset preparation for metadata issues + } + + return assetMapping, preparedPaths, nil +} + +// isAssetManagerAvailable checks if the asset manager service is available +func (c *Client) isAssetManagerAvailable(ctx context.Context, vmID string) bool { + ctx, checkSpan := c.tracer.Start(ctx, "metald.firecracker.check_assetmanager", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.String("asset.type", "KERNEL"), + ), + ) + _, err := c.assetClient.QueryAssets(ctx, assetv1.AssetType_ASSET_TYPE_KERNEL, nil, nil) + checkSpan.End() + return err == nil +} + +// queryAndBuildAssets queries assetmanager for available assets with automatic build support +func (c *Client) queryAndBuildAssets(ctx context.Context, vmID string, config *metaldv1.VmConfig, requiredAssets []assetRequirement) ([]*assetv1.Asset, error) { + allAssets := []*assetv1.Asset{} + + // Extract tenant_id from VM metadata if available + tenantID := "cli-tenant" // Default tenant for CLI operations + if tid, ok := config.GetMetadata()["tenant_id"]; ok { + tenantID = tid + } + + // Group requirements by type and labels for efficient querying + queryGroups := c.groupAssetRequirements(requiredAssets) + + // Query each unique combination of type and labels + for key, reqs := range queryGroups { + assets, err := c.queryAssetGroup(ctx, vmID, config, key, reqs[0], tenantID) + if err != nil { + return nil, err + } + allAssets = append(allAssets, assets...) 
+ } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "retrieved available assets", + slog.String("vm_id", vmID), + slog.Int("available_count", len(allAssets)), + ) + + // Log asset details for debugging + for _, asset := range allAssets { + c.logger.LogAttrs(ctx, slog.LevelInfo, "available asset", + slog.String("asset_id", asset.GetId()), + slog.String("asset_type", asset.GetType().String()), + slog.Any("labels", asset.GetLabels()), + ) + } + + return allAssets, nil +} + +// groupAssetRequirements groups requirements by type and labels for efficient querying +func (c *Client) groupAssetRequirements(requiredAssets []assetRequirement) map[queryKey][]assetRequirement { + queryGroups := make(map[queryKey][]assetRequirement) + for _, req := range requiredAssets { + // Serialize labels for grouping + labelStr := "" + for k, v := range req.Labels { + if labelStr != "" { + labelStr += "," + } + labelStr += fmt.Sprintf("%s=%s", k, v) + } + key := queryKey{assetType: req.Type, labels: labelStr} + queryGroups[key] = append(queryGroups[key], req) + } + return queryGroups +} + +// queryAssetGroup queries a specific group of assets with the same type and labels +func (c *Client) queryAssetGroup(ctx context.Context, vmID string, config *metaldv1.VmConfig, key queryKey, req assetRequirement, tenantID string) ([]*assetv1.Asset, error) { + labels := req.Labels + + // Generate a deterministic asset ID + assetID := c.generateAssetID(key.assetType, labels) + + c.logger.LogAttrs(ctx, slog.LevelInfo, "generated asset ID for query", + slog.String("asset_id", assetID), + slog.String("asset_type", key.assetType.String()), + slog.Any("labels", labels), + ) + + // Configure build options + buildOptions := c.createBuildOptions(config, labels, tenantID, assetID) + + // Record query initiation + _, initSpan := c.tracer.Start(ctx, "metald.firecracker.query_assets", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.String("asset.type", key.assetType.String()), + 
attribute.StringSlice("asset.labels", formatLabels(labels)), + attribute.String("tenant.id", tenantID), + attribute.Bool("auto_build.enabled", buildOptions.GetEnableAutoBuild()), + attribute.Int("build.timeout_seconds", int(buildOptions.GetBuildTimeoutSeconds())), + ), + ) + initSpan.End() + + // Query assets + resp, queryErr := c.assetClient.QueryAssets(ctx, key.assetType, labels, buildOptions) + if queryErr != nil { + return nil, fmt.Errorf("failed to query assets of type %s with labels %v: %w", + key.assetType.String(), labels, queryErr) + } + + // Record results + _, resultSpan := c.tracer.Start(ctx, "metald.firecracker.query_assets_complete", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.String("asset.type", key.assetType.String()), + attribute.Int("assets.found", len(resp.GetAssets())), + attribute.Int("builds.triggered", len(resp.GetTriggeredBuilds())), + ), + ) + resultSpan.End() + + // Log triggered builds + c.logTriggeredBuilds(ctx, vmID, resp.GetTriggeredBuilds()) + + return resp.GetAssets(), nil +} + +// createBuildOptions creates build options for asset queries +func (c *Client) createBuildOptions(config *metaldv1.VmConfig, labels map[string]string, tenantID, assetID string) *assetv1.BuildOptions { + // Create build labels (copy asset labels and add force_rebuild if needed) + buildLabels := make(map[string]string) + for k, v := range labels { + buildLabels[k] = v + } + + // Check for force_rebuild in VM config metadata + if forceRebuild, ok := config.GetMetadata()["force_rebuild"]; ok && forceRebuild == "true" { + buildLabels["force_rebuild"] = "true" + } + + return &assetv1.BuildOptions{ + EnableAutoBuild: true, + WaitForCompletion: true, // Block VM creation until build completes + BuildTimeoutSeconds: 1800, // 30 minutes maximum wait time + TenantId: tenantID, + SuggestedAssetId: assetID, + BuildLabels: buildLabels, + } +} + +// logTriggeredBuilds logs information about builds that were triggered +func (c *Client) 
logTriggeredBuilds(ctx context.Context, vmID string, builds []*assetv1.BuildInfo) { + for _, build := range builds { + c.logger.LogAttrs(ctx, slog.LevelInfo, "automatic build triggered for missing asset", + slog.String("vm_id", vmID), + slog.String("build_id", build.GetBuildId()), + slog.String("docker_image", build.GetDockerImage()), + slog.String("status", build.GetStatus()), + ) + + if build.GetStatus() == "failed" { + c.logger.LogAttrs(ctx, slog.LevelError, "automatic build failed", + slog.String("vm_id", vmID), + slog.String("build_id", build.GetBuildId()), + slog.String("error", build.GetErrorMessage()), + ) + } + } +} + +// prepareAssetsInLocation prepares assets in the target location +func (c *Client) prepareAssetsInLocation(ctx context.Context, vmID string, assetMapping *assetMapping, jailerRoot string) (map[string]string, error) { + ctx, prepareSpan := c.tracer.Start(ctx, "metald.firecracker.prepare_assets", + trace.WithAttributes( + attribute.String("vm.id", vmID), + attribute.StringSlice("asset.ids", assetMapping.AssetIDs()), + attribute.String("target.path", jailerRoot), + ), + ) + + preparedPaths, err := c.assetClient.PrepareAssets( + ctx, + assetMapping.AssetIDs(), + jailerRoot, + vmID, + ) + + if err != nil { + prepareSpan.RecordError(err) + prepareSpan.SetStatus(codes.Error, err.Error()) + } else { + prepareSpan.SetAttributes( + attribute.Int("assets.prepared", len(preparedPaths)), + ) + } + prepareSpan.End() + + if err == nil { + c.logger.LogAttrs(ctx, slog.LevelInfo, "assets prepared successfully", + slog.String("vm_id", vmID), + slog.Int("asset_count", len(preparedPaths)), + ) + } + + return preparedPaths, err +} + +// formatLabels formats labels for tracing attributes +func formatLabels(labels map[string]string) []string { + var labelPairs []string + for k, v := range labels { + labelPairs = append(labelPairs, fmt.Sprintf("%s=%s", k, v)) + } + return labelPairs +} + +// buildAssetRequirements analyzes VM config to determine required assets 
+func (c *Client) buildAssetRequirements(config *metaldv1.VmConfig) []assetRequirement { + var reqs []assetRequirement + + // // DEBUG: Log VM config for docker image troubleshooting + // c.logger.Info("DEBUG: analyzing VM config for assets", + // "storage_count", len(config.GetStorage()), + // "metadata", config.GetMetadata(), + // ) + // for i, disk := range config.GetStorage() { + // c.logger.Info("DEBUG: storage device", + // "index", i, + // "id", disk.GetId(), + // "path", disk.GetPath(), + // "is_root", disk.GetIsRootDevice(), + // "options", disk.GetOptions(), + // ) + // } + + // // Kernel requirement + // if config.GetBoot() != nil && config.GetBoot().GetKernelPath() != "" { + // reqs = append(reqs, assetRequirement{ + // Type: assetv1.AssetType_ASSET_TYPE_KERNEL, + // Required: true, + // }) + // } + + // // Rootfs requirements from storage devices + // for _, disk := range config.GetStorage() { + // if disk.GetIsRootDevice() { + // labels := make(map[string]string) + // // Check for docker image in disk options first, then config metadata + // if dockerImage, ok := disk.GetOptions()["docker_image"]; ok { + // labels["docker_image"] = dockerImage + // } else if dockerImage, ok := config.GetMetadata()["docker_image"]; ok { + // labels["docker_image"] = dockerImage + // } + + // reqs = append(reqs, assetRequirement{ + // Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, + // Labels: labels, + // Required: true, + // }) + // } + // } + + // // Initrd requirement (optional) + // if config.GetBoot() != nil && config.GetBoot().GetInitrdPath() != "" { + // reqs = append(reqs, assetRequirement{ + // Type: assetv1.AssetType_ASSET_TYPE_INITRD, + // Required: false, + // }) + // } + + return reqs +} + +// matchAssets matches available assets to requirements +func (c *Client) matchAssets(reqs []assetRequirement, availableAssets []*assetv1.Asset) (*assetMapping, error) { + mapping := &assetMapping{ + requirements: reqs, + assets: make(map[string]*assetv1.Asset), + assetIDs: 
[]string{}, + } + + for i, req := range reqs { + var matched *assetv1.Asset + + // Find best matching asset + for _, asset := range availableAssets { + if asset.GetType() != req.Type { + continue + } + + // Check if all required labels match + labelMatch := true + for k, v := range req.Labels { + if assetLabel, ok := asset.GetLabels()[k]; !ok || assetLabel != v { + labelMatch = false + break + } + } + + if labelMatch { + matched = asset + break + } + } + + if matched == nil && req.Required { + // Build helpful error message + labelStr := "" + for k, v := range req.Labels { + if labelStr != "" { + labelStr += ", " + } + labelStr += fmt.Sprintf("%s=%s", k, v) + } + return nil, fmt.Errorf("no matching asset found for type %s with labels {%s}", + req.Type.String(), labelStr) + } + + if matched != nil { + mapping.assets[fmt.Sprintf("%d", i)] = matched + mapping.assetIDs = append(mapping.assetIDs, matched.GetId()) + } + } + + return mapping, nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/asset_static.go b/go/deploy/metald/internal/backend/firecracker/asset_static.go new file mode 100644 index 0000000000..f075d524a3 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/asset_static.go @@ -0,0 +1,101 @@ +package firecracker + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + "os/exec" + "path/filepath" + "strings" + + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" +) + +// copyMetadataForRootDevice copies metadata and creates container.cmd for a root device +func (c *Client) copyMetadataForRootDevice(ctx context.Context, vmID string, disk *metaldv1.StorageDevice, jailerRoot string, diskDst string) error { + baseName := strings.TrimSuffix(filepath.Base(disk.GetPath()), filepath.Ext(disk.GetPath())) + metadataSrc := filepath.Join(filepath.Dir(disk.GetPath()), baseName+".metadata.json") + + // Check if metadata file exists + if _, err := os.Stat(metadataSrc); err != nil { + if os.IsNotExist(err) { + return nil // 
No metadata file is OK + } + return fmt.Errorf("failed to stat metadata file: %w", err) + } + + // Copy metadata file + metadataDst := filepath.Join(jailerRoot, filepath.Base(metadataSrc)) + if err := copyFileWithOwnership(metadataSrc, metadataDst, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { + return fmt.Errorf("failed to copy metadata file: %w", err) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "copied metadata file to jailer root", + slog.String("src", metadataSrc), + slog.String("dst", metadataDst), + ) + + // Load and process metadata to create container.cmd + metadata, err := c.loadContainerMetadata(ctx, disk.GetPath()) + if err != nil || metadata == nil { + return nil // Can't create container.cmd without metadata + } + + // Build the command array + var fullCmd []string + fullCmd = append(fullCmd, metadata.GetEntrypoint()...) + fullCmd = append(fullCmd, metadata.GetCommand()...) + + if len(fullCmd) == 0 { + return nil // No command to write + } + + // Write command file to rootfs by mounting it temporarily + return c.writeContainerCmdToRootfs(ctx, vmID, diskDst, fullCmd) +} + +// writeContainerCmdToRootfs mounts the rootfs and writes the container.cmd file +func (c *Client) writeContainerCmdToRootfs(ctx context.Context, vmID string, diskDst string, fullCmd []string) error { + // Create temporary mount directory + mountDir := filepath.Join("/tmp", fmt.Sprintf("mount-%s", vmID)) + if err := os.MkdirAll(mountDir, 0o755); err != nil { + return fmt.Errorf("failed to create mount directory: %w", err) + } + defer os.RemoveAll(mountDir) + + // Mount the rootfs ext4 image + mountCmd := exec.CommandContext(ctx, "mount", "-o", "loop", diskDst, mountDir) + if err := mountCmd.Run(); err != nil { + return fmt.Errorf("failed to mount rootfs: %w", err) + } + defer func() { + // Always unmount + umountCmd := exec.CommandContext(ctx, "umount", mountDir) + if err := umountCmd.Run(); err != nil { + c.logger.WarnContext(ctx, "failed to unmount rootfs", + 
"error", err, + "mountDir", mountDir, + ) + } + }() + + // Write the command file + cmdFile := filepath.Join(mountDir, "container.cmd") + cmdData, err := json.Marshal(fullCmd) + if err != nil { + return fmt.Errorf("failed to marshal command: %w", err) + } + + if err := os.WriteFile(cmdFile, cmdData, 0o600); err != nil { + return fmt.Errorf("failed to write command file: %w", err) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "wrote container command file to rootfs", + slog.String("path", cmdFile), + slog.String("command", string(cmdData)), + ) + + return nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/automatic_build_test.go b/go/deploy/metald/internal/backend/firecracker/automatic_build_test.go deleted file mode 100644 index effbdc31dc..0000000000 --- a/go/deploy/metald/internal/backend/firecracker/automatic_build_test.go +++ /dev/null @@ -1,327 +0,0 @@ -package firecracker - -import ( - "context" - "os" - "testing" - "time" - - "log/slog" - - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - "github.com/unkeyed/unkey/go/deploy/metald/internal/config" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// mockAssetClient implements assetmanager.Client for testing automatic builds -type mockAssetClient struct { - // Control behavior - triggerBuild bool - buildDelay time.Duration - buildError error - - // Track calls - queryCalls []queryCall - lastQuery *assetv1.QueryAssetsRequest -} - -type queryCall struct { - assetType assetv1.AssetType - labels map[string]string - buildOpts *assetv1.BuildOptions -} - -func (m *mockAssetClient) QueryAssets(ctx context.Context, assetType assetv1.AssetType, labels map[string]string, buildOptions *assetv1.BuildOptions) (*assetv1.QueryAssetsResponse, error) { - m.queryCalls = append(m.queryCalls, queryCall{ - assetType: assetType, - labels: labels, - buildOpts: buildOptions, - }) - - // For initial kernel check, return a kernel asset to indicate assetmanager is 
enabled - if assetType == assetv1.AssetType_ASSET_TYPE_KERNEL && buildOptions == nil { - return &assetv1.QueryAssetsResponse{ - Assets: []*assetv1.Asset{ - { - Id: "kernel-test", - Type: assetv1.AssetType_ASSET_TYPE_KERNEL, - }, - }, - }, nil - } - - // For kernel queries with build options, return a kernel asset - if assetType == assetv1.AssetType_ASSET_TYPE_KERNEL && buildOptions != nil { - return &assetv1.QueryAssetsResponse{ - Assets: []*assetv1.Asset{ - { - Id: "kernel-123", - Name: "vmlinux", - Type: assetv1.AssetType_ASSET_TYPE_KERNEL, - Status: assetv1.AssetStatus_ASSET_STATUS_AVAILABLE, - }, - }, - }, nil - } - - // Simulate no assets initially for rootfs queries - resp := &assetv1.QueryAssetsResponse{ - Assets: []*assetv1.Asset{}, - } - - // If build is triggered and enabled for rootfs - if m.triggerBuild && buildOptions != nil && buildOptions.EnableAutoBuild && assetType == assetv1.AssetType_ASSET_TYPE_ROOTFS { - dockerImage := labels["docker_image"] - - // Create build info - buildInfo := &assetv1.BuildInfo{ - BuildId: "test-build-123", - DockerImage: dockerImage, - Status: "building", - } - - // Simulate build delay - if m.buildDelay > 0 && buildOptions.WaitForCompletion { - select { - case <-time.After(m.buildDelay): - // Build completed - if m.buildError != nil { - buildInfo.Status = "failed" - buildInfo.ErrorMessage = m.buildError.Error() - } else { - buildInfo.Status = "completed" - buildInfo.AssetId = "test-asset-456" - - // Add the built asset to response - resp.Assets = append(resp.Assets, &assetv1.Asset{ - Id: "test-asset-456", - Name: "rootfs-" + dockerImage, - Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, - Status: assetv1.AssetStatus_ASSET_STATUS_AVAILABLE, - Labels: labels, - }) - } - case <-ctx.Done(): - buildInfo.Status = "failed" - buildInfo.ErrorMessage = "context cancelled" - } - } - - resp.TriggeredBuilds = append(resp.TriggeredBuilds, buildInfo) - } - - return resp, nil -} - -func (m *mockAssetClient) ListAssets(ctx context.Context, 
assetType assetv1.AssetType, labels map[string]string) ([]*assetv1.Asset, error) { - // Not used in this test - return []*assetv1.Asset{}, nil -} - -func (m *mockAssetClient) PrepareAssets(ctx context.Context, assetIDs []string, targetPath string, vmID string) (map[string]string, error) { - // Return mock paths - paths := make(map[string]string) - for _, id := range assetIDs { - paths[id] = targetPath + "/asset-" + id - } - return paths, nil -} - -func (m *mockAssetClient) AcquireAsset(ctx context.Context, assetID string, vmID string) (string, error) { - return "lease-" + assetID, nil -} - -func (m *mockAssetClient) ReleaseAsset(ctx context.Context, leaseID string) error { - return nil -} - -// TestAutomaticAssetBuilding tests the automatic build flow -func TestAutomaticAssetBuilding(t *testing.T) { - // AIDEV-NOTE: This test verifies the complete automatic build flow: - // 1. VM requests rootfs with docker_image label - // 2. Asset doesn't exist, so QueryAssets triggers a build - // 3. Build completes and asset is registered - // 4. 
VM uses the newly built asset - - tests := []struct { - name string - dockerImage string - tenantID string - triggerBuild bool - buildDelay time.Duration - buildError error - expectError bool - expectBuild bool - }{ - { - name: "successful automatic build", - dockerImage: "alpine:latest", - tenantID: "test-tenant", - triggerBuild: true, - buildDelay: 100 * time.Millisecond, - expectBuild: true, - }, - { - name: "build failure", - dockerImage: "invalid:image", - tenantID: "test-tenant", - triggerBuild: true, - buildDelay: 100 * time.Millisecond, - buildError: context.DeadlineExceeded, - expectError: true, - expectBuild: true, - }, - { - name: "no automatic build when disabled", - dockerImage: "alpine:latest", - tenantID: "test-tenant", - triggerBuild: false, - expectError: true, // Should fail due to missing asset - expectBuild: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create mock asset client - mockClient := &mockAssetClient{ - triggerBuild: tt.triggerBuild, - buildDelay: tt.buildDelay, - buildError: tt.buildError, - } - - // Create SDK client with mock - logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug})) - client := &SDKClientV4{ - assetClient: mockClient, - logger: logger, - jailerConfig: &config.JailerConfig{ - ChrootBaseDir: "/tmp/test-jailer", - UID: 1000, - GID: 1000, - }, - } - - // Create VM config with docker_image metadata - vmConfig := &metaldv1.VmConfig{ - Boot: &metaldv1.BootConfig{ - KernelPath: "/test/kernel", - }, - Storage: []*metaldv1.StorageDevice{ - { - Path: "", // Should be populated by asset - ReadOnly: false, - IsRootDevice: true, - }, - }, - Metadata: map[string]string{ - "docker_image": tt.dockerImage, - "tenant_id": tt.tenantID, - }, - } - - // Test prepareVMAssets which triggers the automatic build - ctx := context.Background() - assetMapping, paths, err := client.prepareVMAssets(ctx, "test-vm-123", vmConfig) - - // Check error expectation - if 
tt.expectError { - if err == nil { - t.Errorf("expected error but got none") - } - } else { - if err != nil { - t.Errorf("unexpected error: %v", err) - } - } - - // Verify QueryAssets was called with correct parameters - if len(mockClient.queryCalls) == 0 { - t.Fatal("QueryAssets was not called") - } - - lastCall := mockClient.queryCalls[len(mockClient.queryCalls)-1] - - // Check asset type - if lastCall.assetType != assetv1.AssetType_ASSET_TYPE_ROOTFS { - t.Errorf("expected ASSET_TYPE_ROOTFS, got %v", lastCall.assetType) - } - - // Check docker_image label - if lastCall.labels["docker_image"] != tt.dockerImage { - t.Errorf("expected docker_image=%s, got %s", tt.dockerImage, lastCall.labels["docker_image"]) - } - - // Check build options - if lastCall.buildOpts == nil { - t.Fatal("build options were not provided") - } - - if !lastCall.buildOpts.EnableAutoBuild { - t.Error("expected EnableAutoBuild to be true") - } - - if !lastCall.buildOpts.WaitForCompletion { - t.Error("expected WaitForCompletion to be true") - } - - if lastCall.buildOpts.TenantId != tt.tenantID { - t.Errorf("expected tenant_id=%s, got %s", tt.tenantID, lastCall.buildOpts.TenantId) - } - - // If successful, verify asset mapping - if !tt.expectError && assetMapping != nil { - if len(assetMapping.assets) == 0 { - t.Error("expected assets in mapping but got none") - } - - if len(paths) == 0 { - t.Error("expected prepared paths but got none") - } - } - }) - } -} - -// TestAutomaticBuildTimeout tests build timeout handling -func TestAutomaticBuildTimeout(t *testing.T) { - // Create mock that simulates a long build - mockClient := &mockAssetClient{ - triggerBuild: true, - buildDelay: 5 * time.Second, // Longer than our context timeout - } - - logger := slog.New(slog.NewTextHandler(os.Stderr, nil)) - client := &SDKClientV4{ - assetClient: mockClient, - logger: logger, - jailerConfig: &config.JailerConfig{ - ChrootBaseDir: "/tmp/test-jailer", - UID: 1000, - GID: 1000, - }, - } - - vmConfig := 
&metaldv1.VmConfig{ - Boot: &metaldv1.BootConfig{ - KernelPath: "/test/kernel", - }, - Storage: []*metaldv1.StorageDevice{ - {IsRootDevice: true}, - }, - Metadata: map[string]string{ - "docker_image": "slow:build", - }, - } - - // Use a short timeout context - ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) - defer cancel() - - // This should timeout - _, _, err := client.prepareVMAssets(ctx, "test-vm-timeout", vmConfig) - - if err == nil { - t.Error("expected timeout error but got none") - } -} diff --git a/go/deploy/metald/internal/backend/firecracker/boot.go b/go/deploy/metald/internal/backend/firecracker/boot.go new file mode 100644 index 0000000000..c6baef6064 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/boot.go @@ -0,0 +1,165 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + "path/filepath" + + sdk "github.com/firecracker-microvm/firecracker-go-sdk" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// BootVM starts a created VM using our integrated jailer +func (c *Client) BootVM(ctx context.Context, vmID string) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.boot_vm", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("operation", "boot"), + attribute.String("error", "vm_not_found"), + )) + return err + } + + // Validate VM state before boot operation + // TODO: This should also boot stopped/paused VMs at some point + if vm.State != metaldv1.VmState_VM_STATE_CREATED { + err := fmt.Errorf("vm %s is in %s state, can only boot VMs in CREATED state", vmID, 
vm.State.String()) + span.RecordError(err) + c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("operation", "boot"), + attribute.String("error", "invalid_state_transition"), + attribute.String("current_state", vm.State.String()), + )) + return err + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "booting VM", + slog.String("vm_id", vmID), + slog.String("current_state", vm.State.String()), + ) + + // Load VM metadata + metadata, err := c.prepareVMBootMetadata(ctx, vmID, vm) + if err != nil { + c.logger.WarnContext(ctx, "failed to prepare boot metadata", + "vm_id", vmID, + "error", err, + ) + // Continue without metadata rather than failing the boot + } + + // Build and configure firecracker + fcConfig := c.configureFirecrackerForBoot(ctx, vmID, vm, metadata) + + // Create a context for this VM + vmCtx, cancel := context.WithCancel(context.Background()) + vm.CancelFunc = cancel + + // Create and start the machine using SDK + machine, err := c.startFirecrackerMachine(vmCtx, fcConfig) + if err != nil { + cancel() + span.RecordError(err) + c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("operation", "boot"), + attribute.String("error", "start_machine"), + )) + return err + } + + // Update VM state + vm.Machine = machine + vm.State = metaldv1.VmState_VM_STATE_RUNNING + + // Acquire asset leases after successful boot + c.acquireAssetLeases(ctx, vmID, vm.AssetMapping) + + c.vmBootCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("status", "success"), + )) + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM booted successfully", + slog.String("vm_id", vmID), + ) + + return nil +} + +// prepareVMBootMetadata loads container metadata and prepares port mappings for VM boot +func (c *Client) prepareVMBootMetadata(ctx context.Context, vmID string, vm *VM) (*builderv1.ImageMetadata, error) { + var metadata *builderv1.ImageMetadata + + disk := vm.Config.GetStorage() + if disk.GetIsRootDevice() { + // Use chroot path for metadata 
loading since assets are copied there + jailerRoot := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID, "root") + chrootRootfsPath := filepath.Join(jailerRoot, "rootfs.ext4") + + m, metadataErr := c.loadContainerMetadata(ctx, chrootRootfsPath) + if metadataErr != nil { + return nil, fmt.Errorf("failed to load container metadata: %w", metadataErr) + } + + if m != nil { + metadata = m + + // Create /container.cmd file for metald-init + if cmdFileErr := c.createContainerCmdFile(ctx, vmID, metadata); cmdFileErr != nil { + return nil, fmt.Errorf("failed to create container.cmd file: %w", cmdFileErr) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "loaded metadata for VM boot", + slog.String("vm_id", vmID), + slog.String("metadata", metadata.String()), + ) + } + } + + return metadata, nil +} + +// configureFirecrackerForBoot builds and configures firecracker for VM boot +func (c *Client) configureFirecrackerForBoot(ctx context.Context, vmID string, vm *VM, metadata *builderv1.ImageMetadata) sdk.Config { + vmDir := filepath.Join(c.baseDir, vmID) + socketPath := filepath.Join(vmDir, "firecracker.sock") + + // Build firecracker config + fcConfig := c.buildFirecrackerConfig(ctx, vmID, vm.Config, vm.NetworkInfo, vm.AssetPaths) + fcConfig.SocketPath = socketPath + + // Update kernel args with network configuration and metadata if available + fcConfig.KernelArgs = c.BuildKernelArgs(ctx, vm.NetworkInfo, metadata) + + // Set the network namespace for the SDK to use + if vm.NetworkInfo != nil && vm.NetworkInfo.Namespace != "" { + fcConfig.NetNS = filepath.Join("/run/netns", vm.NetworkInfo.Namespace) + } + + return fcConfig +} + +// startFirecrackerMachine creates and starts the firecracker machine +func (c *Client) startFirecrackerMachine(ctx context.Context, fcConfig sdk.Config) (*sdk.Machine, error) { + machine, err := sdk.NewMachine(ctx, fcConfig) + if err != nil { + return nil, fmt.Errorf("failed to create firecracker machine: %w", err) + } + + if err := 
machine.Start(ctx); err != nil { + return nil, fmt.Errorf("failed to start firecracker machine: %w", err) + } + + return machine, nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/client.go b/go/deploy/metald/internal/backend/firecracker/client.go new file mode 100644 index 0000000000..bdeea7cf74 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/client.go @@ -0,0 +1,93 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + + "github.com/unkeyed/unkey/go/deploy/metald/internal/assetmanager" + "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" + "github.com/unkeyed/unkey/go/deploy/metald/internal/config" + "github.com/unkeyed/unkey/go/deploy/metald/internal/jailer" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// NewClient creates a new SDK-based Firecracker backend client with integrated jailer +func NewClient(logger *slog.Logger, + assetClient assetmanager.Client, + jailerConfig *config.JailerConfig, + baseDir string, +) (*Client, error) { + tracer := otel.Tracer("metald.firecracker") + meter := otel.Meter("metald.firecracker") + + vmCreateCounter, err := meter.Int64Counter("vm_create_total", + metric.WithDescription("Total number of VM create operations"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create vm_create counter: %w", err) + } + + vmDeleteCounter, err := meter.Int64Counter("vm_delete_total", + metric.WithDescription("Total number of VM delete operations"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create vm_delete counter: %w", err) + } + + vmBootCounter, err := meter.Int64Counter("vm_boot_total", + metric.WithDescription("Total number of VM boot operations"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create vm_boot counter: %w", err) + } + + vmErrorCounter, err := meter.Int64Counter("vm_error_total", + metric.WithDescription("Total number of VM 
operation errors"), + metric.WithUnit("1"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create vm_error counter: %w", err) + } + + // Create integrated jailer + integratedJailer := jailer.NewJailer(logger, jailerConfig) + + return &Client{ + logger: logger.With("backend", "firecracker"), + assetClient: assetClient, + vmAssetLeases: make(map[string][]string), + jailer: integratedJailer, + jailerConfig: jailerConfig, + baseDir: baseDir, + tracer: tracer, + meter: meter, + vmCreateCounter: vmCreateCounter, + vmDeleteCounter: vmDeleteCounter, + vmBootCounter: vmBootCounter, + vmErrorCounter: vmErrorCounter, + }, nil +} + +// Ping verifies the backend is operational +func (c *Client) Ping(ctx context.Context) error { + c.logger.DebugContext(ctx, "pinging firecracker backend") + return nil +} + +// Shutdown gracefully shuts down the SDK client while preserving VMs +func (c *Client) Shutdown(ctx context.Context) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.shutdown") + defer span.End() + + c.logger.InfoContext(ctx, "shutting down firecracker backend") + + return nil +} + +// Ensure Client implements Backend interface +var _ types.Backend = (*Client)(nil) diff --git a/go/deploy/metald/internal/backend/firecracker/config_builder.go b/go/deploy/metald/internal/backend/firecracker/config_builder.go new file mode 100644 index 0000000000..ed3bcc98d9 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/config_builder.go @@ -0,0 +1,160 @@ +package firecracker + +import ( + "context" + "log/slog" + "os" + "path/filepath" + + sdk "github.com/firecracker-microvm/firecracker-go-sdk" + "github.com/firecracker-microvm/firecracker-go-sdk/client/models" + "github.com/unkeyed/unkey/go/deploy/metald/internal/network" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "golang.org/x/sys/unix" +) + +// buildFirecrackerConfig builds the SDK configuration without jailer +func (c *Client) buildFirecrackerConfig(ctx context.Context, vmID 
string, config *metaldv1.VmConfig, networkInfo *network.VMNetwork, preparedPaths map[string]string) sdk.Config { + // For integrated jailer, we use absolute paths since we're not running inside chroot + // The assets are still in the jailer directory structure for consistency + jailerRoot := filepath.Join( + c.jailerConfig.ChrootBaseDir, + "firecracker", + vmID, + "root", + ) + + socketPath := "/firecracker.sock" + + // Determine kernel path - use prepared path if available, otherwise fallback to default + kernelPath := filepath.Join(jailerRoot, "vmlinux") + if len(preparedPaths) > 0 { + // In a more sophisticated implementation, we'd track which asset ID + // corresponds to which component (kernel vs rootfs). For now, we rely on the + // assetmanager preparing files with standard names in the target directory. + c.logger.LogAttrs(ctx, slog.LevelDebug, "using prepared asset paths", + slog.String("vm_id", vmID), + slog.Int("path_count", len(preparedPaths)), + ) + } + + // Setup metrics FIFO for billaged + metricsPath := c.setupMetricsFIFO(ctx, vmID, jailerRoot) + + // Setup console logging + consoleLogPath := filepath.Join(jailerRoot, "console.log") + consoleFifoPath := filepath.Join(jailerRoot, "console.fifo") + + // Use the kernel args as provided by the caller + // Metadata handling is now done in BootVM + kernelArgs := config.GetBoot() + + // Build the configuration + cfg := c.buildSDKConfig( + socketPath, + consoleLogPath, + consoleFifoPath, + metricsPath, + kernelPath, + kernelArgs, + config, + jailerRoot, + ) + + // Add network interface + if networkInfo != nil { + c.addNetworkInterfaceToConfig(&cfg, networkInfo) + } + + return cfg +} + +// setupMetricsFIFO creates the metrics FIFO for billaged to read Firecracker stats +func (c *Client) setupMetricsFIFO(ctx context.Context, vmID string, jailerRoot string) string { + metricsPath := filepath.Join(jailerRoot, "metrics.fifo") + hostMetricsPath := filepath.Join(jailerRoot, "metrics.fifo") + + // Create the 
metrics FIFO in the host filesystem + if err := unix.Mkfifo(hostMetricsPath, 0o644); err != nil && !os.IsExist(err) { + c.logger.ErrorContext(ctx, "failed to create metrics FIFO", + slog.String("vm_id", vmID), + slog.String("path", hostMetricsPath), + slog.String("error", err.Error()), + ) + } else { + c.logger.InfoContext(ctx, "created metrics FIFO for billaged", + slog.String("vm_id", vmID), + slog.String("host_path", hostMetricsPath), + slog.String("chroot_path", metricsPath), + ) + } + + return metricsPath +} + +// buildSDKConfig builds the base SDK configuration +func (c *Client) buildSDKConfig( + socketPath string, + consoleLogPath string, + consoleFifoPath string, + metricsPath string, + kernelPath string, + kernelArgs string, + config *metaldv1.VmConfig, + jailerRoot string, +) sdk.Config { + // Create the console log file to capture guest output + consoleLogFile, err := os.OpenFile(consoleLogPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) + + var cfg sdk.Config + if err != nil { + // Fall back to LogPath only if console log file creation fails + c.logger.Warn("failed to create console log file, falling back to LogPath only", + slog.String("error", err.Error()), + slog.String("console_log_path", consoleLogPath), + ) + cfg = sdk.Config{ + SocketPath: socketPath, + LogPath: consoleLogPath, // Captures Firecracker logs only + LogLevel: "Debug", + MetricsPath: metricsPath, + KernelImagePath: kernelPath, + KernelArgs: kernelArgs, + MachineCfg: models.MachineConfiguration{ + VcpuCount: sdk.Int64(int64(config.GetVcpuCount())), + MemSizeMib: sdk.Int64(536870912), + Smt: sdk.Bool(false), + }, + } + } else { + // Successful case - capture guest console output via FIFO + cfg = sdk.Config{ + SocketPath: socketPath, + LogPath: filepath.Join(jailerRoot, "firecracker.log"), // Firecracker's own logs + LogFifo: consoleFifoPath, // FIFO for guest console output + FifoLogWriter: consoleLogFile, // Writer to capture guest console to file + LogLevel: "Debug", + MetricsPath: 
metricsPath, + KernelImagePath: kernelPath, + KernelArgs: kernelArgs, + MachineCfg: models.MachineConfiguration{ + VcpuCount: sdk.Int64(int64(config.GetVcpuCount())), + MemSizeMib: sdk.Int64(536870912), + Smt: sdk.Bool(false), + }, + } + } + + return cfg +} + +// addNetworkInterfaceToConfig adds network interface to the Firecracker configuration +func (c *Client) addNetworkInterfaceToConfig(cfg *sdk.Config, networkInfo *network.VMNetwork) { + iface := sdk.NetworkInterface{ + StaticConfiguration: &sdk.StaticNetworkConfiguration{ + HostDevName: networkInfo.TapDevice, + MacAddress: networkInfo.MacAddress, + }, + } + cfg.NetworkInterfaces = []sdk.NetworkInterface{iface} +} diff --git a/go/deploy/metald/internal/backend/firecracker/create.go b/go/deploy/metald/internal/backend/firecracker/create.go new file mode 100644 index 0000000000..2c0273db99 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/create.go @@ -0,0 +1,60 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// CreateVM creates a new VM using the SDK with integrated jailer +func (c *Client) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.create_vm", + trace.WithAttributes( + attribute.Int("vcpus", int(config.GetVcpuCount())), + attribute.Int64("memory_bytes", int64(config.GetMemorySizeMib())), + ), + ) + defer span.End() + + c.logger.LogAttrs(ctx, slog.LevelInfo, "creating VM", + slog.String("vm_id", config.GetId()), + slog.Int("vcpus", int(config.GetVcpuCount())), + slog.Int64("memory_bytes", int64(config.GetMemorySizeMib())), + ) + + // Create VM directory + vmDir := filepath.Join(c.baseDir, config.GetId()) + if err := os.MkdirAll(vmDir, 0o755); err != nil { + return "", fmt.Errorf("failed to 
create VM directory: %w", err) + } + + c.logger.DebugContext(ctx, "created VM directory", + slog.String("directory", vmDir), + ) + + // Register the VM + vm := &VM{ + ID: config.GetId(), + Config: config, + State: metaldv1.VmState_VM_STATE_CREATED, + Machine: nil, // Will be set when we boot + CancelFunc: nil, // Will be set when we boot + } + + c.vmCreateCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("status", "success"), + )) + + c.logger.LogAttrs(ctx, slog.LevelInfo, "vm created", + slog.String("vm_id", vm.ID), + ) + + return vm.ID, nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/delete.go b/go/deploy/metald/internal/backend/firecracker/delete.go new file mode 100644 index 0000000000..0882409d21 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/delete.go @@ -0,0 +1,87 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// DeleteVM deletes a VM and cleans up all associated resources +func (c *Client) DeleteVM(ctx context.Context, vmID string) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.delete_vm", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + c.logger.LogAttrs(ctx, slog.LevelInfo, "deleting VM", + slog.String("vm_id", vmID), + ) + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("operation", "delete"), + attribute.String("error", "vm_not_found"), + )) + return err + } + + // Stop the VM if it's running + if vm.Machine != nil { + if err := vm.Machine.StopVMM(); err != nil { + c.logger.WarnContext(ctx, "failed to stop VMM during delete", + "vm_id", vmID, + "error", err, + ) + } + + // Cancel the VM context + if vm.CancelFunc != nil { + vm.CancelFunc() + } + } + + 
// Clean up VM directory + vmDir := filepath.Join(c.baseDir, vmID) + if err := os.RemoveAll(vmDir); err != nil { + c.logger.WarnContext(ctx, "failed to remove VM directory", + "vm_id", vmID, + "path", vmDir, + "error", err, + ) + } + + // Clean up jailer chroot + chrootPath := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID) + if err := os.RemoveAll(chrootPath); err != nil { + c.logger.WarnContext(ctx, "failed to remove jailer chroot", + "vm_id", vmID, + "path", chrootPath, + "error", err, + ) + } + + // Release asset leases + c.releaseAssetLeases(ctx, vmID) + + // Remove from registry + delete(c.vmRegistry, vmID) + + c.vmDeleteCounter.Add(ctx, 1, metric.WithAttributes( + attribute.String("status", "success"), + )) + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM deleted successfully", + slog.String("vm_id", vmID), + ) + + return nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/kernel_args.go b/go/deploy/metald/internal/backend/firecracker/kernel_args.go new file mode 100644 index 0000000000..6d30ef7538 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/kernel_args.go @@ -0,0 +1,71 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + "strings" + + "github.com/unkeyed/unkey/go/deploy/metald/internal/network" + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" +) + +const ( + // Static kernel parameters that never change for Firecracker VMs + // AIDEV-BUSINESS_RULE: metald-init is always our init process + staticKernelArgs = "console=ttyS0 reboot=k panic=1 pci=off init=/usr/bin/metald-init root=/dev/vda rw" + + // Debug parameters - only add when explicitly enabled + debugKernelArgs = "loglevel=8 earlyprintk=serial,ttyS0,115200 debug ignore_loglevel printk.devkmsg=on" +) + +// BuildKernelArgs generates kernel arguments for VM boot +func (c *Client) BuildKernelArgs(ctx context.Context, networkInfo *network.VMNetwork, metadata *builderv1.ImageMetadata) string { + args := 
[]string{staticKernelArgs} + + // Format: ip=G::T:GM::GI:off + // G = Guest IP, T = TAP IP, GM = Guest Mask, GI = Guest Interface + ipArg := fmt.Sprintf("ip=%s::%s:%s:%s:off", + networkInfo.IPAddress, + networkInfo.Gateway, + networkInfo.Netmask, + "eth0", + ) + + args = append(args, ipArg) + + // Add container metadata if available (metald-init will use these) + if metadata != nil { + // Add container environment variables (if needed) + for key, value := range metadata.GetEnv() { + // Skip PATH and anything with spaces to avoid kernel cmdline parsing issues + if key == "PATH" || strings.Contains(key, " ") || strings.Contains(value, " ") { + continue + } + args = append(args, fmt.Sprintf("env.%s=%s", key, value)) + } + + // Add working directory if specified + if workdir := metadata.GetWorkingDir(); workdir != "" { + args = append(args, fmt.Sprintf("workdir=%s", workdir)) + } + } + + finalArgs := strings.Join(args, " ") + + c.logger.LogAttrs(ctx, slog.LevelDebug, "built kernel args", + slog.String("vm_id", getVMID(networkInfo)), + slog.Bool("has_network", networkInfo != nil), + slog.Bool("has_metadata", metadata != nil), + slog.String("args", finalArgs), + ) + + return finalArgs +} + +func getVMID(networkInfo *network.VMNetwork) string { + if networkInfo != nil { + return networkInfo.VMID + } + return "unknown" +} diff --git a/go/deploy/metald/internal/backend/firecracker/metadata.go b/go/deploy/metald/internal/backend/firecracker/metadata.go new file mode 100644 index 0000000000..13e45fe152 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/metadata.go @@ -0,0 +1,233 @@ +package firecracker + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + + builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" +) + +// loadContainerMetadata loads container metadata from the metadata file if it exists +func (c *Client) 
loadContainerMetadata(ctx context.Context, rootfsPath string) (*builderv1.ImageMetadata, error) { + // Load container metadata saved by builderd + // The metadata file is named {buildID}.metadata.json and should be alongside the rootfs + + // Extract base name without extension + baseName := strings.TrimSuffix(filepath.Base(rootfsPath), filepath.Ext(rootfsPath)) + metadataPath := filepath.Join(filepath.Dir(rootfsPath), baseName+".metadata.json") + + c.logger.LogAttrs(ctx, slog.LevelInfo, "looking for container metadata", + slog.String("rootfs_path", rootfsPath), + slog.String("metadata_path", metadataPath), + ) + + // Check if metadata file exists + if _, err := os.Stat(metadataPath); os.IsNotExist(err) { + // Fallback to check for metadata.json in VM chroot directory + // When assets are copied to VM chroot by assetmanagerd, metadata file is renamed to metadata.json + fallbackPath := filepath.Join(filepath.Dir(rootfsPath), "metadata.json") + if _, err := os.Stat(fallbackPath); os.IsNotExist(err) { + c.logger.LogAttrs(ctx, slog.LevelDebug, "no metadata file found in either location", + slog.String("primary_path", metadataPath), + slog.String("fallback_path", fallbackPath), + ) + return nil, nil // No metadata is not an error + } + // Use fallback path + metadataPath = fallbackPath + c.logger.LogAttrs(ctx, slog.LevelInfo, "using fallback metadata path", + slog.String("fallback_path", fallbackPath), + ) + } + + // Read metadata file + data, err := os.ReadFile(metadataPath) + if err != nil { + return nil, fmt.Errorf("failed to read metadata file: %w", err) + } + + // Parse metadata + var metadata builderv1.ImageMetadata + if err := json.Unmarshal(data, &metadata); err != nil { + return nil, fmt.Errorf("failed to parse metadata: %w", err) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "loaded container metadata", + slog.String("image", metadata.GetOriginalImage()), + slog.Int("entrypoint_len", len(metadata.GetEntrypoint())), + slog.Int("cmd_len", 
len(metadata.GetCommand())), + slog.Int("env_vars", len(metadata.GetEnv())), + slog.Int("exposed_ports", len(metadata.GetExposedPorts())), + ) + + return &metadata, nil +} + +// createContainerCmdFile creates /container.cmd file in VM chroot for metald-init +func (c *Client) createContainerCmdFile(ctx context.Context, vmID string, metadata *builderv1.ImageMetadata) error { + // Create container.cmd file containing the full command for metald-init + // Combines entrypoint and command from container metadata into JSON array + + if metadata == nil { + return fmt.Errorf("metadata is required") + } + + // Build full command array: entrypoint + command + var fullCmd []string + fullCmd = append(fullCmd, metadata.GetEntrypoint()...) + fullCmd = append(fullCmd, metadata.GetCommand()...) + + if len(fullCmd) == 0 { + return fmt.Errorf("no entrypoint or command found in metadata") + } + + // Convert to JSON + cmdJSON, err := json.Marshal(fullCmd) + if err != nil { + return fmt.Errorf("failed to marshal command to JSON: %w", err) + } + + // Write container.cmd into the rootfs.ext4 filesystem, not just chroot + // Mount the rootfs.ext4 temporarily to inject the container.cmd file + jailerRoot := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID, "root") + rootfsPath := filepath.Join(jailerRoot, "rootfs.ext4") + + // Create temporary mount point + tmpMount := filepath.Join("/tmp", "rootfs-mount-"+vmID) + if err := os.MkdirAll(tmpMount, 0o755); err != nil { + return fmt.Errorf("failed to create temp mount dir: %w", err) + } + defer os.RemoveAll(tmpMount) + + // Mount the rootfs.ext4 + mountCmd := exec.Command("mount", "-o", "loop", rootfsPath, tmpMount) + if err := mountCmd.Run(); err != nil { + return fmt.Errorf("failed to mount rootfs: %w", err) + } + defer func() { + umountCmd := exec.Command("umount", tmpMount) + umountCmd.Run() + }() + + // Write container.cmd into the mounted filesystem + containerCmdPath := filepath.Join(tmpMount, "container.cmd") + if err := 
os.WriteFile(containerCmdPath, cmdJSON, 0o600); err != nil { + return fmt.Errorf("failed to write container.cmd to rootfs: %w", err) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "created container.cmd file", + slog.String("vm_id", vmID), + slog.String("path", containerCmdPath), + slog.String("command", string(cmdJSON)), + ) + + return nil +} + +// copyMetadataFilesForAssets copies metadata files alongside rootfs assets when using asset manager +func (c *Client) copyMetadataFilesForAssets(ctx context.Context, vmID string, config *metaldv1.VmConfig, preparedPaths map[string]string, jailerRoot string) error { + // When using asset manager, only rootfs files are copied, but we need metadata files too + // This function finds the original metadata files and copies them to the jailer root + + disk := config.GetStorage() + if !disk.GetIsRootDevice() || disk.GetPath() == "" { + } + + // Find the original rootfs path before asset preparation + originalRootfsPath := disk.GetPath() + + // Check if this disk was replaced by an asset + var preparedRootfsPath string + for _, path := range preparedPaths { + if strings.HasSuffix(path, ".ext4") || strings.HasSuffix(path, ".img") { + preparedRootfsPath = path + break + } + } + + if preparedRootfsPath == "" { + // No rootfs asset found, skip metadata copying + } + + // Look for metadata file alongside the original rootfs + originalDir := filepath.Dir(originalRootfsPath) + originalBaseName := strings.TrimSuffix(filepath.Base(originalRootfsPath), filepath.Ext(originalRootfsPath)) + metadataSrcPath := filepath.Join(originalDir, originalBaseName+".metadata.json") + + // Check if metadata file exists + if _, err := os.Stat(metadataSrcPath); os.IsNotExist(err) { + c.logger.LogAttrs(ctx, slog.LevelDebug, "no metadata file found for asset", + slog.String("vm_id", vmID), + slog.String("original_rootfs", originalRootfsPath), + slog.String("expected_metadata", metadataSrcPath), + ) + } + + // Copy metadata file to jailer root with the same 
base name as the prepared rootfs + preparedBaseName := strings.TrimSuffix(filepath.Base(preparedRootfsPath), filepath.Ext(preparedRootfsPath)) + metadataDstPath := filepath.Join(jailerRoot, preparedBaseName+".metadata.json") + + if err := copyFileWithOwnership(metadataSrcPath, metadataDstPath, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { + c.logger.WarnContext(ctx, "failed to copy metadata file", + slog.String("vm_id", vmID), + slog.String("src", metadataSrcPath), + slog.String("dst", metadataDstPath), + slog.String("error", err.Error()), + ) + return fmt.Errorf("failed to copy metadata file %s: %w", metadataSrcPath, err) + } + + c.logger.InfoContext(ctx, "copied metadata file for asset", + slog.String("vm_id", vmID), + slog.String("src", metadataSrcPath), + slog.String("dst", metadataDstPath), + ) + + return nil +} + +// copyFileWithOwnership copies files with ownership +func copyFileWithOwnership(src, dst string, uid, gid int) error { + // Use cp command to handle large files efficiently + cmd := exec.Command("cp", "-f", src, dst) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("cp command failed: %w, output: %s", err, output) + } + + // Set permissions + if err := os.Chmod(dst, 0o644); err != nil { + return fmt.Errorf("failed to set permissions on %s: %w", dst, err) + } + + // Set ownership + if err := os.Chown(dst, uid, gid); err != nil { + // Log but don't fail - might work anyway + return nil + } + + return nil +} + +// validateIPAddress validates an IP address to prevent command injection +func validateIPAddress(ip string) error { + if net.ParseIP(ip) == nil { + return fmt.Errorf("invalid IP address: %s", ip) + } + return nil +} + +// validatePortNumber validates a port number to prevent command injection +func validatePortNumber(port int) error { + if port < 1 || port > 65535 { + return fmt.Errorf("invalid port number: %d, must be between 1-65535", port) + } + return nil +} diff --git 
a/go/deploy/metald/internal/backend/firecracker/metrics.go b/go/deploy/metald/internal/backend/firecracker/metrics.go new file mode 100644 index 0000000000..cc2b103fb1 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/metrics.go @@ -0,0 +1,149 @@ +package firecracker + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + + "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" +) + +// GetVMMetrics retrieves metrics for a specific VM +func (c *Client) GetVMMetrics(ctx context.Context, vmID string) (*types.VMMetrics, error) { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.get_vm_metrics", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + c.logger.LogAttrs(ctx, slog.LevelDebug, "retrieving VM metrics", + slog.String("vm_id", vmID), + ) + + // Get the VM from registry + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + return nil, err + } + + // Check if VM has a machine instance + if vm.Machine == nil { + err := fmt.Errorf("vm %s has no firecracker process", vmID) + span.RecordError(err) + return nil, err + } + + // Calculate the jailer root path + jailerRoot := filepath.Join( + c.jailerConfig.ChrootBaseDir, + "firecracker", + vmID, + "root", + ) + + // Read metrics from the FIFO + metricsPath := filepath.Join(jailerRoot, "metrics.fifo") + metrics, err := c.readFirecrackerMetrics(ctx, metricsPath) + if err != nil { + span.RecordError(err) + return nil, fmt.Errorf("failed to read metrics: %w", err) + } + + return metrics, nil +} + +// readFirecrackerMetrics reads metrics from Firecracker's metrics FIFO +func (c *Client) readFirecrackerMetrics(ctx context.Context, metricsPath string) (*types.VMMetrics, error) { + c.logger.LogAttrs(ctx, slog.LevelDebug, "reading firecracker metrics", + slog.String("metrics_path", 
metricsPath), + ) + + // Open the metrics FIFO (non-blocking read) + file, err := os.OpenFile(metricsPath, os.O_RDONLY, 0) + if err != nil { + return nil, fmt.Errorf("failed to open metrics FIFO: %w", err) + } + defer file.Close() + + // Read all available data from the FIFO + data, err := io.ReadAll(file) + if err != nil && err != io.EOF { + return nil, fmt.Errorf("failed to read metrics data: %w", err) + } + + // If no data available, return empty metrics + if len(data) == 0 { + c.logger.LogAttrs(ctx, slog.LevelDebug, "no metrics data available", + slog.String("metrics_path", metricsPath), + ) + return &types.VMMetrics{}, nil + } + + // Parse the JSON metrics + var rawMetrics map[string]interface{} + if err := json.Unmarshal(data, &rawMetrics); err != nil { + return nil, fmt.Errorf("failed to parse metrics JSON: %w", err) + } + + // Convert raw metrics to structured format + metrics := c.parseRawMetrics(rawMetrics) + + c.logger.LogAttrs(ctx, slog.LevelDebug, "successfully read VM metrics", + slog.Int64("cpu_time_ns", metrics.CpuTimeNanos), + slog.Int64("memory_usage_bytes", metrics.MemoryUsageBytes), + slog.Int64("disk_read_bytes", metrics.DiskReadBytes), + slog.Int64("disk_write_bytes", metrics.DiskWriteBytes), + slog.Int64("network_rx_bytes", metrics.NetworkRxBytes), + slog.Int64("network_tx_bytes", metrics.NetworkTxBytes), + ) + + return metrics, nil +} + +// parseRawMetrics converts raw Firecracker metrics to structured format +func (c *Client) parseRawMetrics(raw map[string]interface{}) *types.VMMetrics { + metrics := &types.VMMetrics{} + + // Extract CPU metrics + if cpu, ok := raw["cpu"].(map[string]interface{}); ok { + if cpuTimeNs, ok := cpu["cpu_time_ms"].(float64); ok { + metrics.CpuTimeNanos = int64(cpuTimeNs) + } + } + + // Extract memory metrics + if memory, ok := raw["memory"].(map[string]interface{}); ok { + if memUsageBytes, ok := memory["memory_usage_bytes"].(float64); ok { + metrics.MemoryUsageBytes = int64(memUsageBytes) + } + } + + // Extract 
disk metrics + if disk, ok := raw["disk"].(map[string]interface{}); ok { + if readBytes, ok := disk["read_bytes"].(float64); ok { + metrics.DiskReadBytes = int64(readBytes) + } + if writeBytes, ok := disk["write_bytes"].(float64); ok { + metrics.DiskWriteBytes = int64(writeBytes) + } + } + + // Extract network metrics + if network, ok := raw["network"].(map[string]interface{}); ok { + if rxBytes, ok := network["rx_bytes"].(float64); ok { + metrics.NetworkRxBytes = int64(rxBytes) + } + if txBytes, ok := network["tx_bytes"].(float64); ok { + metrics.NetworkTxBytes = int64(txBytes) + } + } + + return metrics +} diff --git a/go/deploy/metald/internal/backend/firecracker/sdk_client_v4.go b/go/deploy/metald/internal/backend/firecracker/sdk_client_v4.go deleted file mode 100644 index 8369c283f1..0000000000 --- a/go/deploy/metald/internal/backend/firecracker/sdk_client_v4.go +++ /dev/null @@ -1,2167 +0,0 @@ -package firecracker - -import ( - "context" - "crypto/rand" - "crypto/sha256" - "encoding/hex" - "encoding/json" - "fmt" - "log/slog" - "os" - "os/exec" - "path/filepath" - "sort" - "strings" - "time" - - sdk "github.com/firecracker-microvm/firecracker-go-sdk" - "github.com/firecracker-microvm/firecracker-go-sdk/client/models" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - builderv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/builderd/v1" - "github.com/unkeyed/unkey/go/deploy/metald/internal/assetmanager" - "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - "github.com/unkeyed/unkey/go/deploy/metald/internal/config" - "github.com/unkeyed/unkey/go/deploy/metald/internal/jailer" - "github.com/unkeyed/unkey/go/deploy/metald/internal/network" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/codes" - "go.opentelemetry.io/otel/metric" - "go.opentelemetry.io/otel/trace" - "golang.org/x/sys/unix" -) - -// 
sdkV4VM represents a VM managed by the SDK v4 -type sdkV4VM struct { - ID string - Config *metaldv1.VmConfig - State metaldv1.VmState - Machine *sdk.Machine - NetworkInfo *network.VMNetwork - CancelFunc context.CancelFunc - AssetMapping *assetMapping // Asset mapping for lease acquisition - AssetPaths map[string]string // Prepared asset paths - PortMappings []network.PortMapping // Port forwarding configuration -} - -// SDKClientV4 implements the Backend interface using firecracker-go-sdk -// with integrated jailer functionality for secure VM isolation. -// -// AIDEV-NOTE: This was previously named SDKClientV4Jailerless which was confusing -// because it DOES use a jailer - just the integrated one, not the external binary. -// The integrated jailer solves tap device permission issues and provides better -// control over the isolation process. -type SDKClientV4 struct { - logger *slog.Logger - networkManager *network.Manager - assetClient assetmanager.Client - vmRepo VMRepository // For port mapping persistence - vmRegistry map[string]*sdkV4VM - vmAssetLeases map[string][]string // VM ID -> asset lease IDs - jailer *jailer.Jailer - jailerConfig *config.JailerConfig - baseDir string - tracer trace.Tracer - meter metric.Meter - vmCreateCounter metric.Int64Counter - vmDeleteCounter metric.Int64Counter - vmBootCounter metric.Int64Counter - vmErrorCounter metric.Int64Counter -} - -// VMRepository defines the interface for VM database operations needed by the backend -type VMRepository interface { - UpdateVMPortMappingsWithContext(ctx context.Context, vmID string, portMappingsJSON string) error -} - -// NewSDKClientV4 creates a new SDK-based Firecracker backend client with integrated jailer -func NewSDKClientV4(logger *slog.Logger, networkManager *network.Manager, assetClient assetmanager.Client, vmRepo VMRepository, jailerConfig *config.JailerConfig, baseDir string) (*SDKClientV4, error) { - tracer := otel.Tracer("metald.firecracker.sdk.v4") - meter := 
otel.Meter("metald.firecracker.sdk.v4") - - vmCreateCounter, err := meter.Int64Counter("vm_create_total", - metric.WithDescription("Total number of VM create operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_create counter: %w", err) - } - - vmDeleteCounter, err := meter.Int64Counter("vm_delete_total", - metric.WithDescription("Total number of VM delete operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_delete counter: %w", err) - } - - vmBootCounter, err := meter.Int64Counter("vm_boot_total", - metric.WithDescription("Total number of VM boot operations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_boot counter: %w", err) - } - - vmErrorCounter, err := meter.Int64Counter("vm_error_total", - metric.WithDescription("Total number of VM operation errors"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, fmt.Errorf("failed to create vm_error counter: %w", err) - } - - // Create integrated jailer - integratedJailer := jailer.NewJailer(logger, jailerConfig) - - return &SDKClientV4{ - logger: logger.With("backend", "firecracker-sdk-v4"), - networkManager: networkManager, - assetClient: assetClient, - vmRepo: vmRepo, - vmRegistry: make(map[string]*sdkV4VM), - vmAssetLeases: make(map[string][]string), - jailer: integratedJailer, - jailerConfig: jailerConfig, - baseDir: baseDir, - tracer: tracer, - meter: meter, - vmCreateCounter: vmCreateCounter, - vmDeleteCounter: vmDeleteCounter, - vmBootCounter: vmBootCounter, - vmErrorCounter: vmErrorCounter, - }, nil -} - -// Initialize initializes the SDK client -func (c *SDKClientV4) Initialize() error { - ctx, span := c.tracer.Start(context.Background(), "metald.firecracker.initialize") - defer span.End() - - c.logger.InfoContext(ctx, "initializing firecracker SDK v4 client with integrated jailer") - c.logger.InfoContext(ctx, "firecracker SDK v4 client 
initialized") - return nil -} - -// CreateVM creates a new VM using the SDK with integrated jailer -func (c *SDKClientV4) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.create_vm", - trace.WithAttributes( - attribute.Int("vcpus", int(config.GetCpu().GetVcpuCount())), - attribute.Int64("memory_bytes", config.GetMemory().GetSizeBytes()), - ), - ) - defer span.End() - - // Generate VM ID - vmID, err := generateV4VMID() - if err != nil { - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "create"), - attribute.String("error", "generate_id"), - )) - return "", fmt.Errorf("failed to generate VM ID: %w", err) - } - span.SetAttributes(attribute.String("vm_id", vmID)) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "creating VM with SDK v4", - slog.String("vm_id", vmID), - slog.Int("vcpus", int(config.GetCpu().GetVcpuCount())), - slog.Int64("memory_bytes", config.GetMemory().GetSizeBytes()), - ) - - // Key difference: Allocate network resources BEFORE creating the jail - // This allows us to create the tap device with full privileges - networkInfo, err := c.networkManager.CreateVMNetwork(ctx, vmID) - if err != nil { - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "create"), - attribute.String("error", "network_allocation"), - )) - return "", fmt.Errorf("failed to allocate network: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "allocated network for VM", - slog.String("vm_id", vmID), - slog.String("namespace", networkInfo.Namespace), - slog.String("tap_device", networkInfo.TapDevice), - slog.String("ip_address", networkInfo.IPAddress.String()), - ) - - // Prepare assets in the jailer chroot - assetMapping, preparedPaths, err := c.prepareVMAssets(ctx, vmID, config) - if err != nil { - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - 
attribute.String("operation", "create"), - attribute.String("error", "asset_preparation"), - )) - // Clean up network allocation - if cleanupErr := c.networkManager.DeleteVMNetwork(ctx, vmID); cleanupErr != nil { - c.logger.ErrorContext(ctx, "failed to cleanup network after asset preparation failure", - "vm_id", vmID, - "error", cleanupErr, - ) - } - return "", fmt.Errorf("failed to prepare VM assets: %w", err) - } - - // Build SDK configuration WITHOUT jailer - // The jailer functionality is now integrated - _ = c.buildFirecrackerConfig(ctx, vmID, config, networkInfo, preparedPaths) - - // Create VM directory - vmDir := filepath.Join(c.baseDir, vmID) - if err := os.MkdirAll(vmDir, 0755); err != nil { - return "", fmt.Errorf("failed to create VM directory: %w", err) - } - - // Register the VM - vm := &sdkV4VM{ - ID: vmID, - Config: config, - State: metaldv1.VmState_VM_STATE_CREATED, - Machine: nil, // Will be set when we boot - NetworkInfo: networkInfo, - CancelFunc: nil, // Will be set when we boot - AssetMapping: assetMapping, - AssetPaths: preparedPaths, - } - - c.vmRegistry[vmID] = vm - - c.vmCreateCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM created successfully with SDK v4", - slog.String("vm_id", vmID), - ) - - return vmID, nil -} - -// BootVM starts a created VM using our integrated jailer -func (c *SDKClientV4) BootVM(ctx context.Context, vmID string) error { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.boot_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "vm_not_found"), - )) - return err - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "booting VM with SDK v4", - 
slog.String("vm_id", vmID), - ) - - // For integrated jailer, we run firecracker in the VM directory - vmDir := filepath.Join(c.baseDir, vmID) - socketPath := filepath.Join(vmDir, "firecracker.sock") - - // Create log files - logPath := filepath.Join(vmDir, "firecracker.log") - logFile, err := os.Create(logPath) - if err != nil { - return fmt.Errorf("failed to create log file: %w", err) - } - defer logFile.Close() - - // Load container metadata and parse port mappings - var metadata *builderv1.ImageMetadata - var portMappings []network.PortMapping - for _, disk := range vm.Config.GetStorage() { - if disk.GetIsRootDevice() { - // AIDEV-NOTE: Use chroot path for metadata loading since assets are copied there - // The original disk path points to asset manager, but metadata.json is in chroot - jailerRoot := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID, "root") - chrootRootfsPath := filepath.Join(jailerRoot, "rootfs.ext4") - - if m, err := c.loadContainerMetadata(ctx, chrootRootfsPath); err != nil { - c.logger.WarnContext(ctx, "failed to load container metadata", - "error", err, - "chroot_rootfs_path", chrootRootfsPath, - ) - } else if m != nil { - metadata = m - - // AIDEV-NOTE: Create /container.cmd file for metald-init - // Combine entrypoint and command into a single JSON array - if err := c.createContainerCmdFile(ctx, vmID, metadata); err != nil { - c.logger.WarnContext(ctx, "failed to create container.cmd file", - "error", err, - "vm_id", vmID, - ) - } - - if mappings, err := c.parseExposedPorts(ctx, vmID, metadata); err != nil { - c.logger.ErrorContext(ctx, "failed to parse exposed ports", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - // Continue without port mappings rather than failing the boot - } else { - portMappings = mappings - } - c.logger.LogAttrs(ctx, slog.LevelInfo, "loaded metadata for VM boot", - slog.String("vm_id", vmID), - slog.Int("port_count", len(portMappings)), - ) - break - } - } - } - - // Build 
firecracker config that will be used by SDK - fcConfig := c.buildFirecrackerConfig(ctx, vmID, vm.Config, vm.NetworkInfo, vm.AssetPaths) - fcConfig.SocketPath = socketPath - - // Update kernel args with metadata if available - if metadata != nil { - fcConfig.KernelArgs = c.buildKernelArgsWithMetadata(ctx, fcConfig.KernelArgs, metadata) - } - - // Create a context for this VM - vmCtx, cancel := context.WithCancel(context.Background()) - vm.CancelFunc = cancel - - // For integrated jailer, we use the SDK directly without external jailer - // The network namespace is already set up and tap device created - // We'll let the SDK manage firecracker but in our network namespace - - // Set the network namespace for the SDK to use - if vm.NetworkInfo != nil && vm.NetworkInfo.Namespace != "" { - fcConfig.NetNS = filepath.Join("/run/netns", vm.NetworkInfo.Namespace) - } - - // Create and start the machine using SDK - machine, err := sdk.NewMachine(vmCtx, fcConfig) - if err != nil { - cancel() - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "create_machine"), - )) - return fmt.Errorf("failed to create firecracker machine: %w", err) - } - - // Start the VM - if err := machine.Start(vmCtx); err != nil { - cancel() - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "boot"), - attribute.String("error", "start_machine"), - )) - return fmt.Errorf("failed to start firecracker machine: %w", err) - } - - vm.Machine = machine - vm.State = metaldv1.VmState_VM_STATE_RUNNING - vm.PortMappings = portMappings - - // AIDEV-NOTE: Persist port mappings to database for state recovery - if c.vmRepo != nil && len(portMappings) > 0 { - portMappingsJSON, err := json.Marshal(portMappings) - if err != nil { - c.logger.WarnContext(ctx, "failed to marshal port mappings for persistence", - slog.String("vm_id", vmID), - slog.String("error", 
err.Error()), - ) - } else { - if err := c.vmRepo.UpdateVMPortMappingsWithContext(ctx, vmID, string(portMappingsJSON)); err != nil { - c.logger.WarnContext(ctx, "failed to persist port mappings to database", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - } else { - c.logger.InfoContext(ctx, "persisted port mappings to database", - slog.String("vm_id", vmID), - slog.Int("port_count", len(portMappings)), - ) - } - } - } - - // Acquire asset leases after successful boot - if vm.AssetMapping != nil && len(vm.AssetMapping.AssetIDs()) > 0 { - c.logger.LogAttrs(ctx, slog.LevelInfo, "acquiring asset leases for VM", - slog.String("vm_id", vmID), - slog.Int("asset_count", len(vm.AssetMapping.AssetIDs())), - ) - - leaseIDs := []string{} - for _, assetID := range vm.AssetMapping.AssetIDs() { - ctx, acquireSpan := c.tracer.Start(ctx, "metald.firecracker.acquire_asset", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("asset.id", assetID), - ), - ) - leaseID, err := c.assetClient.AcquireAsset(ctx, assetID, vmID) - if err != nil { - acquireSpan.RecordError(err) - acquireSpan.SetStatus(codes.Error, err.Error()) - } else { - acquireSpan.SetAttributes(attribute.String("lease.id", leaseID)) - } - acquireSpan.End() - if err != nil { - c.logger.ErrorContext(ctx, "failed to acquire asset lease", - "vm_id", vmID, - "asset_id", assetID, - "error", err, - ) - // Continue trying to acquire other leases even if one fails - // AIDEV-TODO: Consider whether to fail the boot if lease acquisition fails - } else { - leaseIDs = append(leaseIDs, leaseID) - } - } - - // Store lease IDs for cleanup during VM deletion - if len(leaseIDs) > 0 { - c.vmAssetLeases[vmID] = leaseIDs - c.logger.LogAttrs(ctx, slog.LevelInfo, "acquired asset leases", - slog.String("vm_id", vmID), - slog.Int("lease_count", len(leaseIDs)), - ) - } - } - - // Configure port forwarding if we have mappings - if vm.NetworkInfo != nil && len(vm.PortMappings) > 0 { - if err := 
c.configurePortForwarding(ctx, vmID, vm.NetworkInfo.IPAddress.String(), vm.PortMappings); err != nil { - c.logger.ErrorContext(ctx, "failed to configure port forwarding", - "vm_id", vmID, - "error", err, - ) - // Don't fail the VM boot, but log the error - } - } - - c.vmBootCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM booted successfully with SDK v4", - slog.String("vm_id", vmID), - ) - - return nil -} - -// Other methods would be similar to SDKClientV3... - -// buildFirecrackerConfig builds the SDK configuration without jailer -func (c *SDKClientV4) buildFirecrackerConfig(ctx context.Context, vmID string, config *metaldv1.VmConfig, networkInfo *network.VMNetwork, preparedPaths map[string]string) sdk.Config { - // For integrated jailer, we use absolute paths since we're not running inside chroot - // The assets are still in the jailer directory structure for consistency - jailerRoot := filepath.Join( - c.jailerConfig.ChrootBaseDir, - "firecracker", - vmID, - "root", - ) - - socketPath := "/firecracker.sock" - - // Determine kernel path - use prepared path if available, otherwise fallback to default - kernelPath := filepath.Join(jailerRoot, "vmlinux") - if preparedPaths != nil && len(preparedPaths) > 0 { - // AIDEV-NOTE: In a more sophisticated implementation, we'd track which asset ID - // corresponds to which component (kernel vs rootfs). For now, we rely on the - // assetmanager preparing files with standard names in the target directory. - // The prepared paths should already be in the jailerRoot directory. 
- c.logger.LogAttrs(ctx, slog.LevelDebug, "using prepared asset paths", - slog.String("vm_id", vmID), - slog.Int("path_count", len(preparedPaths)), - ) - } - - // Use host path since Firecracker is running outside chroot in "jailerless" mode - metricsPath := filepath.Join(jailerRoot, "metrics.fifo") - - // AIDEV-NOTE: Create metrics FIFO for billaged to read Firecracker stats - // billaged should read from: {jailerRoot}/metrics.fifo - // e.g., /srv/jailer/firecracker/{vmID}/root/metrics.fifo - hostMetricsPath := filepath.Join(jailerRoot, "metrics.fifo") - - // Create the metrics FIFO in the host filesystem - if err := unix.Mkfifo(hostMetricsPath, 0644); err != nil && !os.IsExist(err) { - c.logger.Error("failed to create metrics FIFO", - slog.String("vm_id", vmID), - slog.String("path", hostMetricsPath), - slog.String("error", err.Error()), - ) - } else { - c.logger.Info("created metrics FIFO for billaged", - slog.String("vm_id", vmID), - slog.String("host_path", hostMetricsPath), - slog.String("chroot_path", metricsPath), - ) - } - - // Use the kernel args as provided by the caller - // Metadata handling is now done in BootVM - kernelArgs := config.GetBoot().GetKernelArgs() - - // AIDEV-NOTE: Guest console logging configuration - // LogPath captures Firecracker's own logs, but LogFifo+FifoLogWriter captures guest OS console output - // This includes Linux kernel boot messages from console=ttyS0 kernel parameter - consoleLogPath := filepath.Join(jailerRoot, "console.log") - consoleFifoPath := filepath.Join(jailerRoot, "console.fifo") - - // Create the console log file to capture guest output - consoleLogFile, err := os.OpenFile(consoleLogPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) - - var cfg sdk.Config - if err != nil { - // Fall back to LogPath only (original behavior) if console log file creation fails - c.logger.WarnContext(ctx, "failed to create console log file, falling back to LogPath only", - slog.String("error", err.Error()), - 
slog.String("console_log_path", consoleLogPath), - ) - cfg = sdk.Config{ //nolint:exhaustruct // Optional fields are not needed for basic VM configuration - SocketPath: socketPath, - LogPath: consoleLogPath, // Original behavior - captures Firecracker logs only - LogLevel: "Debug", - MetricsPath: metricsPath, // Configure stats socket for billaged - KernelImagePath: kernelPath, - KernelArgs: kernelArgs, - MachineCfg: models.MachineConfiguration{ //nolint:exhaustruct // Only setting required fields for basic VM configuration - VcpuCount: sdk.Int64(int64(config.GetCpu().GetVcpuCount())), - MemSizeMib: sdk.Int64(config.GetMemory().GetSizeBytes() / (1024 * 1024)), - Smt: sdk.Bool(false), - }, - // No JailerCfg - we handle jailing ourselves - } - } else { - // Successful case - capture guest console output via FIFO - // Note: consoleLogFile will be closed when the VM shuts down via FifoLogWriter - cfg = sdk.Config{ //nolint:exhaustruct // Optional fields are not needed for basic VM configuration - SocketPath: socketPath, - LogPath: filepath.Join(jailerRoot, "firecracker.log"), // Firecracker's own logs - LogFifo: consoleFifoPath, // FIFO for guest console output - FifoLogWriter: consoleLogFile, // Writer to capture guest console to file - LogLevel: "Debug", - MetricsPath: metricsPath, // Configure stats socket for billaged - KernelImagePath: kernelPath, - KernelArgs: kernelArgs, - MachineCfg: models.MachineConfiguration{ //nolint:exhaustruct // Only setting required fields for basic VM configuration - VcpuCount: sdk.Int64(int64(config.GetCpu().GetVcpuCount())), - MemSizeMib: sdk.Int64(config.GetMemory().GetSizeBytes() / (1024 * 1024)), - Smt: sdk.Bool(false), - }, - // No JailerCfg - we handle jailing ourselves - } - } - - // Add drives - cfg.Drives = make([]models.Drive, 0, len(config.GetStorage())) - for i, disk := range config.GetStorage() { - driveID := disk.GetId() - if driveID == "" { - if disk.GetIsRootDevice() || i == 0 { - driveID = "rootfs" - } else { - 
driveID = fmt.Sprintf("drive_%d", i) - } - } - - // Use absolute paths for integrated jailer - // AIDEV-NOTE: Use standardized filename instead of the original config path - // to match what asset preparation creates (rootfs.ext4, not Docker-specific names) - diskFilename := filepath.Base(disk.GetPath()) - if disk.GetIsRootDevice() || i == 0 { - // For root devices, always use the standardized name that assetmanager creates - diskFilename = "rootfs.ext4" - } - - drive := models.Drive{ //nolint:exhaustruct // Only setting required drive fields - DriveID: &driveID, - PathOnHost: sdk.String(filepath.Join(jailerRoot, diskFilename)), - IsRootDevice: sdk.Bool(disk.GetIsRootDevice() || i == 0), - IsReadOnly: sdk.Bool(disk.GetReadOnly()), - } - cfg.Drives = append(cfg.Drives, drive) - } - - // Add network interface - if networkInfo != nil { - iface := sdk.NetworkInterface{ //nolint:exhaustruct // Only setting required network interface fields - StaticConfiguration: &sdk.StaticNetworkConfiguration{ //nolint:exhaustruct // Only setting required network configuration fields - HostDevName: networkInfo.TapDevice, - MacAddress: networkInfo.MacAddress, - }, - } - cfg.NetworkInterfaces = []sdk.NetworkInterface{iface} - } - - return cfg -} - -// assetRequirement represents a required asset for VM creation -type assetRequirement struct { - Type assetv1.AssetType - Labels map[string]string - Required bool -} - -// assetMapping tracks the mapping between requirements and actual assets -type assetMapping struct { - requirements []assetRequirement - assets map[string]*assetv1.Asset // requirement index -> asset - assetIDs []string - leaseIDs []string -} - -func (am *assetMapping) AssetIDs() []string { - return am.assetIDs -} - -func (am *assetMapping) LeaseIDs() []string { - return am.leaseIDs -} - -// buildAssetRequirements analyzes VM config to determine required assets -func (c *SDKClientV4) buildAssetRequirements(config *metaldv1.VmConfig) []assetRequirement { - var reqs 
[]assetRequirement - - // DEBUG: Log VM config for docker image troubleshooting - c.logger.Info("DEBUG: analyzing VM config for assets", - "storage_count", len(config.Storage), - "metadata", config.Metadata, - ) - for i, disk := range config.Storage { - c.logger.Info("DEBUG: storage device", - "index", i, - "id", disk.Id, - "path", disk.Path, - "is_root", disk.IsRootDevice, - "options", disk.Options, - ) - } - - // Kernel requirement - if config.Boot != nil && config.Boot.KernelPath != "" { - reqs = append(reqs, assetRequirement{ - Type: assetv1.AssetType_ASSET_TYPE_KERNEL, - Required: true, - }) - } - - // Rootfs requirements from storage devices - for _, disk := range config.Storage { - if disk.IsRootDevice { - labels := make(map[string]string) - // Check for docker image in disk options first, then config metadata - if dockerImage, ok := disk.Options["docker_image"]; ok { - labels["docker_image"] = dockerImage - } else if dockerImage, ok := config.Metadata["docker_image"]; ok { - labels["docker_image"] = dockerImage - } - - // Note: force_rebuild is handled separately via BuildOptions, not asset labels - // We don't add force_rebuild to asset labels since it's a build trigger, not an asset attribute - reqs = append(reqs, assetRequirement{ - Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, - Labels: labels, - Required: true, - }) - } - } - - // Initrd requirement (optional) - if config.Boot != nil && config.Boot.InitrdPath != "" { - reqs = append(reqs, assetRequirement{ - Type: assetv1.AssetType_ASSET_TYPE_INITRD, - Required: false, - }) - } - - return reqs -} - -// matchAssets matches available assets to requirements -func (c *SDKClientV4) matchAssets(reqs []assetRequirement, availableAssets []*assetv1.Asset) (*assetMapping, error) { - mapping := &assetMapping{ - requirements: reqs, - assets: make(map[string]*assetv1.Asset), - assetIDs: []string{}, - } - - for i, req := range reqs { - var matched *assetv1.Asset - - // Find best matching asset - for _, asset := range 
availableAssets { - if asset.Type != req.Type { - continue - } - - // Check if all required labels match - labelMatch := true - for k, v := range req.Labels { - if assetLabel, ok := asset.Labels[k]; !ok || assetLabel != v { - labelMatch = false - break - } - } - - if labelMatch { - matched = asset - break - } - } - - if matched == nil && req.Required { - // Build helpful error message - labelStr := "" - for k, v := range req.Labels { - if labelStr != "" { - labelStr += ", " - } - labelStr += fmt.Sprintf("%s=%s", k, v) - } - return nil, fmt.Errorf("no matching asset found for type %s with labels {%s}", - req.Type.String(), labelStr) - } - - if matched != nil { - mapping.assets[fmt.Sprintf("%d", i)] = matched - mapping.assetIDs = append(mapping.assetIDs, matched.Id) - } - } - - return mapping, nil -} - -// prepareVMAssets prepares kernel and rootfs assets for the VM in the jailer chroot -// Returns the asset mapping for lease acquisition after successful boot -func (c *SDKClientV4) prepareVMAssets(ctx context.Context, vmID string, config *metaldv1.VmConfig) (*assetMapping, map[string]string, error) { - // Calculate the jailer chroot path - jailerRoot := filepath.Join( - c.jailerConfig.ChrootBaseDir, - "firecracker", - vmID, - "root", - ) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "preparing VM assets using assetmanager", - slog.String("vm_id", vmID), - slog.String("target_path", jailerRoot), - ) - - // Ensure the jailer root directory exists - if err := os.MkdirAll(jailerRoot, 0755); err != nil { - return nil, nil, fmt.Errorf("failed to create jailer root directory: %w", err) - } - - // Check if assetmanager is enabled - // If disabled (using noop client), fall back to static file copying for backward compatibility - // AIDEV-NOTE: We check if the QueryAssets call succeeds to determine if assetmanager is available - // We don't require assets to exist, as they can be built on demand - ctx, checkSpan := c.tracer.Start(ctx, "metald.firecracker.check_assetmanager", - 
trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("asset.type", "KERNEL"), - ), - ) - _, err := c.assetClient.QueryAssets(ctx, assetv1.AssetType_ASSET_TYPE_KERNEL, nil, nil) - checkSpan.End() - if err != nil { - c.logger.LogAttrs(ctx, slog.LevelInfo, "assetmanager disabled or unavailable, using static file copying", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - // AIDEV-NOTE: Fallback to old behavior when assetmanager is disabled - // This ensures backward compatibility - if err := c.prepareVMAssetsStatic(ctx, vmID, config, jailerRoot); err != nil { - return nil, nil, err - } - return nil, nil, nil - } - - // Build asset requirements from VM configuration - requiredAssets := c.buildAssetRequirements(config) - c.logger.LogAttrs(ctx, slog.LevelDebug, "determined asset requirements", - slog.String("vm_id", vmID), - slog.Int("required_count", len(requiredAssets)), - ) - - // Query assetmanager for available assets with automatic build support - // AIDEV-NOTE: Using QueryAssets instead of ListAssets to enable automatic asset creation - allAssets := []*assetv1.Asset{} - - // Extract tenant_id from VM metadata if available, with fallback to default - tenantID := "cli-tenant" // AIDEV-NOTE: Default tenant for CLI operations - if tid, ok := config.Metadata["tenant_id"]; ok { - tenantID = tid - } - - // Group requirements by type and labels for efficient querying - type queryKey struct { - assetType assetv1.AssetType - labels string // Serialized labels for grouping - } - queryGroups := make(map[queryKey][]assetRequirement) - - for _, req := range requiredAssets { - // Serialize labels for grouping - labelStr := "" - for k, v := range req.Labels { - if labelStr != "" { - labelStr += "," - } - labelStr += fmt.Sprintf("%s=%s", k, v) - } - key := queryKey{assetType: req.Type, labels: labelStr} - queryGroups[key] = append(queryGroups[key], req) - } - - // Query each unique combination of type and labels - for key, reqs := 
range queryGroups { - // Use the first requirement's labels (they should all be the same in the group) - labels := reqs[0].Labels - - // Generate a deterministic asset ID based on the asset type and labels - // This allows us to query for the exact asset later - assetID := c.generateAssetID(key.assetType, labels) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "generated asset ID for query", - slog.String("asset_id", assetID), - slog.String("asset_type", key.assetType.String()), - slog.Any("labels", labels), - ) - - // Configure build options for automatic asset creation - // AIDEV-NOTE: When WaitForCompletion is true, VM creation will block until the build - // completes. This provides a synchronous experience where the VM is ready to boot - // immediately after creation, but may cause longer wait times (up to 30 minutes - // for large images). The client timeout should be configured accordingly. - - // Create build labels (copy asset labels and add force_rebuild if needed) - buildLabels := make(map[string]string) - for k, v := range labels { - buildLabels[k] = v - } - - // Check for force_rebuild in VM config metadata (separate from asset labels) - if forceRebuild, ok := config.Metadata["force_rebuild"]; ok && forceRebuild == "true" { - buildLabels["force_rebuild"] = "true" - } - - buildOptions := &assetv1.BuildOptions{ - EnableAutoBuild: true, - WaitForCompletion: true, // Block VM creation until build completes - BuildTimeoutSeconds: 1800, // 30 minutes maximum wait time - TenantId: tenantID, - SuggestedAssetId: assetID, - BuildLabels: buildLabels, // Pass build labels including force_rebuild to assetmanagerd - } - - // Query assets with automatic build support - // Create a quick span just to record that we're initiating a query - _, initSpan := c.tracer.Start(ctx, "metald.firecracker.query_assets", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("asset.type", key.assetType.String()), - attribute.StringSlice("asset.labels", func() 
[]string { - var labelPairs []string - for k, v := range labels { - labelPairs = append(labelPairs, fmt.Sprintf("%s=%s", k, v)) - } - return labelPairs - }()), - attribute.String("tenant.id", tenantID), - attribute.Bool("auto_build.enabled", buildOptions.EnableAutoBuild), - attribute.Int("build.timeout_seconds", int(buildOptions.BuildTimeoutSeconds)), - ), - ) - initSpan.End() // End immediately - this just marks the initiation - - // Make the actual call without wrapping in a span (it has its own internal spans) - resp, err := c.assetClient.QueryAssets(ctx, key.assetType, labels, buildOptions) - if err != nil { - return nil, nil, fmt.Errorf("failed to query assets of type %s with labels %v: %w", - key.assetType.String(), labels, err) - } - - // Create a quick span to record the results - _, resultSpan := c.tracer.Start(ctx, "metald.firecracker.query_assets_complete", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("asset.type", key.assetType.String()), - attribute.Int("assets.found", len(resp.GetAssets())), - attribute.Int("builds.triggered", len(resp.GetTriggeredBuilds())), - ), - ) - resultSpan.End() - - // Log any triggered builds - for _, build := range resp.GetTriggeredBuilds() { - c.logger.LogAttrs(ctx, slog.LevelInfo, "automatic build triggered for missing asset", - slog.String("vm_id", vmID), - slog.String("build_id", build.GetBuildId()), - slog.String("docker_image", build.GetDockerImage()), - slog.String("status", build.GetStatus()), - ) - - if build.GetStatus() == "failed" { - c.logger.LogAttrs(ctx, slog.LevelError, "automatic build failed", - slog.String("vm_id", vmID), - slog.String("build_id", build.GetBuildId()), - slog.String("error", build.GetErrorMessage()), - ) - } - } - - allAssets = append(allAssets, resp.GetAssets()...) 
- } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "retrieved available assets", - slog.String("vm_id", vmID), - slog.Int("available_count", len(allAssets)), - ) - - // Log asset details for debugging - for _, asset := range allAssets { - c.logger.LogAttrs(ctx, slog.LevelInfo, "available asset", - slog.String("asset_id", asset.Id), - slog.String("asset_type", asset.Type.String()), - slog.Any("labels", asset.Labels), - ) - } - - // Match required assets with available ones - assetMapping, err := c.matchAssets(requiredAssets, allAssets) - if err != nil { - c.logger.LogAttrs(ctx, slog.LevelError, "failed to match assets", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return nil, nil, fmt.Errorf("asset matching failed: %w", err) - } - - // Prepare assets in target location - ctx, prepareSpan := c.tracer.Start(ctx, "metald.firecracker.prepare_assets", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.StringSlice("asset.ids", assetMapping.AssetIDs()), - attribute.String("target.path", jailerRoot), - ), - ) - preparedPaths, err := c.assetClient.PrepareAssets( - ctx, - assetMapping.AssetIDs(), - jailerRoot, - vmID, - ) - if err != nil { - prepareSpan.RecordError(err) - prepareSpan.SetStatus(codes.Error, err.Error()) - } else { - prepareSpan.SetAttributes( - attribute.Int("assets.prepared", len(preparedPaths)), - ) - } - prepareSpan.End() - if err != nil { - return nil, nil, fmt.Errorf("failed to prepare assets: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "assets prepared successfully", - slog.String("vm_id", vmID), - slog.Int("asset_count", len(preparedPaths)), - ) - - // The preparedPaths map contains asset_id -> actual_path mappings - // These paths will be used to update the VM configuration before starting - // Asset leases will be acquired after successful VM boot in BootVM - // to avoid holding leases for VMs that fail to start - - // AIDEV-NOTE: Copy metadata files alongside rootfs assets if they exist - // 
Asset manager only handles the rootfs, but we need metadata for container execution - if err := c.copyMetadataFilesForAssets(ctx, vmID, config, preparedPaths, jailerRoot); err != nil { - c.logger.WarnContext(ctx, "failed to copy metadata files", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - // Don't fail asset preparation for metadata issues - VM can still run without metadata - } - - return assetMapping, preparedPaths, nil -} - -// prepareVMAssetsStatic is the fallback implementation for static file copying -// Used when assetmanager is disabled for backward compatibility -func (c *SDKClientV4) prepareVMAssetsStatic(ctx context.Context, vmID string, config *metaldv1.VmConfig, jailerRoot string) error { - // Copy kernel - if kernelPath := config.GetBoot().GetKernelPath(); kernelPath != "" { - kernelDst := filepath.Join(jailerRoot, "vmlinux") - if err := copyFileWithOwnership(kernelPath, kernelDst, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { - return fmt.Errorf("failed to copy kernel: %w", err) - } - c.logger.LogAttrs(ctx, slog.LevelInfo, "copied kernel to jailer root", - slog.String("src", kernelPath), - slog.String("dst", kernelDst), - ) - } - - // Copy rootfs images - for _, disk := range config.GetStorage() { - if disk.GetPath() != "" { - diskDst := filepath.Join(jailerRoot, filepath.Base(disk.GetPath())) - if err := copyFileWithOwnership(disk.GetPath(), diskDst, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { - return fmt.Errorf("failed to copy disk %s: %w", disk.GetPath(), err) - } - c.logger.LogAttrs(ctx, slog.LevelInfo, "copied disk to jailer root", - slog.String("src", disk.GetPath()), - slog.String("dst", diskDst), - ) - - // Also copy metadata file if it exists - if disk.GetIsRootDevice() { - baseName := strings.TrimSuffix(filepath.Base(disk.GetPath()), filepath.Ext(disk.GetPath())) - metadataSrc := filepath.Join(filepath.Dir(disk.GetPath()), baseName+".metadata.json") - if _, err := 
os.Stat(metadataSrc); err == nil { - metadataDst := filepath.Join(jailerRoot, filepath.Base(metadataSrc)) - if err := copyFileWithOwnership(metadataSrc, metadataDst, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { - c.logger.WarnContext(ctx, "failed to copy metadata file", - "src", metadataSrc, - "dst", metadataDst, - "error", err, - ) - } else { - c.logger.LogAttrs(ctx, slog.LevelInfo, "copied metadata file to jailer root", - slog.String("src", metadataSrc), - slog.String("dst", metadataDst), - ) - - // Write command file to rootfs by mounting it temporarily - // This avoids kernel command line parsing issues - metadata, err := c.loadContainerMetadata(ctx, disk.GetPath()) - if err == nil && metadata != nil { - // Build the command array - var fullCmd []string - fullCmd = append(fullCmd, metadata.Entrypoint...) - fullCmd = append(fullCmd, metadata.Command...) - - if len(fullCmd) > 0 { - // Mount the rootfs temporarily to write the command file - mountDir := filepath.Join("/tmp", fmt.Sprintf("mount-%s", vmID)) - if err := os.MkdirAll(mountDir, 0755); err == nil { - // Mount the rootfs ext4 image - mountCmd := exec.CommandContext(ctx, "mount", "-o", "loop", diskDst, mountDir) - if err := mountCmd.Run(); err != nil { - c.logger.WarnContext(ctx, "failed to mount rootfs for command file", - "error", err, - "disk", diskDst, - ) - } else { - // Write the command file - cmdFile := filepath.Join(mountDir, "container.cmd") - cmdData, _ := json.Marshal(fullCmd) - if err := os.WriteFile(cmdFile, cmdData, 0644); err != nil { - c.logger.WarnContext(ctx, "failed to write command file", - "path", cmdFile, - "error", err, - ) - } else { - c.logger.LogAttrs(ctx, slog.LevelInfo, "wrote container command file to rootfs", - slog.String("path", cmdFile), - slog.String("command", string(cmdData)), - ) - } - - // Unmount - umountCmd := exec.CommandContext(ctx, "umount", mountDir) - if err := umountCmd.Run(); err != nil { - c.logger.WarnContext(ctx, "failed to unmount 
rootfs", - "error", err, - "mountDir", mountDir, - ) - } - os.RemoveAll(mountDir) - } - } - } - } - } - } - } - } - } - - return nil -} - -// DeleteVM deletes a VM and cleans up its resources -func (c *SDKClientV4) DeleteVM(ctx context.Context, vmID string) error { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.delete_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - c.logger.LogAttrs(ctx, slog.LevelInfo, "deleting VM", - slog.String("vm_id", vmID), - ) - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - c.vmErrorCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "delete"), - attribute.String("error", "vm_not_found"), - )) - return err - } - - // Stop the VM if it's running - if vm.Machine != nil { - if err := vm.Machine.StopVMM(); err != nil { - c.logger.WarnContext(ctx, "failed to stop VMM during delete", - "vm_id", vmID, - "error", err, - ) - } - - // Cancel the VM context - if vm.CancelFunc != nil { - vm.CancelFunc() - } - } - - // Remove port forwarding rules before deleting network - if vm.NetworkInfo != nil && len(vm.PortMappings) > 0 { - if err := c.removePortForwarding(ctx, vmID, vm.NetworkInfo.IPAddress.String(), vm.PortMappings); err != nil { - c.logger.WarnContext(ctx, "failed to remove port forwarding", - "vm_id", vmID, - "error", err, - ) - } - - // Release allocated ports in network manager - releasedMappings := c.networkManager.ReleaseVMPorts(vmID) - c.logger.InfoContext(ctx, "released VM port allocations", - slog.String("vm_id", vmID), - slog.Int("port_count", len(releasedMappings)), - ) - } - - // Delete network resources - if err := c.networkManager.DeleteVMNetwork(ctx, vmID); err != nil { - c.logger.ErrorContext(ctx, "failed to delete VM network", - "vm_id", vmID, - "error", err, - ) - // Continue with deletion even if network cleanup fails - } - - // Clean up VM directory - vmDir := 
filepath.Join(c.baseDir, vmID) - if err := os.RemoveAll(vmDir); err != nil { - c.logger.WarnContext(ctx, "failed to remove VM directory", - "vm_id", vmID, - "path", vmDir, - "error", err, - ) - } - - // Clean up jailer chroot - chrootPath := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID) - if err := os.RemoveAll(chrootPath); err != nil { - c.logger.WarnContext(ctx, "failed to remove jailer chroot", - "vm_id", vmID, - "path", chrootPath, - "error", err, - ) - } - - // Release asset leases - if leaseIDs, ok := c.vmAssetLeases[vmID]; ok { - c.logger.LogAttrs(ctx, slog.LevelInfo, "releasing asset leases", - slog.String("vm_id", vmID), - slog.Int("lease_count", len(leaseIDs)), - ) - - for _, leaseID := range leaseIDs { - ctx, releaseSpan := c.tracer.Start(ctx, "metald.firecracker.release_asset", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("lease.id", leaseID), - ), - ) - err := c.assetClient.ReleaseAsset(ctx, leaseID) - if err != nil { - releaseSpan.RecordError(err) - releaseSpan.SetStatus(codes.Error, err.Error()) - } - releaseSpan.End() - if err != nil { - c.logger.ErrorContext(ctx, "failed to release asset lease", - "vm_id", vmID, - "lease_id", leaseID, - "error", err, - ) - // Continue with other leases even if one fails - } - } - delete(c.vmAssetLeases, vmID) - } - - // Remove from registry - delete(c.vmRegistry, vmID) - - c.vmDeleteCounter.Add(ctx, 1, metric.WithAttributes( - attribute.String("status", "success"), - )) - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM deleted successfully", - slog.String("vm_id", vmID), - ) - - return nil -} - -// ShutdownVM gracefully shuts down a VM -func (c *SDKClientV4) ShutdownVM(ctx context.Context, vmID string) error { - return c.ShutdownVMWithOptions(ctx, vmID, false, 30) -} - -// ShutdownVMWithOptions shuts down a VM with configurable options -func (c *SDKClientV4) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeoutSeconds int32) error { - ctx, span 
:= c.tracer.Start(ctx, "metald.firecracker.shutdown_vm", - trace.WithAttributes( - attribute.String("vm_id", vmID), - attribute.Bool("force", force), - attribute.Int("timeout_seconds", int(timeoutSeconds)), - ), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - if vm.Machine == nil { - return fmt.Errorf("vm %s is not running", vmID) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "shutting down VM", - slog.String("vm_id", vmID), - slog.Bool("force", force), - slog.Int("timeout_seconds", int(timeoutSeconds)), - ) - - // Create a timeout context - shutdownCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSeconds)*time.Second) - defer cancel() - - if force { //nolint:nestif // Complex shutdown logic requires nested conditions for force vs graceful shutdown - // Force shutdown by stopping the VMM immediately - if err := vm.Machine.StopVMM(); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to force stop VM: %w", err) - } - } else { - // Try graceful shutdown first - if err := vm.Machine.Shutdown(shutdownCtx); err != nil { - c.logger.WarnContext(ctx, "graceful shutdown failed, attempting force stop", - "vm_id", vmID, - "error", err, - ) - // Fall back to force stop - if stopErr := vm.Machine.StopVMM(); stopErr != nil { - span.RecordError(stopErr) - return fmt.Errorf("failed to stop VM after graceful shutdown failed: %w", stopErr) - } - } - } - - // Wait for the VM to actually stop - if err := vm.Machine.Wait(shutdownCtx); err != nil { - c.logger.WarnContext(ctx, "error waiting for VM to stop", - "vm_id", vmID, - "error", err, - ) - } - - // Update state - vm.State = metaldv1.VmState_VM_STATE_SHUTDOWN - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM shutdown successfully", - slog.String("vm_id", vmID), - ) - - return nil -} - -// PauseVM pauses a running VM -func (c *SDKClientV4) PauseVM(ctx context.Context, vmID string) error { - 
ctx, span := c.tracer.Start(ctx, "metald.firecracker.pause_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - if vm.Machine == nil { - return fmt.Errorf("vm %s is not running", vmID) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "pausing VM", - slog.String("vm_id", vmID), - ) - - if err := vm.Machine.PauseVM(ctx); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to pause VM: %w", err) - } - - vm.State = metaldv1.VmState_VM_STATE_PAUSED - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM paused successfully", - slog.String("vm_id", vmID), - ) - - return nil -} - -// ResumeVM resumes a paused VM -func (c *SDKClientV4) ResumeVM(ctx context.Context, vmID string) error { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.resume_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return err - } - - if vm.Machine == nil { - return fmt.Errorf("vm %s is not running", vmID) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "resuming VM", - slog.String("vm_id", vmID), - ) - - if err := vm.Machine.ResumeVM(ctx); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to resume VM: %w", err) - } - - vm.State = metaldv1.VmState_VM_STATE_RUNNING - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM resumed successfully", - slog.String("vm_id", vmID), - ) - - return nil -} - -// RebootVM reboots a running VM -func (c *SDKClientV4) RebootVM(ctx context.Context, vmID string) error { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.reboot_vm", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - c.logger.LogAttrs(ctx, slog.LevelInfo, "rebooting VM", - slog.String("vm_id", vmID), - ) - - // Shutdown the VM - 
if err := c.ShutdownVMWithOptions(ctx, vmID, false, 30); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to shutdown VM for reboot: %w", err) - } - - // Wait a moment - time.Sleep(1 * time.Second) - - // Boot the VM again - if err := c.BootVM(ctx, vmID); err != nil { - span.RecordError(err) - return fmt.Errorf("failed to boot VM after shutdown: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "VM rebooted successfully", - slog.String("vm_id", vmID), - ) - - return nil -} - -// generateAssetID generates a deterministic asset ID based on type and labels -func (c *SDKClientV4) generateAssetID(assetType assetv1.AssetType, labels map[string]string) string { - // Create a deterministic string from sorted labels - var parts []string - parts = append(parts, fmt.Sprintf("type=%s", assetType.String())) - - // Sort label keys for deterministic ordering - var keys []string - for k := range labels { - keys = append(keys, k) - } - sort.Strings(keys) - - // Add sorted labels - for _, k := range keys { - parts = append(parts, fmt.Sprintf("%s=%s", k, labels[k])) - } - - // Create a hash of the combined string - combined := strings.Join(parts, ",") - hash := sha256.Sum256([]byte(combined)) - - // Return a readable asset ID - return fmt.Sprintf("asset-%x", hash[:8]) -} - -// GetVMInfo returns information about a VM -// AIDEV-NOTE: GetVMInfo now includes port mappings in the NetworkInfo response -// Port mappings are retrieved from the network manager and converted to protobuf format -// This allows CLI clients to display randomly assigned host ports for VM services -func (c *SDKClientV4) GetVMInfo(ctx context.Context, vmID string) (*types.VMInfo, error) { - _, span := c.tracer.Start(ctx, "metald.firecracker.get_vm_info", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return nil, err - } - - info := 
&types.VMInfo{ //nolint:exhaustruct // NetworkInfo is populated conditionally below - Config: vm.Config, - State: vm.State, - } - - // Add network info if available - if vm.NetworkInfo != nil { - // Get port mappings for this VM - portMappings := c.networkManager.GetVMPorts(vmID) - - // Convert network.PortMapping to protobuf PortMapping - var protoPortMappings []*metaldv1.PortMapping - for _, mapping := range portMappings { - protoPortMappings = append(protoPortMappings, &metaldv1.PortMapping{ - ContainerPort: int32(mapping.ContainerPort), //nolint:gosec // ports are within valid range - HostPort: int32(mapping.HostPort), //nolint:gosec // ports are within valid range - Protocol: mapping.Protocol, - }) - } - - info.NetworkInfo = &metaldv1.VmNetworkInfo{ //nolint:exhaustruct // Optional fields are not needed for basic network info - IpAddress: vm.NetworkInfo.IPAddress.String(), - MacAddress: vm.NetworkInfo.MacAddress, - TapDevice: vm.NetworkInfo.TapDevice, - PortMappings: protoPortMappings, - } - } - - return info, nil -} - -// GetVMMetrics returns metrics for a VM -func (c *SDKClientV4) GetVMMetrics(ctx context.Context, vmID string) (*types.VMMetrics, error) { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.get_vm_metrics", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - vm, exists := c.vmRegistry[vmID] - if !exists { - err := fmt.Errorf("vm %s not found", vmID) - span.RecordError(err) - return nil, err - } - - if vm.Machine == nil { - return nil, fmt.Errorf("vm %s is not running", vmID) - } - - // Read real metrics from Firecracker stats FIFO - return c.readFirecrackerMetrics(ctx, vmID) -} - -// FirecrackerMetrics represents the JSON structure from Firecracker stats -type FirecrackerMetrics struct { - VCPU []struct { - ExitReasons map[string]int64 `json:"exit_reasons"` - } `json:"vcpu"` - Block []struct { - ReadBytes int64 `json:"read_bytes"` - WriteBytes int64 `json:"write_bytes"` - ReadCount int64 `json:"read_count"` - 
WriteCount int64 `json:"write_count"` - } `json:"block"` - Net []struct { - RxBytes int64 `json:"rx_bytes"` - TxBytes int64 `json:"tx_bytes"` - RxPackets int64 `json:"rx_packets"` - TxPackets int64 `json:"tx_packets"` - } `json:"net"` - // Note: CPU time and memory usage may be in other fields or require calculation -} - -// readFirecrackerMetrics reads metrics from the Firecracker stats FIFO -func (c *SDKClientV4) readFirecrackerMetrics(ctx context.Context, vmID string) (*types.VMMetrics, error) { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.read_metrics", - trace.WithAttributes(attribute.String("vm_id", vmID)), - ) - defer span.End() - - // Construct FIFO path - fifoPath := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID, "root", "metrics.fifo") - - // Try to read from FIFO (with timeout for blocking read) - file, err := os.OpenFile(fifoPath, os.O_RDONLY, 0) - if err != nil { - // If FIFO doesn't exist or can't be opened, return zeros (VM might be starting) - c.logger.WarnContext(ctx, "cannot read metrics FIFO", - slog.String("vm_id", vmID), - slog.String("fifo_path", fifoPath), - slog.String("error", err.Error()), - ) - return &types.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 0, - MemoryUsageBytes: 0, - DiskReadBytes: 0, - DiskWriteBytes: 0, - NetworkRxBytes: 0, - NetworkTxBytes: 0, - }, nil - } - defer file.Close() - - // AIDEV-NOTE: Firecracker writes a continuous JSON stream to the FIFO - // We need to use a JSON decoder to handle streaming JSON objects properly - type result struct { - metrics *FirecrackerMetrics - err error - } - resultCh := make(chan result, 1) - - go func() { - decoder := json.NewDecoder(file) - var fcMetrics FirecrackerMetrics - - // AIDEV-NOTE: Firecracker writes periodic JSON objects to the FIFO - // We might start reading in the middle of a JSON object, so we need to - // keep trying until we get a complete, valid JSON object - maxAttempts := 5 - for attempt := 0; attempt < maxAttempts; attempt++ { - 
if err := decoder.Decode(&fcMetrics); err != nil { - // If we get a JSON syntax error, it might be because we started - // reading in the middle of an object. Try again. - if attempt < maxAttempts-1 { - continue - } - resultCh <- result{metrics: nil, err: err} - return - } - - // Successfully decoded a complete JSON object - resultCh <- result{metrics: &fcMetrics, err: nil} - return - } - }() - - var fcMetrics *FirecrackerMetrics - select { - case res := <-resultCh: - if res.err != nil { - c.logger.WarnContext(ctx, "failed to read JSON from metrics FIFO", - slog.String("vm_id", vmID), - slog.String("error", res.err.Error()), - ) - // Return zeros on read error - VM might still be starting up - return &types.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 0, - MemoryUsageBytes: 0, - DiskReadBytes: 0, - DiskWriteBytes: 0, - NetworkRxBytes: 0, - NetworkTxBytes: 0, - }, nil - } - fcMetrics = res.metrics - - case <-time.After(2 * time.Second): - // Timeout - no metrics available within timeout - c.logger.DebugContext(ctx, "timeout reading metrics FIFO", - slog.String("vm_id", vmID), - ) - return &types.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 0, - MemoryUsageBytes: 0, - DiskReadBytes: 0, - DiskWriteBytes: 0, - NetworkRxBytes: 0, - NetworkTxBytes: 0, - }, nil - } - - // Convert to our internal format - metrics := &types.VMMetrics{ - Timestamp: time.Now(), - CpuTimeNanos: 0, // TODO: Calculate from VCPU exit reasons or other fields - MemoryUsageBytes: 0, // TODO: Extract from memory metrics if available - DiskReadBytes: 0, - DiskWriteBytes: 0, - NetworkRxBytes: 0, - NetworkTxBytes: 0, - } - - // Aggregate disk metrics from all block devices - for _, block := range fcMetrics.Block { - metrics.DiskReadBytes += block.ReadBytes - metrics.DiskWriteBytes += block.WriteBytes - } - - // Aggregate network metrics from all network interfaces - for _, net := range fcMetrics.Net { - metrics.NetworkRxBytes += net.RxBytes - metrics.NetworkTxBytes += net.TxBytes - } - - 
c.logger.DebugContext(ctx, "read Firecracker metrics", - slog.String("vm_id", vmID), - slog.Int64("disk_read_bytes", metrics.DiskReadBytes), - slog.Int64("disk_write_bytes", metrics.DiskWriteBytes), - slog.Int64("network_rx_bytes", metrics.NetworkRxBytes), - slog.Int64("network_tx_bytes", metrics.NetworkTxBytes), - ) - - return metrics, nil -} - -func (c *SDKClientV4) Ping(ctx context.Context) error { - c.logger.DebugContext(ctx, "pinging firecracker SDK v4 backend") - return nil -} - -func (c *SDKClientV4) Shutdown(ctx context.Context) error { - ctx, span := c.tracer.Start(ctx, "metald.firecracker.shutdown") - defer span.End() - - c.logger.InfoContext(ctx, "shutting down SDK v4 backend") - - // Shutdown all running VMs - for vmID, vm := range c.vmRegistry { - c.logger.InfoContext(ctx, "shutting down VM during backend shutdown", - "vm_id", vmID, - ) - if vm.Machine != nil { - if err := vm.Machine.StopVMM(); err != nil { - c.logger.ErrorContext(ctx, "failed to stop VM during shutdown", - "vm_id", vmID, - "error", err, - ) - } - if vm.CancelFunc != nil { - vm.CancelFunc() - } - } - } - - c.logger.InfoContext(ctx, "SDK v4 backend shutdown complete") - return nil -} - -// Ensure SDKClientV4 implements Backend interface -var _ types.Backend = (*SDKClientV4)(nil) - -// generateV4VMID generates a unique VM ID for V4 client -func generateV4VMID() (string, error) { - // Generate a random ID - bytes := make([]byte, 8) - if _, err := rand.Read(bytes); err != nil { - return "", fmt.Errorf("failed to generate random ID: %w", err) - } - return fmt.Sprintf("ud-%s", hex.EncodeToString(bytes)), nil -} - -// Helper function to copy files with ownership -func copyFileWithOwnership(src, dst string, uid, gid int) error { - // Use cp command to handle large files efficiently - cmd := exec.Command("cp", "-f", src, dst) - if output, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("cp command failed: %w, output: %s", err, output) - } - - // Set permissions - if err := 
os.Chmod(dst, 0644); err != nil { - return fmt.Errorf("failed to set permissions on %s: %w", dst, err) - } - - // Set ownership - if err := os.Chown(dst, uid, gid); err != nil { - // Log but don't fail - might work anyway - return nil - } - - return nil -} - -// AIDEV-NOTE: This implementation integrates jailer functionality directly into metald -// Key advantages: -// 1. Network setup happens BEFORE dropping privileges -// 2. Tap devices are created with full capabilities -// 3. We maintain security isolation via chroot and privilege dropping -// 4. No external jailer binary needed - everything is integrated - -// loadContainerMetadata loads container metadata from the metadata file if it exists -func (c *SDKClientV4) loadContainerMetadata(ctx context.Context, rootfsPath string) (*builderv1.ImageMetadata, error) { - // AIDEV-NOTE: Load container metadata saved by builderd - // The metadata file is named {buildID}.metadata.json and should be alongside the rootfs - - // Extract base name without extension - baseName := strings.TrimSuffix(filepath.Base(rootfsPath), filepath.Ext(rootfsPath)) - metadataPath := filepath.Join(filepath.Dir(rootfsPath), baseName+".metadata.json") - - c.logger.LogAttrs(ctx, slog.LevelInfo, "AIDEV-DEBUG: looking for container metadata", - slog.String("rootfs_path", rootfsPath), - slog.String("metadata_path", metadataPath), - ) - - // Check if metadata file exists - if _, err := os.Stat(metadataPath); os.IsNotExist(err) { - // AIDEV-NOTE: Fallback to check for metadata.json in VM chroot directory - // When assets are copied to VM chroot by assetmanagerd, metadata file is renamed to metadata.json - fallbackPath := filepath.Join(filepath.Dir(rootfsPath), "metadata.json") - if _, err := os.Stat(fallbackPath); os.IsNotExist(err) { - c.logger.LogAttrs(ctx, slog.LevelDebug, "no metadata file found in either location", - slog.String("primary_path", metadataPath), - slog.String("fallback_path", fallbackPath), - ) - return nil, nil // No metadata is 
not an error - } - // Use fallback path - metadataPath = fallbackPath - c.logger.LogAttrs(ctx, slog.LevelInfo, "AIDEV-DEBUG: using fallback metadata path", - slog.String("fallback_path", fallbackPath), - ) - } - - // Read metadata file - data, err := os.ReadFile(metadataPath) - if err != nil { - return nil, fmt.Errorf("failed to read metadata file: %w", err) - } - - // Parse metadata - var metadata builderv1.ImageMetadata - if err := json.Unmarshal(data, &metadata); err != nil { - return nil, fmt.Errorf("failed to parse metadata: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "loaded container metadata", - slog.String("image", metadata.OriginalImage), - slog.Int("entrypoint_len", len(metadata.Entrypoint)), - slog.Int("cmd_len", len(metadata.Command)), - slog.Int("env_vars", len(metadata.Env)), - slog.Int("exposed_ports", len(metadata.ExposedPorts)), - ) - - return &metadata, nil -} - -// buildKernelArgsWithMetadata builds kernel arguments incorporating container metadata -func (c *SDKClientV4) buildKernelArgsWithMetadata(ctx context.Context, baseArgs string, metadata *builderv1.ImageMetadata) string { - // AIDEV-NOTE: Build kernel args that will execute the container's entrypoint/cmd - - // Parse existing kernel args to preserve important ones - var kernelParams []string - var hasInit bool - - if baseArgs != "" { - // Split base args and check for existing init - parts := strings.Fields(baseArgs) - for _, part := range parts { - if strings.HasPrefix(part, "init=") { - hasInit = true - } - // Keep important kernel parameters - if strings.HasPrefix(part, "console=") || - strings.HasPrefix(part, "reboot=") || - strings.HasPrefix(part, "panic=") || - strings.HasPrefix(part, "pci=") || - strings.HasPrefix(part, "i8042.") { - kernelParams = append(kernelParams, part) - } - } - } - - // Add default kernel params if not present - if len(kernelParams) == 0 { - kernelParams = []string{ - "console=ttyS0,115200", - "reboot=k", - "panic=1", - "pci=off", - 
"i8042.noaux", - "i8042.nomux", - "i8042.nopnp", - "i8042.dumbkbd", - "root=/dev/vda", - "rw", - } - } - - // AIDEV-NOTE: Always add verbose logging for debugging - // Check if we already have these parameters to avoid duplicates - hasEarlyPrintk := false - hasLogLevel := false - for _, param := range kernelParams { - if strings.HasPrefix(param, "earlyprintk=") { - hasEarlyPrintk = true - } - if strings.HasPrefix(param, "loglevel=") { - hasLogLevel = true - } - } - if !hasEarlyPrintk { - kernelParams = append(kernelParams, "earlyprintk=serial,ttyS0,115200") - } - if !hasLogLevel { - kernelParams = append(kernelParams, "loglevel=8") - } - - // AIDEV-NOTE: Add aggressive debugging parameters - kernelParams = append(kernelParams, "debug") - kernelParams = append(kernelParams, "ignore_loglevel") - kernelParams = append(kernelParams, "printk.devkmsg=on") - - // If we have metadata and no init specified, use metald-init - if metadata != nil && !hasInit { - // Add environment variables as kernel parameters - // Format: env.KEY=VALUE - for key, value := range metadata.Env { - // Skip potentially problematic env vars - if key == "PATH" || strings.Contains(key, " ") || strings.Contains(value, " ") { - continue - } - kernelParams = append(kernelParams, fmt.Sprintf("env.%s=%s", key, value)) - } - - // Add working directory if specified - if metadata.WorkingDir != "" { - kernelParams = append(kernelParams, fmt.Sprintf("workdir=%s", metadata.WorkingDir)) - } - - // Use metald-init as the init process wrapper - kernelParams = append(kernelParams, "init=/usr/bin/metald-init") - - // Build the final kernel args string - args := strings.Join(kernelParams, " ") - - // Don't pass command on kernel command line - metald-init will read from /container.cmd - // This avoids all the kernel command line parsing issues with spaces and special characters - c.logger.LogAttrs(ctx, slog.LevelInfo, "built kernel args with container metadata", - slog.String("init", "/usr/bin/metald-init"), - 
slog.String("final_args", args), - ) - - return args - } - - // No metadata or init already specified, return base args - return baseArgs -} - -// parseExposedPorts parses exposed ports from container metadata and allocates host ports -func (c *SDKClientV4) parseExposedPorts(ctx context.Context, vmID string, metadata *builderv1.ImageMetadata) ([]network.PortMapping, error) { - // AIDEV-NOTE: Parse exposed ports and allocate host ports using network manager - if metadata == nil || len(metadata.ExposedPorts) == 0 { - return nil, nil - } - - // Use network manager to allocate ports - mappings, err := c.networkManager.AllocatePortsForVM(vmID, metadata.ExposedPorts) - if err != nil { - c.logger.ErrorContext(ctx, "failed to allocate ports for VM", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return nil, fmt.Errorf("failed to allocate ports for VM %s: %w", vmID, err) - } - - c.logger.InfoContext(ctx, "allocated ports for VM", - slog.String("vm_id", vmID), - slog.Int("port_count", len(mappings)), - ) - - return mappings, nil -} - -// configurePortForwarding sets up iptables rules for port forwarding -func (c *SDKClientV4) configurePortForwarding(ctx context.Context, vmID string, vmIP string, mappings []network.PortMapping) error { - // AIDEV-NOTE: Configure iptables rules for port forwarding - - if len(mappings) == 0 { - return nil - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "configuring port forwarding", - slog.String("vm_id", vmID), - slog.String("vm_ip", vmIP), - slog.Int("port_count", len(mappings)), - ) - - for _, mapping := range mappings { - // Add DNAT rule to forward host port to VM port - // iptables -t nat -A PREROUTING -p tcp --dport HOST_PORT -j DNAT --to-destination VM_IP:CONTAINER_PORT - dnatCmd := exec.Command("iptables", - "-t", "nat", - "-A", "PREROUTING", - "-p", mapping.Protocol, - "--dport", fmt.Sprintf("%d", mapping.HostPort), - "-j", "DNAT", - "--to-destination", fmt.Sprintf("%s:%d", vmIP, mapping.ContainerPort), - ) - 
- if output, err := dnatCmd.CombinedOutput(); err != nil { - c.logger.ErrorContext(ctx, "failed to add DNAT rule", - slog.String("error", err.Error()), - slog.String("output", string(output)), - slog.Int("host_port", mapping.HostPort), - slog.Int("container_port", mapping.ContainerPort), - ) - return fmt.Errorf("failed to add DNAT rule: %w", err) - } - - // Add FORWARD rule to allow traffic - // iptables -A FORWARD -p tcp -d VM_IP --dport CONTAINER_PORT -j ACCEPT - forwardCmd := exec.Command("iptables", - "-A", "FORWARD", - "-p", mapping.Protocol, - "-d", vmIP, - "--dport", fmt.Sprintf("%d", mapping.ContainerPort), - "-j", "ACCEPT", - ) - - if output, err := forwardCmd.CombinedOutput(); err != nil { - c.logger.ErrorContext(ctx, "failed to add FORWARD rule", - slog.String("error", err.Error()), - slog.String("output", string(output)), - slog.Int("container_port", mapping.ContainerPort), - ) - return fmt.Errorf("failed to add FORWARD rule: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "configured port forwarding", - slog.Int("host_port", mapping.HostPort), - slog.Int("container_port", mapping.ContainerPort), - slog.String("protocol", mapping.Protocol), - slog.String("vm_ip", vmIP), - ) - } - - return nil -} - -// removePortForwarding removes iptables rules for a VM -func (c *SDKClientV4) removePortForwarding(ctx context.Context, vmID string, vmIP string, mappings []network.PortMapping) error { - // AIDEV-NOTE: Remove iptables rules when VM is deleted - - for _, mapping := range mappings { - // Remove DNAT rule - dnatCmd := exec.Command("iptables", - "-t", "nat", - "-D", "PREROUTING", - "-p", mapping.Protocol, - "--dport", fmt.Sprintf("%d", mapping.HostPort), - "-j", "DNAT", - "--to-destination", fmt.Sprintf("%s:%d", vmIP, mapping.ContainerPort), - ) - - if output, err := dnatCmd.CombinedOutput(); err != nil { - // Log but don't fail - rule might already be gone - c.logger.WarnContext(ctx, "failed to remove DNAT rule", - "error", err.Error(), - "output", 
string(output), - ) - } - - // Remove FORWARD rule - forwardCmd := exec.Command("iptables", - "-D", "FORWARD", - "-p", mapping.Protocol, - "-d", vmIP, - "--dport", fmt.Sprintf("%d", mapping.ContainerPort), - "-j", "ACCEPT", - ) - - if output, err := forwardCmd.CombinedOutput(); err != nil { - c.logger.WarnContext(ctx, "failed to remove FORWARD rule", - "error", err.Error(), - "output", string(output), - ) - } - } - - return nil -} - -// copyMetadataFilesForAssets copies metadata files alongside rootfs assets when using asset manager -func (c *SDKClientV4) copyMetadataFilesForAssets(ctx context.Context, vmID string, config *metaldv1.VmConfig, preparedPaths map[string]string, jailerRoot string) error { - // AIDEV-NOTE: When using asset manager, only rootfs files are copied, but we need metadata files too - // This function finds the original metadata files and copies them to the jailer root - - for _, disk := range config.GetStorage() { - if !disk.GetIsRootDevice() || disk.GetPath() == "" { - continue - } - - // Find the original rootfs path before asset preparation - originalRootfsPath := disk.GetPath() - - // Check if this disk was replaced by an asset - var preparedRootfsPath string - for _, path := range preparedPaths { - if strings.HasSuffix(path, ".ext4") || strings.HasSuffix(path, ".img") { - preparedRootfsPath = path - break - } - } - - if preparedRootfsPath == "" { - // No rootfs asset found, skip metadata copying - continue - } - - // Look for metadata file alongside the original rootfs - originalDir := filepath.Dir(originalRootfsPath) - originalBaseName := strings.TrimSuffix(filepath.Base(originalRootfsPath), filepath.Ext(originalRootfsPath)) - metadataSrcPath := filepath.Join(originalDir, originalBaseName+".metadata.json") - - // Check if metadata file exists - if _, err := os.Stat(metadataSrcPath); os.IsNotExist(err) { - c.logger.LogAttrs(ctx, slog.LevelDebug, "no metadata file found for asset", - slog.String("vm_id", vmID), - 
slog.String("original_rootfs", originalRootfsPath), - slog.String("expected_metadata", metadataSrcPath), - ) - continue - } - - // Copy metadata file to jailer root with the same base name as the prepared rootfs - preparedBaseName := strings.TrimSuffix(filepath.Base(preparedRootfsPath), filepath.Ext(preparedRootfsPath)) - metadataDstPath := filepath.Join(jailerRoot, preparedBaseName+".metadata.json") - - if err := copyFileWithOwnership(metadataSrcPath, metadataDstPath, int(c.jailerConfig.UID), int(c.jailerConfig.GID)); err != nil { - c.logger.WarnContext(ctx, "failed to copy metadata file", - slog.String("vm_id", vmID), - slog.String("src", metadataSrcPath), - slog.String("dst", metadataDstPath), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to copy metadata file %s: %w", metadataSrcPath, err) - } - - c.logger.InfoContext(ctx, "copied metadata file for asset", - slog.String("vm_id", vmID), - slog.String("src", metadataSrcPath), - slog.String("dst", metadataDstPath), - ) - } - - return nil -} - -// createContainerCmdFile creates /container.cmd file in VM chroot for metald-init -func (c *SDKClientV4) createContainerCmdFile(ctx context.Context, vmID string, metadata *builderv1.ImageMetadata) error { - // AIDEV-NOTE: Create container.cmd file containing the full command for metald-init - // Combines entrypoint and command from container metadata into JSON array - - if metadata == nil { - return fmt.Errorf("metadata is required") - } - - // Build full command array: entrypoint + command - var fullCmd []string - fullCmd = append(fullCmd, metadata.Entrypoint...) - fullCmd = append(fullCmd, metadata.Command...) 
- - if len(fullCmd) == 0 { - return fmt.Errorf("no entrypoint or command found in metadata") - } - - // Convert to JSON - cmdJSON, err := json.Marshal(fullCmd) - if err != nil { - return fmt.Errorf("failed to marshal command to JSON: %w", err) - } - - // AIDEV-NOTE: Write container.cmd into the rootfs.ext4 filesystem, not just chroot - // Mount the rootfs.ext4 temporarily to inject the container.cmd file - jailerRoot := filepath.Join(c.jailerConfig.ChrootBaseDir, "firecracker", vmID, "root") - rootfsPath := filepath.Join(jailerRoot, "rootfs.ext4") - - // Create temporary mount point - tmpMount := filepath.Join("/tmp", "rootfs-mount-"+vmID) - if err := os.MkdirAll(tmpMount, 0755); err != nil { - return fmt.Errorf("failed to create temp mount dir: %w", err) - } - defer os.RemoveAll(tmpMount) - - // Mount the rootfs.ext4 - mountCmd := exec.Command("mount", "-o", "loop", rootfsPath, tmpMount) - if err := mountCmd.Run(); err != nil { - return fmt.Errorf("failed to mount rootfs: %w", err) - } - defer func() { - umountCmd := exec.Command("umount", tmpMount) - umountCmd.Run() - }() - - // Write container.cmd into the mounted filesystem - containerCmdPath := filepath.Join(tmpMount, "container.cmd") - if err := os.WriteFile(containerCmdPath, cmdJSON, 0644); err != nil { - return fmt.Errorf("failed to write container.cmd to rootfs: %w", err) - } - - c.logger.LogAttrs(ctx, slog.LevelInfo, "created container.cmd file", - slog.String("vm_id", vmID), - slog.String("path", containerCmdPath), - slog.String("command", string(cmdJSON)), - ) - - return nil -} diff --git a/go/deploy/metald/internal/backend/firecracker/sdk_client_v4_test.go b/go/deploy/metald/internal/backend/firecracker/sdk_client_v4_test.go deleted file mode 100644 index f5dbaf7ae3..0000000000 --- a/go/deploy/metald/internal/backend/firecracker/sdk_client_v4_test.go +++ /dev/null @@ -1,156 +0,0 @@ -package firecracker - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - 
"github.com/stretchr/testify/mock" - assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// MockAssetClient is a mock implementation of the assetmanager.Client interface -type MockAssetClient struct { - mock.Mock -} - -func (m *MockAssetClient) ListAssets(ctx context.Context, assetType assetv1.AssetType, labels map[string]string) ([]*assetv1.Asset, error) { - args := m.Called(ctx, assetType, labels) - if args.Get(0) == nil { - return nil, args.Error(1) - } - return args.Get(0).([]*assetv1.Asset), args.Error(1) -} - -func (m *MockAssetClient) PrepareAssets(ctx context.Context, assetIDs []string, targetPath string, vmID string) (map[string]string, error) { - args := m.Called(ctx, assetIDs, targetPath, vmID) - if args.Get(0) == nil { - return nil, args.Error(1) - } - return args.Get(0).(map[string]string), args.Error(1) -} - -func (m *MockAssetClient) AcquireAsset(ctx context.Context, assetID string, vmID string) (string, error) { - args := m.Called(ctx, assetID, vmID) - return args.String(0), args.Error(1) -} - -func (m *MockAssetClient) ReleaseAsset(ctx context.Context, leaseID string) error { - args := m.Called(ctx, leaseID) - return args.Error(0) -} - -func TestBuildAssetRequirements(t *testing.T) { - client := &SDKClientV4{} - - tests := []struct { - name string - config *metaldv1.VmConfig - expected int - }{ - { - name: "basic VM with kernel and rootfs", - config: &metaldv1.VmConfig{ - Boot: &metaldv1.BootConfig{ - KernelPath: "/path/to/kernel", - }, - Storage: []*metaldv1.StorageDevice{ - { - IsRootDevice: true, - Options: map[string]string{ - "docker_image": "ghcr.io/unkeyed/unkey:latest", - }, - }, - }, - }, - expected: 2, // kernel + rootfs - }, - { - name: "VM with docker image in metadata", - config: &metaldv1.VmConfig{ - Boot: &metaldv1.BootConfig{ - KernelPath: "/path/to/kernel", - }, - Storage: []*metaldv1.StorageDevice{ - { - IsRootDevice: true, - }, - 
}, - Metadata: map[string]string{ - "docker_image": "nginx:alpine", - }, - }, - expected: 2, // kernel + rootfs - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - reqs := client.buildAssetRequirements(tt.config) - assert.Equal(t, tt.expected, len(reqs)) - }) - } -} - -func TestMatchAssets(t *testing.T) { - client := &SDKClientV4{} - - // Test successful matching - reqs := []assetRequirement{ - { - Type: assetv1.AssetType_ASSET_TYPE_KERNEL, - Required: true, - }, - { - Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, - Labels: map[string]string{ - "docker_image": "ghcr.io/unkeyed/unkey:latest", - }, - Required: true, - }, - } - - availableAssets := []*assetv1.Asset{ - { - Id: "kernel-123", - Type: assetv1.AssetType_ASSET_TYPE_KERNEL, - }, - { - Id: "rootfs-456", - Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, - Labels: map[string]string{ - "docker_image": "ghcr.io/unkeyed/unkey:latest", - }, - }, - } - - mapping, err := client.matchAssets(reqs, availableAssets) - assert.NoError(t, err) - assert.NotNil(t, mapping) - assert.Equal(t, 2, len(mapping.AssetIDs())) - assert.Contains(t, mapping.AssetIDs(), "kernel-123") - assert.Contains(t, mapping.AssetIDs(), "rootfs-456") - - // Test missing required asset - reqsMissing := []assetRequirement{ - { - Type: assetv1.AssetType_ASSET_TYPE_ROOTFS, - Labels: map[string]string{ - "docker_image": "nonexistent:latest", - }, - Required: true, - }, - } - - _, err = client.matchAssets(reqsMissing, availableAssets) - assert.Error(t, err) - assert.Contains(t, err.Error(), "no matching asset found") -} - -// AIDEV-NOTE: These are basic unit tests for the asset integration. -// More comprehensive integration tests would require: -// 1. A running assetmanagerd instance or more sophisticated mocking -// 2. Tests for the full VM creation flow with asset preparation -// 3. Tests for lease acquisition and release -// 4. 
Tests for error handling and rollback scenarios diff --git a/go/deploy/metald/internal/backend/firecracker/shutdown.go b/go/deploy/metald/internal/backend/firecracker/shutdown.go new file mode 100644 index 0000000000..b14f77dbb5 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/shutdown.go @@ -0,0 +1,8 @@ +package firecracker + +import "context" + +// ShutdownVM gracefully shuts down a VM +func (c *Client) ShutdownVM(ctx context.Context, vmID string) error { + return c.ShutdownVMWithOptions(ctx, vmID, false, 30) +} diff --git a/go/deploy/metald/internal/backend/firecracker/types.go b/go/deploy/metald/internal/backend/firecracker/types.go index 19b8f74f5a..38f02d881d 100644 --- a/go/deploy/metald/internal/backend/firecracker/types.go +++ b/go/deploy/metald/internal/backend/firecracker/types.go @@ -1,4 +1,75 @@ package firecracker -// This file contains types shared across the firecracker backend -// Currently no shared types are needed +import ( + "context" + "log/slog" + + sdk "github.com/firecracker-microvm/firecracker-go-sdk" + "github.com/unkeyed/unkey/go/deploy/metald/internal/assetmanager" + "github.com/unkeyed/unkey/go/deploy/metald/internal/config" + "github.com/unkeyed/unkey/go/deploy/metald/internal/jailer" + "github.com/unkeyed/unkey/go/deploy/metald/internal/network" + assetv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/assetmanagerd/v1" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Client implements the Backend interface using firecracker-go-sdk +// with integrated jailer functionality for secure VM isolation. 
+type Client struct { + logger *slog.Logger + assetClient assetmanager.Client + vmRegistry map[string]*VM + vmAssetLeases map[string][]string // VM ID -> asset lease IDs + jailer *jailer.Jailer + jailerConfig *config.JailerConfig + baseDir string + tracer trace.Tracer + meter metric.Meter + vmCreateCounter metric.Int64Counter + vmDeleteCounter metric.Int64Counter + vmBootCounter metric.Int64Counter + vmErrorCounter metric.Int64Counter +} + +// VM represents a VM managed by the Firecracker backend +type VM struct { + ID string + Config *metaldv1.VmConfig + State metaldv1.VmState + Machine *sdk.Machine + NetworkInfo *network.VMNetwork + CancelFunc context.CancelFunc + AssetMapping *assetMapping // Asset mapping for lease acquisition + AssetPaths map[string]string // Prepared asset paths +} + +// assetRequirement represents a required asset for VM creation +type assetRequirement struct { + Type assetv1.AssetType + Labels map[string]string + Required bool +} + +// assetMapping tracks the mapping between requirements and actual assets +type assetMapping struct { + requirements []assetRequirement + assets map[string]*assetv1.Asset // requirement index -> asset + assetIDs []string + leaseIDs []string +} + +func (am *assetMapping) AssetIDs() []string { + return am.assetIDs +} + +func (am *assetMapping) LeaseIDs() []string { + return am.leaseIDs +} + +// queryKey is used for grouping asset requirements by type and labels +type queryKey struct { + assetType assetv1.AssetType + labels string // Serialized labels for grouping +} diff --git a/go/deploy/metald/internal/backend/firecracker/validation.go b/go/deploy/metald/internal/backend/firecracker/validation.go new file mode 100644 index 0000000000..f912ccbf88 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/validation.go @@ -0,0 +1,132 @@ +package firecracker + +import ( + "fmt" + "net" + "regexp" +) + +// validatePath validates a file path to ensure it's safe to use +func validatePath(path string) error { + 
if path == "" { + return fmt.Errorf("path cannot be empty") + } + + // Check for path traversal attempts + if containsPathTraversal(path) { + return fmt.Errorf("path contains directory traversal: %s", path) + } + + return nil +} + +// containsPathTraversal checks if a path contains directory traversal patterns +func containsPathTraversal(path string) bool { + patterns := []string{ + "..", + "..\\", + "../", + "..\\", + } + + for _, pattern := range patterns { + if regexp.MustCompile(pattern).MatchString(path) { + return true + } + } + return false +} + +// validateMAC validates a MAC address format +func validateMAC(mac string) error { + if mac == "" { + return fmt.Errorf("MAC address cannot be empty") + } + + // Standard MAC address format: XX:XX:XX:XX:XX:XX + macRegex := regexp.MustCompile(`^([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}$`) + if !macRegex.MatchString(mac) { + return fmt.Errorf("invalid MAC address format: %s", mac) + } + + return nil +} + +// validateCIDR validates a CIDR notation +func validateCIDR(cidr string) error { + _, _, err := net.ParseCIDR(cidr) + if err != nil { + return fmt.Errorf("invalid CIDR notation: %s", cidr) + } + return nil +} + +// validateMemorySize validates memory size in bytes +func validateMemorySize(sizeBytes int64) error { + const ( + minMemory = 128 * 1024 * 1024 // 128 MB minimum + maxMemory = 512 * 1024 * 1024 * 1024 // 512 GB maximum + ) + + if sizeBytes < minMemory { + return fmt.Errorf("memory size %d bytes is below minimum of %d bytes (128MB)", sizeBytes, minMemory) + } + + if sizeBytes > maxMemory { + return fmt.Errorf("memory size %d bytes exceeds maximum of %d bytes (512GB)", sizeBytes, maxMemory) + } + + // Check if memory is a multiple of 1MB (Firecracker requirement) + if sizeBytes%(1024*1024) != 0 { + return fmt.Errorf("memory size must be a multiple of 1MB") + } + + return nil +} + +// validateVCPUCount validates the number of vCPUs +func validateVCPUCount(count int32) error { + if count < 1 { + return 
fmt.Errorf("vCPU count must be at least 1") + } + + if count > 32 { + return fmt.Errorf("vCPU count %d exceeds maximum of 32", count) + } + + // Firecracker works best with power-of-2 vCPU counts + if !isPowerOfTwo(int(count)) && count != 1 { + // This is a warning, not an error + // Log it but don't fail + } + + return nil +} + +// isPowerOfTwo checks if a number is a power of two +func isPowerOfTwo(n int) bool { + return n > 0 && (n&(n-1)) == 0 +} + +// validateDiskPath validates a disk image path +func validateDiskPath(path string) error { + if err := validatePath(path); err != nil { + return err + } + + // Check for supported disk image extensions + validExtensions := []string{".ext4", ".img", ".raw", ".qcow2"} + hasValidExt := false + for _, ext := range validExtensions { + if regexp.MustCompile(ext + "$").MatchString(path) { + hasValidExt = true + break + } + } + + if !hasValidExt { + return fmt.Errorf("disk path %s does not have a supported extension (.ext4, .img, .raw, .qcow2)", path) + } + + return nil +} diff --git a/go/deploy/metald/internal/backend/firecracker/vm_operations.go b/go/deploy/metald/internal/backend/firecracker/vm_operations.go new file mode 100644 index 0000000000..f686e82d15 --- /dev/null +++ b/go/deploy/metald/internal/backend/firecracker/vm_operations.go @@ -0,0 +1,224 @@ +package firecracker + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" +) + +// ShutdownVMWithOptions shuts down a VM with configurable options +func (c *Client) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeoutSeconds int32) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.shutdown_vm", + trace.WithAttributes( + attribute.String("vm_id", vmID), + attribute.Bool("force", force), + attribute.Int("timeout_seconds", 
int(timeoutSeconds)), + ), + ) + defer span.End() + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + return err + } + + // Validate VM state before shutdown operation + if vm.State != metaldv1.VmState_VM_STATE_RUNNING { + err := fmt.Errorf("vm %s is in %s state, can only shutdown VMs in RUNNING state", vmID, vm.State.String()) + span.RecordError(err) + return err + } + + if vm.Machine == nil { + return fmt.Errorf("vm %s firecracker process not available", vmID) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "shutting down VM", + slog.String("vm_id", vmID), + slog.String("current_state", vm.State.String()), + slog.Bool("force", force), + slog.Int("timeout_seconds", int(timeoutSeconds)), + ) + + // Create a timeout context + shutdownCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSeconds)*time.Second) + defer cancel() + + if force { + // Force shutdown by pausing the VM to preserve the socket for resume + // Note: Using PauseVM instead of StopVMM to allow resume operations + if err := vm.Machine.PauseVM(shutdownCtx); err != nil { + span.RecordError(err) + return fmt.Errorf("failed to force shutdown VM: %w", err) + } + } else { + // Try graceful shutdown first by pausing the VM + // Note: Using PauseVM instead of Shutdown to preserve the firecracker process and socket + if err := vm.Machine.PauseVM(shutdownCtx); err != nil { + c.logger.WarnContext(ctx, "graceful shutdown failed", + "vm_id", vmID, + "error", err, + ) + span.RecordError(err) + return fmt.Errorf("failed to shutdown VM: %w", err) + } + } + + // Note: The firecracker process remains running to allow resume operations + + // Update state + vm.State = metaldv1.VmState_VM_STATE_SHUTDOWN + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM shutdown successfully", + slog.String("vm_id", vmID), + ) + + return nil +} + +// PauseVM pauses a running VM +func (c *Client) PauseVM(ctx context.Context, vmID string) error { + ctx, span := 
c.tracer.Start(ctx, "metald.firecracker.pause_vm", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + return err + } + + // Validate VM state before pause operation + if vm.State != metaldv1.VmState_VM_STATE_RUNNING { + err := fmt.Errorf("vm %s is in %s state, can only pause VMs in RUNNING state", vmID, vm.State.String()) + span.RecordError(err) + return err + } + + if vm.Machine == nil { + return fmt.Errorf("vm %s firecracker process not available", vmID) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "pausing VM", + slog.String("vm_id", vmID), + slog.String("current_state", vm.State.String()), + ) + + if err := vm.Machine.PauseVM(ctx); err != nil { + span.RecordError(err) + return fmt.Errorf("failed to pause VM: %w", err) + } + + vm.State = metaldv1.VmState_VM_STATE_PAUSED + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM paused successfully", + slog.String("vm_id", vmID), + ) + + return nil +} + +// ResumeVM resumes a paused or shutdown VM +func (c *Client) ResumeVM(ctx context.Context, vmID string) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.resume_vm", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + return err + } + + // Validate VM state before resume operation - allow both PAUSED and SHUTDOWN + if vm.State != metaldv1.VmState_VM_STATE_PAUSED && vm.State != metaldv1.VmState_VM_STATE_SHUTDOWN { + err := fmt.Errorf("vm %s is in %s state, can only resume VMs in PAUSED or SHUTDOWN state", vmID, vm.State.String()) + span.RecordError(err) + return err + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "resuming VM", + slog.String("vm_id", vmID), + slog.String("current_state", vm.State.String()), + ) + + if err := vm.Machine.ResumeVM(ctx); err != nil { 
+ span.RecordError(err) + return fmt.Errorf("failed to resume VM: %w", err) + } + + vm.State = metaldv1.VmState_VM_STATE_RUNNING + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM resumed successfully", + slog.String("vm_id", vmID), + ) + + return nil +} + +// RebootVM reboots a running VM +func (c *Client) RebootVM(ctx context.Context, vmID string) error { + ctx, span := c.tracer.Start(ctx, "metald.firecracker.reboot_vm", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + c.logger.LogAttrs(ctx, slog.LevelInfo, "rebooting VM", + slog.String("vm_id", vmID), + ) + + // Shutdown the VM + if err := c.ShutdownVMWithOptions(ctx, vmID, false, 30); err != nil { + span.RecordError(err) + return fmt.Errorf("failed to shutdown VM for reboot: %w", err) + } + + // Wait a moment + time.Sleep(1 * time.Second) + + // Resume the VM (since we paused it in shutdown) + if err := c.ResumeVM(ctx, vmID); err != nil { + span.RecordError(err) + return fmt.Errorf("failed to resume VM after shutdown: %w", err) + } + + c.logger.LogAttrs(ctx, slog.LevelInfo, "VM rebooted successfully", + slog.String("vm_id", vmID), + ) + + return nil +} + +// GetVMInfo returns information about a VM +func (c *Client) GetVMInfo(ctx context.Context, vmID string) (*types.VMInfo, error) { + _, span := c.tracer.Start(ctx, "metald.firecracker.get_vm_info", + trace.WithAttributes(attribute.String("vm_id", vmID)), + ) + defer span.End() + + vm, exists := c.vmRegistry[vmID] + if !exists { + err := fmt.Errorf("vm %s not found", vmID) + span.RecordError(err) + return nil, err + } + + info := &types.VMInfo{ + Config: vm.Config, + State: vm.State, + } + + return info, nil +} diff --git a/go/deploy/metald/internal/backend/types/backend.go b/go/deploy/metald/internal/backend/types/backend.go index 3b6c8675fb..555133f149 100644 --- a/go/deploy/metald/internal/backend/types/backend.go +++ b/go/deploy/metald/internal/backend/types/backend.go @@ -4,11 +4,10 @@ import ( "context" "time" - metaldv1 
"github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" ) // Backend defines the interface for hypervisor backends -// AIDEV-NOTE: This interface abstracts VM operations for all hypervisor types type Backend interface { // CreateVM creates a new VM instance with the given configuration CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) @@ -46,9 +45,8 @@ type Backend interface { // VMInfo contains VM state and configuration information type VMInfo struct { - Config *metaldv1.VmConfig - State metaldv1.VmState - NetworkInfo *metaldv1.VmNetworkInfo // Optional network information + Config *metaldv1.VmConfig + State metaldv1.VmState } // ListableVMInfo represents VM information for listing operations diff --git a/go/deploy/metald/internal/billing/client.go b/go/deploy/metald/internal/billing/client.go index 93857983e3..3239980caf 100644 --- a/go/deploy/metald/internal/billing/client.go +++ b/go/deploy/metald/internal/billing/client.go @@ -9,11 +9,11 @@ import ( "time" "connectrpc.com/connect" - billingv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/billaged/v1" - "github.com/unkeyed/unkey/go/gen/proto/deploy/billaged/v1/billagedv1connect" "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" "github.com/unkeyed/unkey/go/deploy/metald/internal/observability" "github.com/unkeyed/unkey/go/deploy/pkg/observability/interceptors" + billingv1 "github.com/unkeyed/unkey/go/gen/proto/deploy/billaged/v1" + "github.com/unkeyed/unkey/go/gen/proto/deploy/billaged/v1/billagedv1connect" "google.golang.org/protobuf/types/known/timestamppb" ) diff --git a/go/deploy/metald/internal/config/config.go b/go/deploy/metald/internal/config/config.go index b4d8c651e7..00669acdef 100644 --- a/go/deploy/metald/internal/config/config.go +++ b/go/deploy/metald/internal/config/config.go @@ -5,7 +5,6 @@ import ( "log/slog" "os" "strconv" - "strings" 
"github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" ) @@ -30,10 +29,7 @@ type Config struct { // AssetManager configuration AssetManager AssetManagerConfig - // Network configuration - Network NetworkConfig - - // TLS configuration (optional, defaults to disabled) + // TLS configuration TLS *TLSConfig } @@ -48,10 +44,10 @@ type ServerConfig struct { // BackendConfig holds backend-specific configuration type BackendConfig struct { - // Type of backend (firecracker only for now) + // Type of backend Type types.BackendType - // Jailer configuration (required for production) + // Jailer configuration Jailer JailerConfig } @@ -130,39 +126,6 @@ type AssetManagerConfig struct { CacheDir string } -// NetworkConfig holds network-related configuration -type NetworkConfig struct { - // Enabled indicates if networking is enabled - Enabled bool - - // IPv4 Configuration - EnableIPv4 bool - BridgeIPv4 string - VMSubnetIPv4 string - DNSServersIPv4 []string - - // IPv6 Configuration - EnableIPv6 bool - BridgeIPv6 string - VMSubnetIPv6 string - DNSServersIPv6 []string - IPv6Mode string // "dual-stack", "ipv6-only", "ipv4-only" - - // Common Configuration - BridgeName string - EnableRateLimit bool - RateLimitMbps int - - // Production Scalability Configuration - MaxVMsPerBridge int // Maximum VMs per bridge before creating new bridge - EnableMultiBridge bool // Enable multiple bridges for scalability - BridgePrefix string // Prefix for multiple bridges (e.g., "metald-br") - - // Host Protection Configuration - EnableHostProtection bool // Enable host network route protection - PrimaryInterface string // Primary host interface to protect (auto-detected if empty) -} - // TLSConfig holds TLS configuration type TLSConfig struct { // Mode can be "file" or "spiffe" (default: "spiffe") @@ -194,9 +157,6 @@ func LoadConfigWithSocketPath(socketPath string) (*Config, error) { // LoadConfigWithSocketPathAndLogger loads configuration with optional socket path override and custom 
logger func LoadConfigWithSocketPathAndLogger(socketPath string, logger *slog.Logger) (*Config, error) { - // AIDEV-NOTE: Socket endpoints are now managed by process manager - // No need for endpoint configuration - // Parse sampling rate samplingRate := 1.0 if samplingStr := os.Getenv("UNKEY_METALD_OTEL_SAMPLING_RATE"); samplingStr != "" { @@ -249,8 +209,6 @@ func LoadConfigWithSocketPathAndLogger(socketPath string, logger *slog.Logger) ( } } - // AIDEV-BUSINESS_RULE: Jailer is always required for production security - // Parse jailer UID/GID jailerUID := uint32(1000) if uidStr := os.Getenv("UNKEY_METALD_JAILER_UID"); uidStr != "" { @@ -276,10 +234,6 @@ func LoadConfigWithSocketPathAndLogger(socketPath string, logger *slog.Logger) ( } } - // AIDEV-NOTE: Namespace isolation is always enabled for security - - // AIDEV-NOTE: Resource limits are applied at container/VM level, not jailer level - // Parse billing configuration billingEnabled := true // Default to enabled if enabledStr := os.Getenv("UNKEY_METALD_BILLING_ENABLED"); enabledStr != "" { @@ -345,30 +299,6 @@ func LoadConfigWithSocketPathAndLogger(socketPath string, logger *slog.Logger) ( Endpoint: getEnvOrDefault("UNKEY_METALD_ASSETMANAGER_ENDPOINT", "http://localhost:8083"), CacheDir: getEnvOrDefault("UNKEY_METALD_ASSETMANAGER_CACHE_DIR", "/opt/metald/assets"), }, - Network: NetworkConfig{ - Enabled: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_ENABLED"), - EnableIPv4: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_IPV4_ENABLED"), - BridgeIPv4: getEnvOrDefault("UNKEY_METALD_NETWORK_BRIDGE_IPV4", "172.31.0.1/19"), - VMSubnetIPv4: getEnvOrDefault("UNKEY_METALD_NETWORK_VM_SUBNET_IPV4", "172.31.0.0/19"), - DNSServersIPv4: strings.Split(getEnvOrDefault("UNKEY_METALD_NETWORK_DNS_IPV4", "8.8.8.8,8.8.4.4"), ","), - EnableIPv6: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_IPV6_ENABLED"), - BridgeIPv6: getEnvOrDefault("UNKEY_METALD_NETWORK_BRIDGE_IPV6", "fd00::1/64"), - VMSubnetIPv6: 
getEnvOrDefault("UNKEY_METALD_NETWORK_VM_SUBNET_IPV6", "fd00::/64"), - DNSServersIPv6: strings.Split(getEnvOrDefault("UNKEY_METALD_NETWORK_DNS_IPV6", "2606:4700:4700::1111,2606:4700:4700::1001"), ","), - IPv6Mode: getEnvOrDefault("UNKEY_METALD_NETWORK_IPV6_MODE", "dual-stack"), - BridgeName: getEnvOrDefault("UNKEY_METALD_NETWORK_BRIDGE", "br-vms"), - EnableRateLimit: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_RATE_LIMIT"), - RateLimitMbps: getEnvIntOrDefault("UNKEY_METALD_NETWORK_RATE_LIMIT_MBPS", 1000), - - // Production Scalability Defaults - MaxVMsPerBridge: getEnvIntOrDefault("UNKEY_METALD_NETWORK_MAX_VMS_PER_BRIDGE", 1000), - EnableMultiBridge: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_MULTI_BRIDGE"), - BridgePrefix: getEnvOrDefault("UNKEY_METALD_NETWORK_BRIDGE_PREFIX", "metald-br"), - - // Host Protection Defaults - EnableHostProtection: getEnvBoolOrDefault("UNKEY_METALD_NETWORK_HOST_PROTECTION"), - PrimaryInterface: getEnvOrDefault("UNKEY_METALD_NETWORK_PRIMARY_INTERFACE", ""), - }, TLS: &TLSConfig{ // AIDEV-BUSINESS_RULE: mTLS/SPIFFE is required for production security Mode: getEnvOrDefault("UNKEY_METALD_TLS_MODE", "spiffe"), @@ -396,8 +326,6 @@ func (c *Config) Validate() error { return fmt.Errorf("only firecracker and docker backends are supported, got: %s", c.Backend.Type) } - // AIDEV-NOTE: Comprehensive unit tests implemented in config_test.go - // Tests cover: parsing, validation, edge cases, default values, and error conditions if c.OpenTelemetry.Enabled { if c.OpenTelemetry.TracingSamplingRate < 0.0 || c.OpenTelemetry.TracingSamplingRate > 1.0 { return fmt.Errorf("tracing sampling rate must be between 0.0 and 1.0, got %f", c.OpenTelemetry.TracingSamplingRate) diff --git a/go/deploy/metald/internal/config/config_test.go b/go/deploy/metald/internal/config/config_test.go deleted file mode 100644 index fa93df26a7..0000000000 --- a/go/deploy/metald/internal/config/config_test.go +++ /dev/null @@ -1,584 +0,0 @@ -package config - -import ( - "os" - 
"strings" - "testing" - - "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" -) - -func TestLoadConfig(t *testing.T) { - tests := []struct { - name string - envVars map[string]string - want *Config - wantErr bool - }{ - { - name: "default configuration", - envVars: map[string]string{}, - want: &Config{ - Server: ServerConfig{ - Port: "8080", - Address: "0.0.0.0", - }, - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - Jailer: JailerConfig{ - UID: 1000, - GID: 1000, - ChrootBaseDir: "/srv/jailer", - }, - }, - Billing: BillingConfig{ - Enabled: true, - Endpoint: "http://localhost:8081", - MockMode: false, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: false, - ServiceName: "metald", - ServiceVersion: "0.1.0", - TracingSamplingRate: 1.0, - OTLPEndpoint: "localhost:4318", - PrometheusEnabled: true, - PrometheusPort: "9464", - PrometheusInterface: "127.0.0.1", - HighCardinalityLabelsEnabled: false, - }, - Database: DatabaseConfig{ - DataDir: "/opt/metald/data", - }, - AssetManager: AssetManagerConfig{ - Enabled: true, - Endpoint: "http://localhost:8083", - CacheDir: "/opt/metald/assets", - }, - Network: NetworkConfig{ - Enabled: true, - EnableIPv4: true, - BridgeIPv4: "10.100.0.1/16", - VMSubnetIPv4: "10.100.0.0/16", - DNSServersIPv4: []string{"8.8.8.8", "8.8.4.4"}, - EnableIPv6: true, - BridgeIPv6: "fd00::1/64", - VMSubnetIPv6: "fd00::/64", - DNSServersIPv6: []string{"2606:4700:4700::1111", "2606:4700:4700::1001"}, - IPv6Mode: "dual-stack", - BridgeName: "br-vms", - EnableRateLimit: true, - RateLimitMbps: 1000, - }, - TLS: &TLSConfig{ - Mode: "spiffe", - CertFile: "", - KeyFile: "", - CAFile: "", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - EnableCertCaching: true, - CertCacheTTL: "5s", - }, - }, - wantErr: false, - }, - { - name: "custom server configuration", - envVars: map[string]string{ - "UNKEY_METALD_PORT": "9999", - "UNKEY_METALD_ADDRESS": "127.0.0.1", - }, - want: &Config{ - Server: ServerConfig{ - Port: "9999", 
- Address: "127.0.0.1", - }, - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - Jailer: JailerConfig{ - UID: 1000, - GID: 1000, - ChrootBaseDir: "/srv/jailer", - }, - }, - Billing: BillingConfig{ - Enabled: true, - Endpoint: "http://localhost:8081", - MockMode: false, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: false, - ServiceName: "metald", - ServiceVersion: "0.1.0", - TracingSamplingRate: 1.0, - OTLPEndpoint: "localhost:4318", - PrometheusEnabled: true, - PrometheusPort: "9464", - PrometheusInterface: "127.0.0.1", - HighCardinalityLabelsEnabled: false, - }, - Database: DatabaseConfig{ - DataDir: "/opt/metald/data", - }, - AssetManager: AssetManagerConfig{ - Enabled: true, - Endpoint: "http://localhost:8083", - CacheDir: "/opt/metald/assets", - }, - Network: NetworkConfig{ - Enabled: true, - EnableIPv4: true, - BridgeIPv4: "10.100.0.1/16", - VMSubnetIPv4: "10.100.0.0/16", - DNSServersIPv4: []string{"8.8.8.8", "8.8.4.4"}, - EnableIPv6: true, - BridgeIPv6: "fd00::1/64", - VMSubnetIPv6: "fd00::/64", - DNSServersIPv6: []string{"2606:4700:4700::1111", "2606:4700:4700::1001"}, - IPv6Mode: "dual-stack", - BridgeName: "br-vms", - EnableRateLimit: true, - RateLimitMbps: 1000, - }, - TLS: &TLSConfig{ - Mode: "spiffe", - CertFile: "", - KeyFile: "", - CAFile: "", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - EnableCertCaching: true, - CertCacheTTL: "5s", - }, - }, - wantErr: false, - }, - { - name: "custom jailer configuration", - envVars: map[string]string{ - "UNKEY_METALD_JAILER_UID": "2000", - "UNKEY_METALD_JAILER_GID": "2000", - "UNKEY_METALD_JAILER_CHROOT_DIR": "/var/lib/jailer", - }, - want: &Config{ - Server: ServerConfig{ - Port: "8080", - Address: "0.0.0.0", - }, - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - Jailer: JailerConfig{ - UID: 2000, - GID: 2000, - ChrootBaseDir: "/var/lib/jailer", - }, - }, - Billing: BillingConfig{ - Enabled: true, - Endpoint: "http://localhost:8081", - MockMode: false, - }, - 
OpenTelemetry: OpenTelemetryConfig{ - Enabled: false, - ServiceName: "metald", - ServiceVersion: "0.1.0", - TracingSamplingRate: 1.0, - OTLPEndpoint: "localhost:4318", - PrometheusEnabled: true, - PrometheusPort: "9464", - PrometheusInterface: "127.0.0.1", - HighCardinalityLabelsEnabled: false, - }, - Database: DatabaseConfig{ - DataDir: "/opt/metald/data", - }, - AssetManager: AssetManagerConfig{ - Enabled: true, - Endpoint: "http://localhost:8083", - CacheDir: "/opt/metald/assets", - }, - Network: NetworkConfig{ - Enabled: true, - EnableIPv4: true, - BridgeIPv4: "10.100.0.1/16", - VMSubnetIPv4: "10.100.0.0/16", - DNSServersIPv4: []string{"8.8.8.8", "8.8.4.4"}, - EnableIPv6: true, - BridgeIPv6: "fd00::1/64", - VMSubnetIPv6: "fd00::/64", - DNSServersIPv6: []string{"2606:4700:4700::1111", "2606:4700:4700::1001"}, - IPv6Mode: "dual-stack", - BridgeName: "br-vms", - EnableRateLimit: true, - RateLimitMbps: 1000, - }, - TLS: &TLSConfig{ - Mode: "spiffe", - CertFile: "", - KeyFile: "", - CAFile: "", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - EnableCertCaching: true, - CertCacheTTL: "5s", - }, - }, - wantErr: false, - }, - { - name: "opentelemetry enabled with custom config", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SERVICE_NAME": "test-service", - "UNKEY_METALD_OTEL_SERVICE_VERSION": "2.0.0", - "UNKEY_METALD_OTEL_SAMPLING_RATE": "0.5", - "UNKEY_METALD_OTEL_ENDPOINT": "otel-collector:4318", - "UNKEY_METALD_OTEL_PROMETHEUS_ENABLED": "false", - "UNKEY_METALD_OTEL_PROMETHEUS_PORT": "8888", - }, - want: &Config{ - Server: ServerConfig{ - Port: "8080", - Address: "0.0.0.0", - }, - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - Jailer: JailerConfig{ - UID: 1000, - GID: 1000, - ChrootBaseDir: "/srv/jailer", - }, - }, - Billing: BillingConfig{ - Enabled: true, - Endpoint: "http://localhost:8081", - MockMode: false, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: true, - ServiceName: 
"test-service", - ServiceVersion: "2.0.0", - TracingSamplingRate: 0.5, - OTLPEndpoint: "otel-collector:4318", - PrometheusEnabled: false, - PrometheusPort: "8888", - PrometheusInterface: "127.0.0.1", - HighCardinalityLabelsEnabled: false, - }, - Database: DatabaseConfig{ - DataDir: "/opt/metald/data", - }, - AssetManager: AssetManagerConfig{ - Enabled: true, - Endpoint: "http://localhost:8083", - CacheDir: "/opt/metald/assets", - }, - Network: NetworkConfig{ - Enabled: true, - EnableIPv4: true, - BridgeIPv4: "10.100.0.1/16", - VMSubnetIPv4: "10.100.0.0/16", - DNSServersIPv4: []string{"8.8.8.8", "8.8.4.4"}, - EnableIPv6: true, - BridgeIPv6: "fd00::1/64", - VMSubnetIPv6: "fd00::/64", - DNSServersIPv6: []string{"2606:4700:4700::1111", "2606:4700:4700::1001"}, - IPv6Mode: "dual-stack", - BridgeName: "br-vms", - EnableRateLimit: true, - RateLimitMbps: 1000, - }, - TLS: &TLSConfig{ - Mode: "spiffe", - CertFile: "", - KeyFile: "", - CAFile: "", - SPIFFESocketPath: "/var/lib/spire/agent/agent.sock", - EnableCertCaching: true, - CertCacheTTL: "5s", - }, - }, - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Clear environment before test - clearEnv() - - // Set test environment variables - for key, value := range tt.envVars { - os.Setenv(key, value) - } - defer clearEnv() // Clean up after test - - got, err := LoadConfig() - if (err != nil) != tt.wantErr { - t.Errorf("LoadConfig() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if tt.wantErr { - return // Don't check config if we expected an error - } - - if !compareConfigs(got, tt.want) { - t.Errorf("LoadConfig() got = %+v, want %+v", got, tt.want) - } - }) - } -} - -func TestOpenTelemetryConfigValidation(t *testing.T) { - tests := []struct { - name string - envVars map[string]string - wantErr bool - errMsg string - }{ - { - name: "valid sampling rate 0.0", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SAMPLING_RATE": 
"0.0", - }, - wantErr: false, - }, - { - name: "valid sampling rate 1.0", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SAMPLING_RATE": "1.0", - }, - wantErr: false, - }, - { - name: "valid sampling rate 0.5", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SAMPLING_RATE": "0.5", - }, - wantErr: false, - }, - { - name: "invalid sampling rate negative", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SAMPLING_RATE": "-0.5", - }, - wantErr: true, - errMsg: "tracing sampling rate must be between 0.0 and 1.0", - }, - { - name: "invalid sampling rate too high", - envVars: map[string]string{ - "UNKEY_METALD_OTEL_ENABLED": "true", - "UNKEY_METALD_OTEL_SAMPLING_RATE": "1.5", - }, - wantErr: true, - errMsg: "tracing sampling rate must be between 0.0 and 1.0", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Clear environment before test - clearEnv() - - // Set test environment variables - for key, value := range tt.envVars { - os.Setenv(key, value) - } - defer clearEnv() - - _, err := LoadConfig() - if (err != nil) != tt.wantErr { - t.Errorf("LoadConfig() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if tt.wantErr && err != nil && tt.errMsg != "" { - if !strings.Contains(err.Error(), tt.errMsg) { - t.Errorf("LoadConfig() error = %v, want error containing %v", err, tt.errMsg) - } - } - }) - } -} - -func TestConfigValidation(t *testing.T) { - tests := []struct { - name string - config *Config - wantErr bool - errMsg string - }{ - { - name: "valid firecracker backend", - config: &Config{ - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: false, - }, - }, - wantErr: false, - }, - { - name: "invalid backend type", - config: &Config{ - Backend: BackendConfig{ - Type: types.BackendTypeCloudHypervisor, - }, - OpenTelemetry: OpenTelemetryConfig{ 
- Enabled: false, - }, - }, - wantErr: true, - errMsg: "only firecracker backend is supported", - }, - { - name: "otel enabled with valid config", - config: &Config{ - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: true, - TracingSamplingRate: 0.5, - OTLPEndpoint: "localhost:4318", - ServiceName: "test-service", - }, - }, - wantErr: false, - }, - { - name: "otel enabled without service name", - config: &Config{ - Backend: BackendConfig{ - Type: types.BackendTypeFirecracker, - }, - OpenTelemetry: OpenTelemetryConfig{ - Enabled: true, - TracingSamplingRate: 0.5, - OTLPEndpoint: "localhost:4318", - ServiceName: "", - }, - }, - wantErr: true, - errMsg: "service name is required when OpenTelemetry is enabled", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := tt.config.Validate() - if (err != nil) != tt.wantErr { - t.Errorf("Config.Validate() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if tt.wantErr && err != nil && tt.errMsg != "" { - if !strings.Contains(err.Error(), tt.errMsg) { - t.Errorf("Config.Validate() error = %v, want error containing %v", err, tt.errMsg) - } - } - }) - } -} - -// Helper functions - -func clearEnv() { - // Clear all UNKEY_METALD_* environment variables - for _, env := range os.Environ() { - if strings.HasPrefix(env, "UNKEY_METALD_") { - key := strings.Split(env, "=")[0] - os.Unsetenv(key) - } - } -} - -func compareConfigs(a, b *Config) bool { - // Compare server config - if a.Server != b.Server { - return false - } - - // Compare backend config - if a.Backend.Type != b.Backend.Type { - return false - } - if a.Backend.Jailer != b.Backend.Jailer { - return false - } - - // Compare process manager config - - // Compare billing config - if a.Billing != b.Billing { - return false - } - - // Compare OpenTelemetry config - if a.OpenTelemetry != b.OpenTelemetry { - return false - } - - // Compare database config - if a.Database != 
b.Database { - return false - } - - // Compare AssetManager config - if a.AssetManager != b.AssetManager { - return false - } - - // Compare Network config - if a.Network.Enabled != b.Network.Enabled || - a.Network.EnableIPv4 != b.Network.EnableIPv4 || - a.Network.BridgeIPv4 != b.Network.BridgeIPv4 || - a.Network.VMSubnetIPv4 != b.Network.VMSubnetIPv4 || - !stringSlicesEqual(a.Network.DNSServersIPv4, b.Network.DNSServersIPv4) || - a.Network.EnableIPv6 != b.Network.EnableIPv6 || - a.Network.BridgeIPv6 != b.Network.BridgeIPv6 || - a.Network.VMSubnetIPv6 != b.Network.VMSubnetIPv6 || - !stringSlicesEqual(a.Network.DNSServersIPv6, b.Network.DNSServersIPv6) || - a.Network.IPv6Mode != b.Network.IPv6Mode || - a.Network.BridgeName != b.Network.BridgeName || - a.Network.EnableRateLimit != b.Network.EnableRateLimit || - a.Network.RateLimitMbps != b.Network.RateLimitMbps { - return false - } - - // Compare TLS config - if (a.TLS == nil) != (b.TLS == nil) { - return false - } - if a.TLS != nil && b.TLS != nil { - if a.TLS.Mode != b.TLS.Mode || - a.TLS.CertFile != b.TLS.CertFile || - a.TLS.KeyFile != b.TLS.KeyFile || - a.TLS.CAFile != b.TLS.CAFile || - a.TLS.SPIFFESocketPath != b.TLS.SPIFFESocketPath || - a.TLS.EnableCertCaching != b.TLS.EnableCertCaching || - a.TLS.CertCacheTTL != b.TLS.CertCacheTTL { - return false - } - } - - return true -} - -func stringSlicesEqual(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} diff --git a/go/deploy/metald/internal/database/database.go b/go/deploy/metald/internal/database/database.go deleted file mode 100644 index 3c7f727af6..0000000000 --- a/go/deploy/metald/internal/database/database.go +++ /dev/null @@ -1,144 +0,0 @@ -package database - -import ( - "context" - "database/sql" - _ "embed" - "fmt" - "log/slog" - "os" - "path/filepath" - - _ "github.com/mattn/go-sqlite3" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/trace" -) - 
-//go:embed schema.sql -var schema string - -// Database wraps the SQLite connection with VM-specific operations -type Database struct { - db *sql.DB - tracer trace.Tracer - logger *slog.Logger -} - -// New creates a new database connection and ensures schema is up to date -func New(dataDir string) (*Database, error) { - return NewWithLogger(dataDir, slog.Default()) -} - -// NewWithLogger creates a new database connection with a custom logger -func NewWithLogger(dataDir string, logger *slog.Logger) (*Database, error) { - // Ensure data directory exists with secure permissions - if err := os.MkdirAll(dataDir, 0700); err != nil { - return nil, fmt.Errorf("failed to create data directory: %w", err) - } - - // Open SQLite database - dbPath := filepath.Join(dataDir, "metald.db") - db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_synchronous=NORMAL&_cache_size=-64000&_foreign_keys=ON") - if err != nil { - return nil, fmt.Errorf("failed to open database: %w", err) - } - - // Configure connection pool for high-scale deployment - db.SetMaxOpenConns(25) // Limit concurrent connections - db.SetMaxIdleConns(5) // Maintain idle connections for reuse - db.SetConnMaxLifetime(0) // Keep connections alive (SQLite benefit) - - // Test connection - if err := db.Ping(); err != nil { - db.Close() - return nil, fmt.Errorf("failed to ping database: %w", err) - } - - database := &Database{ - db: db, - tracer: otel.Tracer("metald/database"), - logger: logger.With("component", "database"), - } - - // Apply schema - if err := database.migrate(); err != nil { - db.Close() - return nil, fmt.Errorf("failed to migrate database: %w", err) - } - - database.logger.Info("database initialized successfully", - slog.String("path", dbPath), - ) - - return database, nil -} - -// migrate applies the database schema -func (d *Database) migrate() error { - _, span := d.tracer.Start(context.Background(), "database.migrate") - defer span.End() - - d.logger.Debug("applying database schema") - - // 
Apply base schema - _, err := d.db.Exec(schema) - if err != nil { - span.RecordError(err) - d.logger.Error("failed to apply database schema", - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to apply schema: %w", err) - } - - // Apply additional migrations for port mappings - if err := d.migratePortMappings(); err != nil { - span.RecordError(err) - d.logger.Error("failed to migrate port mappings", - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to migrate port mappings: %w", err) - } - - d.logger.Debug("database schema applied successfully") - return nil -} - -// migratePortMappings adds port_mappings column if it doesn't exist -func (d *Database) migratePortMappings() error { - // Check if port_mappings column exists - var columnExists bool - err := d.db.QueryRow(` - SELECT COUNT(*) > 0 - FROM pragma_table_info('vms') - WHERE name = 'port_mappings' - `).Scan(&columnExists) - if err != nil { - return fmt.Errorf("failed to check for port_mappings column: %w", err) - } - - if !columnExists { - d.logger.Info("adding port_mappings column to vms table") - _, err := d.db.Exec("ALTER TABLE vms ADD COLUMN port_mappings TEXT DEFAULT '[]'") - if err != nil { - return fmt.Errorf("failed to add port_mappings column: %w", err) - } - d.logger.Info("port_mappings column added successfully") - } else { - d.logger.Debug("port_mappings column already exists") - } - - return nil -} - -// Close closes the database connection -func (d *Database) Close() error { - if d.db != nil { - return d.db.Close() - } - return nil -} - -// DB returns the underlying sql.DB for advanced operations -func (d *Database) DB() *sql.DB { - return d.db -} diff --git a/go/deploy/metald/internal/database/db.go b/go/deploy/metald/internal/database/db.go new file mode 100644 index 0000000000..ef3e100691 --- /dev/null +++ b/go/deploy/metald/internal/database/db.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.29.0 + +package database + +import ( + "context" + "database/sql" +) + +type DBTX interface { + ExecContext(context.Context, string, ...interface{}) (sql.Result, error) + PrepareContext(context.Context, string) (*sql.Stmt, error) + QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) + QueryRowContext(context.Context, string, ...interface{}) *sql.Row +} + +func New(db DBTX) *Queries { + return &Queries{db: db} +} + +type Queries struct { + db DBTX +} + +func (q *Queries) WithTx(tx *sql.Tx) *Queries { + return &Queries{ + db: tx, + } +} diff --git a/go/deploy/metald/internal/database/handler.go b/go/deploy/metald/internal/database/handler.go new file mode 100644 index 0000000000..700bc05d2e --- /dev/null +++ b/go/deploy/metald/internal/database/handler.go @@ -0,0 +1,45 @@ +package database + +import ( + "database/sql" + "fmt" + "log/slog" + "os" + "path/filepath" + + _ "github.com/mattn/go-sqlite3" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" +) + +type Database struct { + Queries Querier + tracer trace.Tracer + logger *slog.Logger +} + +func NewDatabase(dataDir string) (*Database, error) { + return NewDatabaseWithLogger(dataDir, slog.Default()) +} + +func NewDatabaseWithLogger(dataDir string, logger *slog.Logger) (*Database, error) { + if err := os.MkdirAll(dataDir, 0o700); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + dbPath := filepath.Join(dataDir, "metald.db") + db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_synchronous=NORMAL&_cache_size=-64000&_foreign_keys=ON") + if err != nil { + return nil, fmt.Errorf("failed to open database: %w", err) + } + + queries := New(db) + + database := &Database{ + Queries: queries, + tracer: otel.Tracer("metald/database"), + logger: logger, + } + + return database, nil +} diff --git a/go/deploy/metald/internal/database/models.go b/go/deploy/metald/internal/database/models.go new file mode 100644 index 
0000000000..a9e60da788 --- /dev/null +++ b/go/deploy/metald/internal/database/models.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.29.0 + +package database + +import ( + "database/sql" +) + +type IpAllocation struct { + ID int64 `db:"id" json:"id"` + VmID string `db:"vm_id" json:"vm_id"` + IpAddr string `db:"ip_addr" json:"ip_addr"` + NetworkAllocationID int64 `db:"network_allocation_id" json:"network_allocation_id"` + AllocatedAt sql.NullTime `db:"allocated_at" json:"allocated_at"` +} + +type Network struct { + ID int64 `db:"id" json:"id"` + BaseNetwork string `db:"base_network" json:"base_network"` + IsAllocated int64 `db:"is_allocated" json:"is_allocated"` +} + +type NetworkAllocation struct { + ID int64 `db:"id" json:"id"` + DeploymentID string `db:"deployment_id" json:"deployment_id"` + NetworkID int64 `db:"network_id" json:"network_id"` + AvailableIps string `db:"available_ips" json:"available_ips"` + AllocatedAt sql.NullTime `db:"allocated_at" json:"allocated_at"` +} diff --git a/go/deploy/metald/internal/database/querier.go b/go/deploy/metald/internal/database/querier.go new file mode 100644 index 0000000000..4726a5d9a6 --- /dev/null +++ b/go/deploy/metald/internal/database/querier.go @@ -0,0 +1,28 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.29.0 + +package database + +import ( + "context" +) + +type Querier interface { + AllocateIP(ctx context.Context, arg AllocateIPParams) (IpAllocation, error) + // queries.sql + AllocateNetwork(ctx context.Context) (Network, error) + CreateNetworkAllocation(ctx context.Context, arg CreateNetworkAllocationParams) (NetworkAllocation, error) + DeleteIPAllocationsForNetwork(ctx context.Context, networkAllocationID int64) error + DeleteNetworkAllocation(ctx context.Context, deploymentID string) error + GetAvailableIPCount(ctx context.Context, deploymentID string) (interface{}, error) + GetIPAllocation(ctx context.Context, vmID string) (IpAllocation, error) + GetNetworkAllocation(ctx context.Context, deploymentID string) (GetNetworkAllocationRow, error) + GetNetworkStats(ctx context.Context) (GetNetworkStatsRow, error) + PopAvailableIPJSON(ctx context.Context, deploymentID string) (PopAvailableIPJSONRow, error) + ReleaseIP(ctx context.Context, vmID string) error + ReleaseNetwork(ctx context.Context, id int64) error + ReturnIPJSON(ctx context.Context, arg ReturnIPJSONParams) error +} + +var _ Querier = (*Queries)(nil) diff --git a/go/deploy/metald/internal/database/queries.sql.go b/go/deploy/metald/internal/database/queries.sql.go new file mode 100644 index 0000000000..974543b2d8 --- /dev/null +++ b/go/deploy/metald/internal/database/queries.sql.go @@ -0,0 +1,245 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.29.0 +// source: queries.sql + +package database + +import ( + "context" + "database/sql" +) + +const allocateIP = `-- name: AllocateIP :one +INSERT INTO ip_allocations (vm_id, ip_addr, network_allocation_id) +VALUES (?, ?, ?) 
+RETURNING id, vm_id, ip_addr, network_allocation_id, allocated_at +` + +type AllocateIPParams struct { + VmID string `db:"vm_id" json:"vm_id"` + IpAddr string `db:"ip_addr" json:"ip_addr"` + NetworkAllocationID int64 `db:"network_allocation_id" json:"network_allocation_id"` +} + +func (q *Queries) AllocateIP(ctx context.Context, arg AllocateIPParams) (IpAllocation, error) { + row := q.db.QueryRowContext(ctx, allocateIP, arg.VmID, arg.IpAddr, arg.NetworkAllocationID) + var i IpAllocation + err := row.Scan( + &i.ID, + &i.VmID, + &i.IpAddr, + &i.NetworkAllocationID, + &i.AllocatedAt, + ) + return i, err +} + +const allocateNetwork = `-- name: AllocateNetwork :one + +UPDATE networks +SET is_allocated = 1 +WHERE id = ( + SELECT id FROM networks + WHERE is_allocated = 0 + ORDER BY id + LIMIT 1 +) +RETURNING id, base_network, is_allocated +` + +// queries.sql +func (q *Queries) AllocateNetwork(ctx context.Context) (Network, error) { + row := q.db.QueryRowContext(ctx, allocateNetwork) + var i Network + err := row.Scan(&i.ID, &i.BaseNetwork, &i.IsAllocated) + return i, err +} + +const createNetworkAllocation = `-- name: CreateNetworkAllocation :one +INSERT INTO network_allocations (deployment_id, network_id, available_ips) +VALUES (?, ?, ?) 
+RETURNING id, deployment_id, network_id, available_ips, allocated_at +` + +type CreateNetworkAllocationParams struct { + DeploymentID string `db:"deployment_id" json:"deployment_id"` + NetworkID int64 `db:"network_id" json:"network_id"` + AvailableIps string `db:"available_ips" json:"available_ips"` +} + +func (q *Queries) CreateNetworkAllocation(ctx context.Context, arg CreateNetworkAllocationParams) (NetworkAllocation, error) { + row := q.db.QueryRowContext(ctx, createNetworkAllocation, arg.DeploymentID, arg.NetworkID, arg.AvailableIps) + var i NetworkAllocation + err := row.Scan( + &i.ID, + &i.DeploymentID, + &i.NetworkID, + &i.AvailableIps, + &i.AllocatedAt, + ) + return i, err +} + +const deleteIPAllocationsForNetwork = `-- name: DeleteIPAllocationsForNetwork :exec +DELETE FROM ip_allocations +WHERE network_allocation_id = ? +` + +func (q *Queries) DeleteIPAllocationsForNetwork(ctx context.Context, networkAllocationID int64) error { + _, err := q.db.ExecContext(ctx, deleteIPAllocationsForNetwork, networkAllocationID) + return err +} + +const deleteNetworkAllocation = `-- name: DeleteNetworkAllocation :exec +DELETE FROM network_allocations +WHERE deployment_id = ? +` + +func (q *Queries) DeleteNetworkAllocation(ctx context.Context, deploymentID string) error { + _, err := q.db.ExecContext(ctx, deleteNetworkAllocation, deploymentID) + return err +} + +const getAvailableIPCount = `-- name: GetAvailableIPCount :one +SELECT json_array_length(available_ips) as count +FROM network_allocations +WHERE deployment_id = ? +` + +func (q *Queries) GetAvailableIPCount(ctx context.Context, deploymentID string) (interface{}, error) { + row := q.db.QueryRowContext(ctx, getAvailableIPCount, deploymentID) + var count interface{} + err := row.Scan(&count) + return count, err +} + +const getIPAllocation = `-- name: GetIPAllocation :one +SELECT id, vm_id, ip_addr, network_allocation_id, allocated_at FROM ip_allocations WHERE vm_id = ? 
+` + +func (q *Queries) GetIPAllocation(ctx context.Context, vmID string) (IpAllocation, error) { + row := q.db.QueryRowContext(ctx, getIPAllocation, vmID) + var i IpAllocation + err := row.Scan( + &i.ID, + &i.VmID, + &i.IpAddr, + &i.NetworkAllocationID, + &i.AllocatedAt, + ) + return i, err +} + +const getNetworkAllocation = `-- name: GetNetworkAllocation :one +SELECT na.id, na.deployment_id, na.network_id, na.available_ips, na.allocated_at, n.base_network +FROM network_allocations na +JOIN networks n ON na.network_id = n.id +WHERE na.deployment_id = ? +` + +type GetNetworkAllocationRow struct { + ID int64 `db:"id" json:"id"` + DeploymentID string `db:"deployment_id" json:"deployment_id"` + NetworkID int64 `db:"network_id" json:"network_id"` + AvailableIps string `db:"available_ips" json:"available_ips"` + AllocatedAt sql.NullTime `db:"allocated_at" json:"allocated_at"` + BaseNetwork string `db:"base_network" json:"base_network"` +} + +func (q *Queries) GetNetworkAllocation(ctx context.Context, deploymentID string) (GetNetworkAllocationRow, error) { + row := q.db.QueryRowContext(ctx, getNetworkAllocation, deploymentID) + var i GetNetworkAllocationRow + err := row.Scan( + &i.ID, + &i.DeploymentID, + &i.NetworkID, + &i.AvailableIps, + &i.AllocatedAt, + &i.BaseNetwork, + ) + return i, err +} + +const getNetworkStats = `-- name: GetNetworkStats :one +SELECT + (SELECT COUNT(*) FROM networks) as total_networks, + (SELECT COUNT(*) FROM networks WHERE is_allocated = 0) as available_networks, + (SELECT COUNT(*) FROM network_allocations) as active_deployments, + (SELECT COUNT(*) FROM ip_allocations) as allocated_ips +` + +type GetNetworkStatsRow struct { + TotalNetworks int64 `db:"total_networks" json:"total_networks"` + AvailableNetworks int64 `db:"available_networks" json:"available_networks"` + ActiveDeployments int64 `db:"active_deployments" json:"active_deployments"` + AllocatedIps int64 `db:"allocated_ips" json:"allocated_ips"` +} + +func (q *Queries) 
GetNetworkStats(ctx context.Context) (GetNetworkStatsRow, error) { + row := q.db.QueryRowContext(ctx, getNetworkStats) + var i GetNetworkStatsRow + err := row.Scan( + &i.TotalNetworks, + &i.AvailableNetworks, + &i.ActiveDeployments, + &i.AllocatedIps, + ) + return i, err +} + +const popAvailableIPJSON = `-- name: PopAvailableIPJSON :one +UPDATE network_allocations +SET available_ips = json_remove(available_ips, '$[0]') +WHERE deployment_id = ? +AND json_array_length(available_ips) > 0 +RETURNING json_extract(available_ips, '$[0]') as allocated_ip, id +` + +type PopAvailableIPJSONRow struct { + JsonExtract interface{} `db:"json_extract" json:"json_extract"` + ID int64 `db:"id" json:"id"` +} + +func (q *Queries) PopAvailableIPJSON(ctx context.Context, deploymentID string) (PopAvailableIPJSONRow, error) { + row := q.db.QueryRowContext(ctx, popAvailableIPJSON, deploymentID) + var i PopAvailableIPJSONRow + err := row.Scan(&i.JsonExtract, &i.ID) + return i, err +} + +const releaseIP = `-- name: ReleaseIP :exec +DELETE FROM ip_allocations WHERE vm_id = ? +` + +func (q *Queries) ReleaseIP(ctx context.Context, vmID string) error { + _, err := q.db.ExecContext(ctx, releaseIP, vmID) + return err +} + +const releaseNetwork = `-- name: ReleaseNetwork :exec +UPDATE networks +SET is_allocated = 0 +WHERE id = ? +` + +func (q *Queries) ReleaseNetwork(ctx context.Context, id int64) error { + _, err := q.db.ExecContext(ctx, releaseNetwork, id) + return err +} + +const returnIPJSON = `-- name: ReturnIPJSON :exec +UPDATE network_allocations +SET available_ips = json_insert(available_ips, '$[#]', ?) +WHERE deployment_id = ? 
+` + +type ReturnIPJSONParams struct { + JsonInsert interface{} `db:"json_insert" json:"json_insert"` + DeploymentID string `db:"deployment_id" json:"deployment_id"` +} + +func (q *Queries) ReturnIPJSON(ctx context.Context, arg ReturnIPJSONParams) error { + _, err := q.db.ExecContext(ctx, returnIPJSON, arg.JsonInsert, arg.DeploymentID) + return err +} diff --git a/go/deploy/metald/internal/database/repository.go b/go/deploy/metald/internal/database/repository.go deleted file mode 100644 index 598b6bd666..0000000000 --- a/go/deploy/metald/internal/database/repository.go +++ /dev/null @@ -1,630 +0,0 @@ -package database - -import ( - "context" - "database/sql" - "fmt" - "log/slog" - "time" - - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" - "google.golang.org/protobuf/proto" -) - -// VMRepository handles VM state persistence operations -type VMRepository struct { - db *Database - logger *slog.Logger -} - -// NewVMRepository creates a new VM repository -func NewVMRepository(db *Database) *VMRepository { - return &VMRepository{ - db: db, - logger: db.logger.With("component", "vm_repository"), - } -} - -// VM represents the database model for a VM -type VM struct { - ID string - CustomerID string - Config []byte // serialized protobuf - State metaldv1.VmState - ProcessID *string - PortMappings string // JSON serialized port mappings - CreatedAt time.Time - UpdatedAt time.Time - DeletedAt *time.Time - - // Parsed configuration (populated by ListVMsByCustomerWithContext) - ParsedConfig *metaldv1.VmConfig -} - -// CreateVM inserts a new VM record -func (r *VMRepository) CreateVM(vmID, customerID string, config *metaldv1.VmConfig, state metaldv1.VmState) error { - return r.CreateVMWithContext(context.Background(), vmID, customerID, config, state) -} - -// CreateVMWithContext inserts a new VM record with context for tracing -func (r *VMRepository) CreateVMWithContext(ctx 
context.Context, vmID, customerID string, config *metaldv1.VmConfig, state metaldv1.VmState) error { - _, span := r.db.tracer.Start(ctx, "vm_repository.create_vm", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("vm.customer_id", customerID), - attribute.String("vm.state", state.String()), - ), - ) - defer span.End() - - r.logger.DebugContext(ctx, "creating VM record", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("state", state.String()), - ) - configBytes, err := proto.Marshal(config) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to marshal VM config", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to marshal VM config: %w", err) - } - - query := ` - INSERT INTO vms (id, customer_id, config, state, port_mappings, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) - ` - - _, err = r.db.db.Exec(query, vmID, customerID, configBytes, int32(state), "[]") - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to insert VM record", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to create VM: %w", err) - } - - r.logger.InfoContext(ctx, "VM record created successfully", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("state", state.String()), - ) - - return nil -} - -// GetVM retrieves a VM by ID -func (r *VMRepository) GetVM(vmID string) (*VM, error) { - return r.GetVMWithContext(context.Background(), vmID) -} - -// GetVMWithContext retrieves a VM by ID with context for tracing -func (r *VMRepository) GetVMWithContext(ctx context.Context, vmID string) (*VM, error) { - _, span := r.db.tracer.Start(ctx, "vm_repository.get_vm", - trace.WithAttributes( - attribute.String("vm.id", vmID), - ), - ) - defer span.End() - - 
r.logger.DebugContext(ctx, "retrieving VM record", - slog.String("vm_id", vmID), - ) - query := ` - SELECT id, customer_id, config, state, process_id, port_mappings, created_at, updated_at, deleted_at - FROM vms - WHERE id = ? AND deleted_at IS NULL - ` - - var vm VM - var processID sql.NullString - var portMappings sql.NullString - var deletedAt sql.NullTime - - err := r.db.db.QueryRow(query, vmID).Scan( - &vm.ID, - &vm.CustomerID, - &vm.Config, - &vm.State, - &processID, - &portMappings, - &vm.CreatedAt, - &vm.UpdatedAt, - &deletedAt, - ) - - if err != nil { - if err == sql.ErrNoRows { - r.logger.DebugContext(ctx, "VM not found", - slog.String("vm_id", vmID), - ) - return nil, fmt.Errorf("VM not found: %s", vmID) - } - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to query VM record", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return nil, fmt.Errorf("failed to get VM: %w", err) - } - - if processID.Valid { - vm.ProcessID = &processID.String - } - if portMappings.Valid { - vm.PortMappings = portMappings.String - } else { - vm.PortMappings = "[]" // Default empty array - } - if deletedAt.Valid { - vm.DeletedAt = &deletedAt.Time - } - - r.logger.DebugContext(ctx, "VM record retrieved successfully", - slog.String("vm_id", vmID), - slog.String("customer_id", vm.CustomerID), - slog.String("state", vm.State.String()), - ) - - span.SetAttributes( - attribute.String("vm.customer_id", vm.CustomerID), - attribute.String("vm.state", vm.State.String()), - ) - - return &vm, nil -} - -// UpdateVMState updates the VM state and optionally the process ID -func (r *VMRepository) UpdateVMState(vmID string, state metaldv1.VmState, processID *string) error { - return r.UpdateVMStateWithContext(context.Background(), vmID, state, processID) -} - -// UpdateVMStateWithContext updates the VM state and optionally the process ID with context for tracing -func (r *VMRepository) UpdateVMStateWithContext(ctx context.Context, vmID string, state 
metaldv1.VmState, processID *string) error { - _, span := r.db.tracer.Start(ctx, "vm_repository.update_vm_state", - trace.WithAttributes( - attribute.String("vm.id", vmID), - attribute.String("vm.state", state.String()), - ), - ) - defer span.End() - - r.logger.DebugContext(ctx, "updating VM state", - slog.String("vm_id", vmID), - slog.String("state", state.String()), - slog.Any("process_id", processID), - ) - query := ` - UPDATE vms - SET state = ?, process_id = ?, updated_at = CURRENT_TIMESTAMP - WHERE id = ? AND deleted_at IS NULL - ` - - result, err := r.db.db.Exec(query, int32(state), processID, vmID) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to update VM state", - slog.String("vm_id", vmID), - slog.String("state", state.String()), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to update VM state: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to get rows affected", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if rowsAffected == 0 { - r.logger.WarnContext(ctx, "VM not found or already deleted during state update", - slog.String("vm_id", vmID), - slog.String("state", state.String()), - ) - return fmt.Errorf("VM not found or already deleted: %s", vmID) - } - - r.logger.InfoContext(ctx, "VM state updated successfully", - slog.String("vm_id", vmID), - slog.String("state", state.String()), - slog.Int64("rows_affected", rowsAffected), - ) - - span.SetAttributes(attribute.Int64("db.rows_affected", rowsAffected)) - - return nil -} - -// ListVMs retrieves VMs with optional filters -func (r *VMRepository) ListVMs(customerID *string, states []metaldv1.VmState, limit, offset int) ([]*VM, error) { - baseQuery := ` - SELECT id, customer_id, config, state, process_id, port_mappings, created_at, updated_at, deleted_at - FROM vms - 
WHERE deleted_at IS NULL - ` - args := []interface{}{} - - // Add customer filter - if customerID != nil { - baseQuery += " AND customer_id = ?" - args = append(args, *customerID) - } - - // Add state filters - if len(states) > 0 { - baseQuery += " AND state IN (" - for i, state := range states { - if i > 0 { - baseQuery += ", " - } - baseQuery += "?" - args = append(args, int32(state)) - } - baseQuery += ")" - } - - // Add ordering and pagination - baseQuery += " ORDER BY created_at DESC" - if limit > 0 { - baseQuery += " LIMIT ?" - args = append(args, limit) - } - if offset > 0 { - baseQuery += " OFFSET ?" - args = append(args, offset) - } - - rows, err := r.db.db.Query(baseQuery, args...) - if err != nil { - return nil, fmt.Errorf("failed to list VMs: %w", err) - } - defer rows.Close() - - var vms []*VM - for rows.Next() { - var vm VM - var processID sql.NullString - var portMappings sql.NullString - var deletedAt sql.NullTime - - err := rows.Scan( - &vm.ID, - &vm.CustomerID, - &vm.Config, - &vm.State, - &processID, - &portMappings, - &vm.CreatedAt, - &vm.UpdatedAt, - &deletedAt, - ) - if err != nil { - return nil, fmt.Errorf("failed to scan VM row: %w", err) - } - - if processID.Valid { - vm.ProcessID = &processID.String - } - if portMappings.Valid { - vm.PortMappings = portMappings.String - } else { - vm.PortMappings = "[]" // Default empty array - } - if deletedAt.Valid { - vm.DeletedAt = &deletedAt.Time - } - - vms = append(vms, &vm) - } - - if err := rows.Err(); err != nil { - return nil, fmt.Errorf("error iterating VM rows: %w", err) - } - - return vms, nil -} - -// ListVMsByCustomerWithContext lists all VMs for a specific customer with context for tracing -func (r *VMRepository) ListVMsByCustomerWithContext(ctx context.Context, customerID string) ([]*VM, error) { - _, span := r.db.tracer.Start(ctx, "vm_repository.list_vms_by_customer", - trace.WithAttributes( - attribute.String("customer.id", customerID), - ), - ) - defer span.End() - - 
r.logger.DebugContext(ctx, "listing VMs for customer", - slog.String("customer_id", customerID), - ) - - // Use existing ListVMs method with customer filter - vms, err := r.ListVMs(&customerID, nil, 0, 0) - if err != nil { - span.RecordError(err) - return nil, err - } - - // Deserialize configs for service layer - for _, vm := range vms { - if len(vm.Config) > 0 { - var config metaldv1.VmConfig - if err := proto.Unmarshal(vm.Config, &config); err != nil { - r.logger.ErrorContext(ctx, "failed to unmarshal VM config", - slog.String("vm_id", vm.ID), - slog.String("error", err.Error()), - ) - continue - } - vm.ParsedConfig = &config - } - } - - r.logger.DebugContext(ctx, "listed VMs for customer", - slog.String("customer_id", customerID), - slog.Int("count", len(vms)), - ) - - return vms, nil -} - -// DeleteVM soft deletes a VM by setting deleted_at -func (r *VMRepository) DeleteVM(vmID string) error { - return r.DeleteVMWithContext(context.Background(), vmID) -} - -// DeleteVMWithContext soft deletes a VM by setting deleted_at with context for tracing -func (r *VMRepository) DeleteVMWithContext(ctx context.Context, vmID string) error { - _, span := r.db.tracer.Start(ctx, "vm_repository.delete_vm", - trace.WithAttributes( - attribute.String("vm.id", vmID), - ), - ) - defer span.End() - - r.logger.DebugContext(ctx, "deleting VM record", - slog.String("vm_id", vmID), - ) - query := ` - UPDATE vms - SET deleted_at = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP - WHERE id = ? 
AND deleted_at IS NULL - ` - - result, err := r.db.db.Exec(query, vmID) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to delete VM record", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to delete VM: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to get rows affected", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if rowsAffected == 0 { - r.logger.WarnContext(ctx, "VM not found or already deleted during deletion", - slog.String("vm_id", vmID), - ) - return fmt.Errorf("VM not found or already deleted: %s", vmID) - } - - r.logger.InfoContext(ctx, "VM record deleted successfully", - slog.String("vm_id", vmID), - slog.Int64("rows_affected", rowsAffected), - ) - - span.SetAttributes(attribute.Int64("db.rows_affected", rowsAffected)) - - return nil -} - -// GetVMConfig unmarshals and returns the VM configuration -func (vm *VM) GetVMConfig() (*metaldv1.VmConfig, error) { - var config metaldv1.VmConfig - if err := proto.Unmarshal(vm.Config, &config); err != nil { - return nil, fmt.Errorf("failed to unmarshal VM config: %w", err) - } - return &config, nil -} - -// CountVMs returns the total count of VMs with optional filters -func (r *VMRepository) CountVMs(customerID *string, states []metaldv1.VmState) (int64, error) { - baseQuery := "SELECT COUNT(*) FROM vms WHERE deleted_at IS NULL" - args := []interface{}{} - - // Add customer filter - if customerID != nil { - baseQuery += " AND customer_id = ?" - args = append(args, *customerID) - } - - // Add state filters - if len(states) > 0 { - baseQuery += " AND state IN (" - for i, state := range states { - if i > 0 { - baseQuery += ", " - } - baseQuery += "?" 
- args = append(args, int32(state)) - } - baseQuery += ")" - } - - var count int64 - err := r.db.db.QueryRow(baseQuery, args...).Scan(&count) - if err != nil { - return 0, fmt.Errorf("failed to count VMs: %w", err) - } - - return count, nil -} - -// UpdateVMPortMappings updates the port mappings for a VM -func (r *VMRepository) UpdateVMPortMappings(vmID string, portMappingsJSON string) error { - return r.UpdateVMPortMappingsWithContext(context.Background(), vmID, portMappingsJSON) -} - -// UpdateVMPortMappingsWithContext updates the port mappings for a VM with context for tracing -func (r *VMRepository) UpdateVMPortMappingsWithContext(ctx context.Context, vmID string, portMappingsJSON string) error { - _, span := r.db.tracer.Start(ctx, "vm_repository.update_vm_port_mappings", - trace.WithAttributes( - attribute.String("vm.id", vmID), - ), - ) - defer span.End() - - r.logger.DebugContext(ctx, "updating VM port mappings", - slog.String("vm_id", vmID), - slog.String("port_mappings", portMappingsJSON), - ) - - query := ` - UPDATE vms - SET port_mappings = ?, updated_at = CURRENT_TIMESTAMP - WHERE id = ? 
AND deleted_at IS NULL - ` - - result, err := r.db.db.Exec(query, portMappingsJSON, vmID) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to update VM port mappings", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to update VM port mappings: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to get rows affected", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if rowsAffected == 0 { - r.logger.WarnContext(ctx, "VM not found or already deleted during port mappings update", - slog.String("vm_id", vmID), - ) - return fmt.Errorf("VM not found or already deleted: %s", vmID) - } - - r.logger.InfoContext(ctx, "VM port mappings updated successfully", - slog.String("vm_id", vmID), - slog.Int64("rows_affected", rowsAffected), - ) - - span.SetAttributes(attribute.Int64("db.rows_affected", rowsAffected)) - - return nil -} - -// ListAllVMsWithContext retrieves all VMs from the database with context for tracing -func (r *VMRepository) ListAllVMsWithContext(ctx context.Context) ([]*VM, error) { - _, span := r.db.tracer.Start(ctx, "vm_repository.list_all_vms") - defer span.End() - - r.logger.DebugContext(ctx, "listing all VMs from database") - - query := ` - SELECT id, customer_id, config, state, process_id, port_mappings, created_at, updated_at, deleted_at - FROM vms - WHERE deleted_at IS NULL - ORDER BY created_at DESC - ` - - rows, err := r.db.db.QueryContext(ctx, query) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to query all VMs", - slog.String("error", err.Error()), - ) - return nil, fmt.Errorf("failed to list all VMs: %w", err) - } - defer rows.Close() - - var vms []*VM - for rows.Next() { - var vm VM - var processID sql.NullString - var portMappings sql.NullString - var deletedAt 
sql.NullTime - - err := rows.Scan( - &vm.ID, - &vm.CustomerID, - &vm.Config, - &vm.State, - &processID, - &portMappings, - &vm.CreatedAt, - &vm.UpdatedAt, - &deletedAt, - ) - if err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "failed to scan VM row", - slog.String("error", err.Error()), - ) - return nil, fmt.Errorf("failed to scan VM row: %w", err) - } - - if processID.Valid { - vm.ProcessID = &processID.String - } - if portMappings.Valid { - vm.PortMappings = portMappings.String - } else { - vm.PortMappings = "[]" // Default empty array - } - if deletedAt.Valid { - vm.DeletedAt = &deletedAt.Time - } - - vms = append(vms, &vm) - } - - if err := rows.Err(); err != nil { - span.RecordError(err) - r.logger.ErrorContext(ctx, "error iterating VM rows", - slog.String("error", err.Error()), - ) - return nil, fmt.Errorf("error iterating VM rows: %w", err) - } - - r.logger.InfoContext(ctx, "successfully listed all VMs from database", - slog.Int("count", len(vms)), - ) - - span.SetAttributes(attribute.Int("vm.count", len(vms))) - - return vms, nil -} - -// UpdateVMStateWithContextInt updates VM state with an integer state parameter (used by reconciler) -func (r *VMRepository) UpdateVMStateWithContextInt(ctx context.Context, vmID string, state int) error { - return r.UpdateVMStateWithContext(ctx, vmID, metaldv1.VmState(state), nil) -} diff --git a/go/deploy/metald/internal/database/schema.sql b/go/deploy/metald/internal/database/schema.sql deleted file mode 100644 index 6be23161bd..0000000000 --- a/go/deploy/metald/internal/database/schema.sql +++ /dev/null @@ -1,23 +0,0 @@ --- VM state storage schema -CREATE TABLE IF NOT EXISTS vms ( - id TEXT PRIMARY KEY, - customer_id TEXT NOT NULL, - config BLOB NOT NULL, - state INTEGER NOT NULL DEFAULT 0, - process_id TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - deleted_at TIMESTAMP -); - --- Index for efficient customer queries -CREATE INDEX IF NOT EXISTS 
idx_vms_customer_id ON vms(customer_id); - --- Index for state queries -CREATE INDEX IF NOT EXISTS idx_vms_state ON vms(state); - --- Index for process queries -CREATE INDEX IF NOT EXISTS idx_vms_process_id ON vms(process_id); - --- Composite index for customer + state queries -CREATE INDEX IF NOT EXISTS idx_vms_customer_state ON vms(customer_id, state); \ No newline at end of file diff --git a/go/deploy/metald/internal/health/handler_test.go b/go/deploy/metald/internal/health/handler_test.go new file mode 100644 index 0000000000..bf94afcf00 --- /dev/null +++ b/go/deploy/metald/internal/health/handler_test.go @@ -0,0 +1,347 @@ +package health + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "log/slog" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + + "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" +) + +// mockBackend is a mock implementation of types.Backend for testing +type mockBackend struct { + mock.Mock +} + +func (m *mockBackend) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { + args := m.Called(ctx, config) + return args.String(0), args.Error(1) +} + +func (m *mockBackend) DeleteVM(ctx context.Context, vmID string) error { + args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) BootVM(ctx context.Context, vmID string) error { + args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) ShutdownVM(ctx context.Context, vmID string) error { + args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeoutSeconds int32) error { + args := m.Called(ctx, vmID, force, timeoutSeconds) + return args.Error(0) +} + +func (m *mockBackend) PauseVM(ctx context.Context, vmID string) error { + 
args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) ResumeVM(ctx context.Context, vmID string) error { + args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) RebootVM(ctx context.Context, vmID string) error { + args := m.Called(ctx, vmID) + return args.Error(0) +} + +func (m *mockBackend) GetVMInfo(ctx context.Context, vmID string) (*types.VMInfo, error) { + args := m.Called(ctx, vmID) + if args.Get(0) == nil { + return nil, args.Error(1) + } + return args.Get(0).(*types.VMInfo), args.Error(1) +} + +func (m *mockBackend) GetVMMetrics(ctx context.Context, vmID string) (*types.VMMetrics, error) { + args := m.Called(ctx, vmID) + if args.Get(0) == nil { + return nil, args.Error(1) + } + return args.Get(0).(*types.VMMetrics), args.Error(1) +} + +func (m *mockBackend) Ping(ctx context.Context) error { + args := m.Called(ctx) + return args.Error(0) +} + +func TestNewHandler(t *testing.T) { + backend := &mockBackend{} + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + startTime := time.Now() + + handler := NewHandler(backend, logger, startTime) + + assert.NotNil(t, handler) + assert.Equal(t, backend, handler.backend) + assert.Equal(t, startTime, handler.startTime) + assert.NotNil(t, handler.logger) +} + +func TestHandler_ServeHTTP_Healthy(t *testing.T) { + backend := &mockBackend{} + backend.On("Ping", mock.Anything).Return(nil) + + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + handler := NewHandler(backend, logger, time.Now()) + + req := httptest.NewRequest("GET", "/health", nil) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + assert.Equal(t, "application/json", w.Header().Get("Content-Type")) + assert.Equal(t, "no-cache, no-store, must-revalidate", w.Header().Get("Cache-Control")) + + var response HealthResponse + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + assert.Equal(t, StatusHealthy, 
response.Status) + assert.Equal(t, "dev", response.Version) + assert.Equal(t, "firecracker", response.Backend.Type) + assert.Equal(t, StatusHealthy, response.Backend.Status) + assert.Empty(t, response.Backend.Error) + assert.NotNil(t, response.System) + assert.Contains(t, response.Checks, "backend_ping") + assert.Contains(t, response.Checks, "system_info") + + backend.AssertExpectations(t) +} + +func TestHandler_ServeHTTP_Unhealthy(t *testing.T) { + backend := &mockBackend{} + backendErr := errors.New("backend unavailable") + backend.On("Ping", mock.Anything).Return(backendErr) + + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + handler := NewHandler(backend, logger, time.Now()) + + req := httptest.NewRequest("GET", "/health", nil) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + assert.Equal(t, "application/json", w.Header().Get("Content-Type")) + + var response HealthResponse + err := json.Unmarshal(w.Body.Bytes(), &response) + require.NoError(t, err) + + // The status will be "degraded" because backend failure creates a check that fails + // which overrides the backend unhealthy status with degraded + assert.Equal(t, StatusDegraded, response.Status) + assert.Equal(t, "firecracker", response.Backend.Type) + assert.Equal(t, StatusUnhealthy, response.Backend.Status) + assert.Equal(t, "backend unavailable", response.Backend.Error) + + backend.AssertExpectations(t) +} + +func TestHandler_performHealthChecks(t *testing.T) { + tests := []struct { + name string + backendPingErr error + expectedStatus string + expectedChecks int + }{ + { + name: "all healthy", + backendPingErr: nil, + expectedStatus: StatusHealthy, + expectedChecks: 2, // backend_ping + system_info + }, + { + name: "backend unhealthy", + backendPingErr: errors.New("ping failed"), + expectedStatus: StatusDegraded, // Backend failure creates unhealthy check which sets status to degraded + expectedChecks: 2, + }, + } + + 
for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend := &mockBackend{} + backend.On("Ping", mock.Anything).Return(tt.backendPingErr) + + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + handler := NewHandler(backend, logger, time.Now()) + + ctx := context.Background() + response := handler.performHealthChecks(ctx) + + assert.Equal(t, tt.expectedStatus, response.Status) + assert.Equal(t, "dev", response.Version) + assert.Equal(t, "firecracker", response.Backend.Type) + assert.Len(t, response.Checks, tt.expectedChecks) + assert.NotNil(t, response.System) + assert.Contains(t, response.Checks, "backend_ping") + assert.Contains(t, response.Checks, "system_info") + + backend.AssertExpectations(t) + }) + } +} + +func TestHandler_checkBackendHealth(t *testing.T) { + tests := []struct { + name string + backendPingErr error + expectedStatus string + expectError bool + }{ + { + name: "backend healthy", + backendPingErr: nil, + expectedStatus: StatusHealthy, + expectError: false, + }, + { + name: "backend unhealthy", + backendPingErr: errors.New("connection failed"), + expectedStatus: StatusUnhealthy, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend := &mockBackend{} + backend.On("Ping", mock.Anything).Return(tt.backendPingErr) + + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + handler := NewHandler(backend, logger, time.Now()) + + ctx := context.Background() + checks := make(map[string]Check) + + backendHealth := handler.checkBackendHealth(ctx, checks) + + assert.Equal(t, "firecracker", backendHealth.Type) + assert.Equal(t, tt.expectedStatus, backendHealth.Status) + + if tt.expectError { + assert.NotEmpty(t, backendHealth.Error) + assert.Equal(t, tt.backendPingErr.Error(), backendHealth.Error) + } else { + assert.Empty(t, backendHealth.Error) + } + + // Verify check was added + require.Contains(t, checks, "backend_ping") + pingCheck := checks["backend_ping"] + 
assert.Equal(t, tt.expectedStatus, pingCheck.Status) + + if tt.expectError { + assert.NotEmpty(t, pingCheck.Error) + } else { + assert.Empty(t, pingCheck.Error) + } + + backend.AssertExpectations(t) + }) + } +} + +func TestHandler_checkBackendHealth_Timeout(t *testing.T) { + backend := &mockBackend{} + + // Mock returns context deadline exceeded error + timeoutErr := context.DeadlineExceeded + backend.On("Ping", mock.Anything).Return(timeoutErr) + + logger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + handler := NewHandler(backend, logger, time.Now()) + + ctx := context.Background() + checks := make(map[string]Check) + + backendHealth := handler.checkBackendHealth(ctx, checks) + + assert.Equal(t, "firecracker", backendHealth.Type) + assert.Equal(t, StatusUnhealthy, backendHealth.Status) + assert.Contains(t, backendHealth.Error, "context deadline exceeded") + + require.Contains(t, checks, "backend_ping") + pingCheck := checks["backend_ping"] + assert.Equal(t, StatusUnhealthy, pingCheck.Status) + assert.Contains(t, pingCheck.Error, "context deadline exceeded") + + backend.AssertExpectations(t) +} + +func TestHealthResponse_JSON(t *testing.T) { + response := &HealthResponse{ + Status: StatusHealthy, + Timestamp: time.Now(), + Version: "1.0.0", + Backend: BackendHealth{ + Type: "firecracker", + Status: StatusHealthy, + }, + System: &SystemInfo{ + Hostname: "test-host", + CPU: CPU{ + Architecture: "amd64", + Cores: 4, + Model: "Intel Core i5", + }, + Memory: Memory{ + Total: 8589934592, + Used: 4294967296, + Available: 4294967296, + UsedPct: 50.0, + }, + Uptime: "1h30m", + }, + Checks: map[string]Check{ + "backend_ping": { + Status: StatusHealthy, + Duration: 100 * time.Millisecond, + Timestamp: time.Now(), + }, + }, + } + + jsonData, err := json.Marshal(response) + require.NoError(t, err) + + var unmarshaled HealthResponse + err = json.Unmarshal(jsonData, &unmarshaled) + require.NoError(t, err) + + assert.Equal(t, response.Status, unmarshaled.Status) + 
assert.Equal(t, response.Version, unmarshaled.Version) + assert.Equal(t, response.Backend.Type, unmarshaled.Backend.Type) + assert.Equal(t, response.Backend.Status, unmarshaled.Backend.Status) + assert.Equal(t, response.System.Hostname, unmarshaled.System.Hostname) + assert.Contains(t, unmarshaled.Checks, "backend_ping") +} + +func TestStatusConstants(t *testing.T) { + assert.Equal(t, "healthy", StatusHealthy) + assert.Equal(t, "unhealthy", StatusUnhealthy) + assert.Equal(t, "degraded", StatusDegraded) +} diff --git a/go/deploy/metald/internal/health/system_test.go b/go/deploy/metald/internal/health/system_test.go new file mode 100644 index 0000000000..04c5f7b193 --- /dev/null +++ b/go/deploy/metald/internal/health/system_test.go @@ -0,0 +1,227 @@ +package health + +import ( + "context" + "os" + "runtime" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGetSystemInfo(t *testing.T) { + startTime := time.Now().Add(-1 * time.Hour) + ctx := context.Background() + + systemInfo, err := GetSystemInfo(ctx, startTime) + require.NoError(t, err) + require.NotNil(t, systemInfo) + + // Test hostname (should not be empty unless there's an error) + assert.NotEmpty(t, systemInfo.Hostname) + + // Test CPU info + assert.Equal(t, runtime.GOARCH, systemInfo.CPU.Architecture) + assert.Equal(t, runtime.NumCPU(), systemInfo.CPU.Cores) + assert.GreaterOrEqual(t, systemInfo.CPU.Cores, 1) + + // Test memory info + assert.Greater(t, systemInfo.Memory.Total, uint64(0)) + assert.GreaterOrEqual(t, systemInfo.Memory.Used, uint64(0)) + assert.GreaterOrEqual(t, systemInfo.Memory.Available, uint64(0)) + assert.GreaterOrEqual(t, systemInfo.Memory.UsedPct, 0.0) + assert.LessOrEqual(t, systemInfo.Memory.UsedPct, 100.0) + + // Test uptime (should be positive duration string) + assert.NotEmpty(t, systemInfo.Uptime) + assert.True(t, strings.Contains(systemInfo.Uptime, "h") || strings.Contains(systemInfo.Uptime, "m") || 
strings.Contains(systemInfo.Uptime, "s")) +} + +func TestGetSystemInfo_Uptime(t *testing.T) { + // Test with start time 2 hours ago + startTime := time.Now().Add(-2 * time.Hour) + ctx := context.Background() + + systemInfo, err := GetSystemInfo(ctx, startTime) + require.NoError(t, err) + + // Uptime should reflect approximately 2 hours + assert.Contains(t, systemInfo.Uptime, "h") +} + +func TestCPU_Structure(t *testing.T) { + cpu := CPU{ + Architecture: "amd64", + Cores: 8, + Model: "Intel Core i7", + } + + assert.Equal(t, "amd64", cpu.Architecture) + assert.Equal(t, 8, cpu.Cores) + assert.Equal(t, "Intel Core i7", cpu.Model) +} + +func TestMemory_Structure(t *testing.T) { + memory := Memory{ + Total: 8589934592, // 8GB + Used: 2147483648, // 2GB + Available: 6442450944, // 6GB + UsedPct: 25.0, + } + + assert.Equal(t, uint64(8589934592), memory.Total) + assert.Equal(t, uint64(2147483648), memory.Used) + assert.Equal(t, uint64(6442450944), memory.Available) + assert.Equal(t, 25.0, memory.UsedPct) +} + +func TestSystemInfo_Structure(t *testing.T) { + systemInfo := SystemInfo{ + Hostname: "test-host", + CPU: CPU{ + Architecture: "amd64", + Cores: 4, + Model: "Test CPU", + }, + Memory: Memory{ + Total: 4294967296, + Used: 1073741824, + Available: 3221225472, + UsedPct: 25.0, + }, + Uptime: "2h30m15s", + } + + assert.Equal(t, "test-host", systemInfo.Hostname) + assert.Equal(t, "amd64", systemInfo.CPU.Architecture) + assert.Equal(t, 4, systemInfo.CPU.Cores) + assert.Equal(t, "Test CPU", systemInfo.CPU.Model) + assert.Equal(t, uint64(4294967296), systemInfo.Memory.Total) + assert.Equal(t, "2h30m15s", systemInfo.Uptime) +} + +func TestGetCPUModel(t *testing.T) { + // This test is environment-dependent and might not work on all systems + // We'll test the function but accept empty results on non-Linux systems + model := getCPUModel() + + // On Linux systems with /proc/cpuinfo, we might get a model + // On other systems or containers, this might be empty + // Both are valid 
outcomes + if runtime.GOOS == "linux" { + // Model might be available on Linux, but not guaranteed in containers + t.Logf("CPU Model detected: %q", model) + } else { + // On non-Linux systems, this will likely be empty + assert.Equal(t, "", model) + } +} + +func TestGetMemoryInfo(t *testing.T) { + memory := getMemoryInfo() + + // Basic validation - memory values should be reasonable + assert.Greater(t, memory.Total, uint64(0)) + assert.GreaterOrEqual(t, memory.Used, uint64(0)) + assert.GreaterOrEqual(t, memory.Available, uint64(0)) + assert.GreaterOrEqual(t, memory.UsedPct, 0.0) + assert.LessOrEqual(t, memory.UsedPct, 100.0) + + // Total should be greater than or equal to used + assert.GreaterOrEqual(t, memory.Total, memory.Used) +} + +func TestGetSystemMemory(t *testing.T) { + memory := getSystemMemory() + + // This test is Linux-specific since it reads /proc/meminfo + if runtime.GOOS == "linux" { + if _, err := os.Stat("/proc/meminfo"); err == nil { + // /proc/meminfo exists, we should get some data + assert.Greater(t, memory.Total, uint64(0)) + assert.GreaterOrEqual(t, memory.Used, uint64(0)) + assert.GreaterOrEqual(t, memory.Available, uint64(0)) + assert.GreaterOrEqual(t, memory.UsedPct, 0.0) + assert.LessOrEqual(t, memory.UsedPct, 100.0) + } + } + // On non-Linux systems or when /proc/meminfo is not available, + // the function returns zero values, which is expected behavior +} + +func TestMemoryCalculations(t *testing.T) { + // Test memory percentage calculation logic + tests := []struct { + name string + total uint64 + available uint64 + wantUsed uint64 + wantPct float64 + }{ + { + name: "50% usage", + total: 1000, + available: 500, + wantUsed: 500, + wantPct: 50.0, + }, + { + name: "25% usage", + total: 2000, + available: 1500, + wantUsed: 500, + wantPct: 25.0, + }, + { + name: "0% usage", + total: 1000, + available: 1000, + wantUsed: 0, + wantPct: 0.0, + }, + { + name: "100% usage", + total: 1000, + available: 0, + wantUsed: 1000, + wantPct: 100.0, + }, + 
} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + used := tt.total - tt.available + usedPct := float64(used) / float64(tt.total) * 100 + + assert.Equal(t, tt.wantUsed, used) + assert.Equal(t, tt.wantPct, usedPct) + }) + } +} + +func TestGetSystemInfo_ErrorHandling(t *testing.T) { + // Test with a very recent start time (basically now) + startTime := time.Now() + ctx := context.Background() + + systemInfo, err := GetSystemInfo(ctx, startTime) + require.NoError(t, err) + require.NotNil(t, systemInfo) + + // Even if hostname fails, the function should not return an error + // It should use "unknown" as fallback + assert.NotEmpty(t, systemInfo.Hostname) + + // CPU info should still be populated from runtime + assert.Equal(t, runtime.GOARCH, systemInfo.CPU.Architecture) + assert.Equal(t, runtime.NumCPU(), systemInfo.CPU.Cores) + + // Memory should be populated (either from system or runtime) + assert.Greater(t, systemInfo.Memory.Total, uint64(0)) + + // Uptime should be very small but formatted as string + assert.NotEmpty(t, systemInfo.Uptime) +} diff --git a/go/deploy/metald/internal/jailer/README.md b/go/deploy/metald/internal/jailer/README.md deleted file mode 100644 index e8b14c1e0d..0000000000 --- a/go/deploy/metald/internal/jailer/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# Integrated Jailer - -## What is this? - -This package implements jailer functionality directly within metald, replacing the need for the external Firecracker jailer binary. - -## Why not use the external jailer? - -The external jailer had a critical issue with our networking setup: -1. It would create the TAP device OUTSIDE the network namespace -2. When Firecracker tried to access it INSIDE the namespace, it would fail with "device not found" -3. This made it impossible to use the external jailer with our network architecture - -## What does the integrated jailer do? 
- -The integrated jailer provides the same security isolation as the external jailer: -- Creates a chroot jail for each VM -- Drops privileges after setup -- Manages network namespaces -- Creates TAP devices in the correct namespace -- Execs into Firecracker with minimal privileges - -## How is it different? - -The key difference is the order of operations: -1. Fork child process -2. Enter network namespace FIRST -3. Create TAP device (now inside the namespace) -4. Set up chroot -5. Drop privileges -6. Exec Firecracker - -This ensures the TAP device is created where Firecracker expects to find it. - -## Security Implications - -The integrated jailer maintains the same security guarantees: -- Each VM runs in a separate chroot -- Firecracker runs as an unprivileged user -- No privilege escalation is possible -- Network isolation is maintained - -## Required Capabilities - -Metald needs these capabilities (not full root): -- CAP_SYS_ADMIN - For namespace operations -- CAP_NET_ADMIN - For TAP device creation -- CAP_SYS_CHROOT - For chroot operation -- CAP_SETUID/CAP_SETGID - For dropping privileges -- CAP_MKNOD - For device node creation -- CAP_DAC_OVERRIDE - For file access during setup \ No newline at end of file diff --git a/go/deploy/metald/internal/jailer/jailer.go b/go/deploy/metald/internal/jailer/jailer.go index a9f5958639..ff75f6b0e9 100644 --- a/go/deploy/metald/internal/jailer/jailer.go +++ b/go/deploy/metald/internal/jailer/jailer.go @@ -17,10 +17,6 @@ import ( "golang.org/x/sys/unix" ) -// AIDEV-NOTE: This package implements jailer functionality directly in metald -// This allows us to have better control over the network namespace and tap device -// creation, solving the permission issues we encountered with the external jailer - // Jailer provides functionality similar to firecracker's jailer but integrated into metald type Jailer struct { logger *slog.Logger @@ -109,7 +105,6 @@ func (j *Jailer) Exec(ctx context.Context, opts *ExecOptions) error { } // 
Step 5: Prepare firecracker command - // AIDEV-NOTE: Firecracker binary path is now hardcoded to standard location firecrackerPath := "/usr/local/bin/firecracker" args := []string{firecrackerPath} args = append(args, "--api-sock", opts.SocketPath) @@ -207,7 +202,7 @@ func (j *Jailer) setupChroot(ctx context.Context, chrootPath string) error { // Create necessary directories for _, dir := range []string{"", "dev", "dev/net", "run"} { path := filepath.Join(chrootPath, dir) - if err := os.MkdirAll(path, 0755); err != nil { + if err := os.MkdirAll(path, 0o755); err != nil { return fmt.Errorf("failed to create directory %s: %w", path, err) } } @@ -218,7 +213,7 @@ func (j *Jailer) setupChroot(ctx context.Context, chrootPath string) error { if err != nil { return fmt.Errorf("failed to convert tun device number: %w", err) } - if mkErr := unix.Mknod(tunPath, unix.S_IFCHR|0666, tunDev); mkErr != nil { + if mkErr := unix.Mknod(tunPath, unix.S_IFCHR|0o666, tunDev); mkErr != nil { if !os.IsExist(mkErr) { return fmt.Errorf("failed to create /dev/net/tun: %w", mkErr) } @@ -230,7 +225,7 @@ func (j *Jailer) setupChroot(ctx context.Context, chrootPath string) error { if err != nil { return fmt.Errorf("failed to convert kvm device number: %w", err) } - if err := unix.Mknod(kvmPath, unix.S_IFCHR|0666, kvmDev); err != nil { + if err := unix.Mknod(kvmPath, unix.S_IFCHR|0o666, kvmDev); err != nil { if !os.IsExist(err) { return fmt.Errorf("failed to create /dev/kvm: %w", err) } @@ -238,7 +233,7 @@ func (j *Jailer) setupChroot(ctx context.Context, chrootPath string) error { // Create metrics FIFO for billaged to read Firecracker stats metricsPath := filepath.Join(chrootPath, "metrics.fifo") - if err := unix.Mkfifo(metricsPath, 0644); err != nil && !os.IsExist(err) { + if err := unix.Mkfifo(metricsPath, 0o644); err != nil && !os.IsExist(err) { span.RecordError(err) return fmt.Errorf("failed to create metrics FIFO: %w", err) } @@ -343,16 +338,9 @@ func validateFirecrackerPath(path string) 
error { } // Check if file is executable - if info.Mode()&0111 == 0 { + if info.Mode()&0o111 == 0 { return fmt.Errorf("firecracker binary is not executable: %s", cleanPath) } return nil } - -// AIDEV-NOTE: This implementation provides the core jailer functionality -// but integrated into metald. The key advantages are: -// 1. We can create tap devices before dropping privileges -// 2. We have full control over the network namespace setup -// 3. We can pass open file descriptors to the jailed process -// 4. We maintain the security isolation of the original jailer diff --git a/go/deploy/metald/internal/jailer/jailer_test.go b/go/deploy/metald/internal/jailer/jailer_test.go index 2079ea8b21..ea74a7fbb3 100644 --- a/go/deploy/metald/internal/jailer/jailer_test.go +++ b/go/deploy/metald/internal/jailer/jailer_test.go @@ -26,7 +26,7 @@ func TestNewJailer(t *testing.T) { } func TestSetupChroot(t *testing.T) { - // This test requires root or CAP_MKNOD to create device nodes + // This test requires root privileges to create device nodes if os.Getuid() != 0 { t.Skip("Test requires root privileges") } @@ -82,7 +82,7 @@ func TestExecOptions(t *testing.T) { } // TestJoinNetworkNamespace tests network namespace joining -// This test requires CAP_SYS_ADMIN to create network namespaces +// This test requires root privileges to create network namespaces func TestJoinNetworkNamespace(t *testing.T) { if os.Getuid() != 0 { t.Skip("Test requires root privileges") diff --git a/go/deploy/metald/internal/network/allocator.go b/go/deploy/metald/internal/network/allocator.go deleted file mode 100644 index 9fbec519bb..0000000000 --- a/go/deploy/metald/internal/network/allocator.go +++ /dev/null @@ -1,180 +0,0 @@ -package network - -import ( - "fmt" - "net" - "sync" -) - -// IPAllocator manages IP address allocation for VMs -type IPAllocator struct { - subnet *net.IPNet - allocated map[string]bool // IP string -> allocated - vmToIP map[string]net.IP // VM ID -> IP - ipToVM map[string]string 
// IP string -> VM ID - mu sync.Mutex - - // Configuration - startOffset int // Start allocating from subnet + startOffset - endOffset int // Stop allocating at subnet + endOffset -} - -// NewIPAllocator creates a new IP allocator for the given subnet -func NewIPAllocator(subnet *net.IPNet) *IPAllocator { - //exhaustruct:ignore - return &IPAllocator{ - subnet: subnet, - allocated: make(map[string]bool), - vmToIP: make(map[string]net.IP), - ipToVM: make(map[string]string), - startOffset: 2, // Start from .2 (reserve .1 for gateway) - endOffset: 254, // Stop at .254 (reserve .255 for broadcast) - } -} - -// AllocateIP allocates a new IP address -func (a *IPAllocator) AllocateIP() (net.IP, error) { - a.mu.Lock() - defer a.mu.Unlock() - - // For simplicity, we'll work with /24 subnets - // In production, this should handle various subnet sizes - ones, bits := a.subnet.Mask.Size() - if ones > 24 || bits != 32 { - return nil, fmt.Errorf("only /24 or smaller IPv4 subnets supported, got /%d", ones) - } - - baseIP := a.subnet.IP.To4() - if baseIP == nil { - return nil, fmt.Errorf("invalid IPv4 subnet") - } - - // Try to find an available IP - for i := a.startOffset; i <= a.endOffset; i++ { - // Create IP address - ip := make(net.IP, 4) - copy(ip, baseIP) - ip[3] = byte(i) - - // Check if already allocated - if !a.allocated[ip.String()] { - a.allocated[ip.String()] = true - return ip, nil - } - } - - return nil, fmt.Errorf("no available IPs in subnet %s", a.subnet.String()) -} - -// AllocateSpecificIP allocates a specific IP address if available -func (a *IPAllocator) AllocateSpecificIP(ip net.IP) error { - a.mu.Lock() - defer a.mu.Unlock() - - // Check if IP is in our subnet - if !a.subnet.Contains(ip) { - return fmt.Errorf("IP %s not in subnet %s", ip.String(), a.subnet.String()) - } - - // Check if already allocated - if a.allocated[ip.String()] { - return fmt.Errorf("IP %s already allocated", ip.String()) - } - - // Check if it's a reserved IP (.0, .1, .255 for /24) - 
lastOctet := ip.To4()[3] - if lastOctet == 0 || lastOctet == 1 || lastOctet == 255 { - return fmt.Errorf("IP %s is reserved", ip.String()) - } - - a.allocated[ip.String()] = true - return nil -} - -// ReleaseIP releases an allocated IP address -func (a *IPAllocator) ReleaseIP(ip net.IP) { - a.mu.Lock() - defer a.mu.Unlock() - - delete(a.allocated, ip.String()) - - // Clean up VM mappings if they exist - if vmID, exists := a.ipToVM[ip.String()]; exists { - delete(a.vmToIP, vmID) - delete(a.ipToVM, ip.String()) - } -} - -// AssignIPToVM records the IP-to-VM mapping -func (a *IPAllocator) AssignIPToVM(vmID string, ip net.IP) { - a.mu.Lock() - defer a.mu.Unlock() - - a.vmToIP[vmID] = ip - a.ipToVM[ip.String()] = vmID -} - -// GetVMIP returns the IP assigned to a VM -func (a *IPAllocator) GetVMIP(vmID string) (net.IP, bool) { - a.mu.Lock() - defer a.mu.Unlock() - - ip, exists := a.vmToIP[vmID] - return ip, exists -} - -// GetIPVM returns the VM ID assigned to an IP -func (a *IPAllocator) GetIPVM(ip net.IP) (string, bool) { - a.mu.Lock() - defer a.mu.Unlock() - - vmID, exists := a.ipToVM[ip.String()] - return vmID, exists -} - -// IsAllocated checks if an IP is allocated -func (a *IPAllocator) IsAllocated(ip net.IP) bool { - a.mu.Lock() - defer a.mu.Unlock() - - return a.allocated[ip.String()] -} - -// GetAllocatedCount returns the number of allocated IPs -func (a *IPAllocator) GetAllocatedCount() int { - a.mu.Lock() - defer a.mu.Unlock() - - return len(a.allocated) -} - -// GetAvailableCount returns the number of available IPs -func (a *IPAllocator) GetAvailableCount() int { - total := a.endOffset - a.startOffset + 1 - return total - a.GetAllocatedCount() -} - -// GetAllAllocated returns all allocated IPs -func (a *IPAllocator) GetAllAllocated() []net.IP { - a.mu.Lock() - defer a.mu.Unlock() - - ips := make([]net.IP, 0, len(a.allocated)) - for ipStr := range a.allocated { - if ip := net.ParseIP(ipStr); ip != nil { - ips = append(ips, ip) - } - } - - return ips -} - -// 
Reset clears all allocations -func (a *IPAllocator) Reset() { - a.mu.Lock() - defer a.mu.Unlock() - - a.allocated = make(map[string]bool) - a.vmToIP = make(map[string]net.IP) - a.ipToVM = make(map[string]string) -} diff --git a/go/deploy/metald/internal/network/bridge.go b/go/deploy/metald/internal/network/bridge.go new file mode 100644 index 0000000000..87b6e76453 --- /dev/null +++ b/go/deploy/metald/internal/network/bridge.go @@ -0,0 +1,63 @@ +package network + +import ( + "fmt" + "log/slog" + + "github.com/vishvananda/netlink" +) + +// ensureBridge creates the bridge if it doesn't exist +func (m *Manager) ensureBridge() error { + if link, err := netlink.LinkByName(m.config.BridgeName); err == nil { + m.logger.Debug("bridge exists", + slog.String("bridge", m.config.BridgeName), + slog.String("type", link.Type()), + slog.String("state", link.Attrs().OperState.String()), + ) + return nil + } + + bridge := &netlink.Bridge{ //nolint:exhaustruct + LinkAttrs: netlink.LinkAttrs{ + Name: m.config.BridgeName, //nolint:exhaustruct + }, + } + + if err := netlink.LinkAdd(bridge); err != nil { + m.logger.Error("failed to create bridge", + slog.String("bridge", m.config.BridgeName), + slog.String("error", err.Error()), + ) + return fmt.Errorf("failed to create bridge: %w", err) + } + + br, err := netlink.LinkByName(m.config.BridgeName) + if err != nil { + return fmt.Errorf("failed to get bridge: %w", err) + } + + addr, err := netlink.ParseAddr(m.config.BaseNetwork.String()) + if err != nil { + return fmt.Errorf("failed to parse bridge IP: %w", err) + } + + if err := netlink.AddrAdd(br, addr); err != nil { + m.logger.Error("failed to add IP to bridge", + slog.String("bridge", m.config.BridgeName), + slog.String("error", err.Error()), + ) + return fmt.Errorf("failed to add IP to bridge: %w", err) + } + + // Bring bridge up + if err := netlink.LinkSetUp(br); err != nil { + m.logger.Error("failed to bring bridge up", + slog.String("bridge", m.config.BridgeName), + 
slog.String("error", err.Error()), + ) + return fmt.Errorf("failed to bring bridge up: %w", err) + } + + return nil +} diff --git a/go/deploy/metald/internal/network/idgen.go b/go/deploy/metald/internal/network/idgen.go index 63294312cb..065d2a645f 100644 --- a/go/deploy/metald/internal/network/idgen.go +++ b/go/deploy/metald/internal/network/idgen.go @@ -8,7 +8,7 @@ import ( ) // IDGenerator generates short, unique IDs for network devices -// AIDEV-NOTE: Network interface names in Linux are limited to 15 characters, +// Network interface names in Linux are limited to 15 characters, // so we generate 8-character IDs to leave room for prefixes like "tap-", "vh-", etc. type IDGenerator struct { mu sync.Mutex @@ -30,7 +30,7 @@ func (g *IDGenerator) GenerateNetworkID() (string, error) { defer g.mu.Unlock() // Try up to 10 times to generate a unique ID - for i := 0; i < 10; i++ { + for range 10 { // Generate 4 random bytes (8 hex characters) bytes := make([]byte, 4) if _, err := rand.Read(bytes); err != nil { @@ -62,8 +62,7 @@ type NetworkDeviceNames struct { ID string // 8-character internal ID Namespace string // Network namespace name (no length limit) TAP string // TAP device name (15 char limit) - VethHost string // Host-side veth name (15 char limit) - VethNS string // Namespace-side veth name (15 char limit) + Bridge string // Bridge name (15 char limit) } // GenerateDeviceNames creates a consistent set of network device names @@ -72,7 +71,6 @@ func GenerateDeviceNames(networkID string) *NetworkDeviceNames { ID: networkID, Namespace: fmt.Sprintf("ns_vm_%s", networkID), TAP: fmt.Sprintf("tap_%s", networkID), // 12 chars - VethHost: fmt.Sprintf("vh_%s", networkID), // 10 chars - VethNS: fmt.Sprintf("vn_%s", networkID), // 10 chars + Bridge: fmt.Sprintf("br-%s", networkID), } } diff --git a/go/deploy/metald/internal/network/idgen_test.go b/go/deploy/metald/internal/network/idgen_test.go deleted file mode 100644 index 05fd2e2300..0000000000 --- 
a/go/deploy/metald/internal/network/idgen_test.go +++ /dev/null @@ -1,79 +0,0 @@ -package network - -import ( - "testing" -) - -func TestIDGenerator(t *testing.T) { - gen := NewIDGenerator() - - // Test generating IDs - ids := make(map[string]bool) - for i := 0; i < 100; i++ { - id, err := gen.GenerateNetworkID() - if err != nil { - t.Fatalf("Failed to generate ID: %v", err) - } - - // Check length - if len(id) != 8 { - t.Errorf("Expected ID length 8, got %d", len(id)) - } - - // Check uniqueness - if ids[id] { - t.Errorf("Duplicate ID generated: %s", id) - } - ids[id] = true - } - - // Test release and reuse - firstID, _ := gen.GenerateNetworkID() - gen.ReleaseID(firstID) - - // The same ID could be generated again after release - // (though not guaranteed due to randomness) -} - -func TestGenerateDeviceNames(t *testing.T) { - networkID := "a1b2c3d4" - names := GenerateDeviceNames(networkID) - - tests := []struct { - name string - got string - maxLen int - }{ - {"TAP device", names.TAP, 15}, - {"Veth host", names.VethHost, 15}, - {"Veth NS", names.VethNS, 15}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if len(tt.got) > tt.maxLen { - t.Errorf("%s name too long: %s (%d chars, max %d)", - tt.name, tt.got, len(tt.got), tt.maxLen) - } - - // Check it contains the network ID - if len(tt.got) < len(networkID) { - t.Errorf("%s name doesn't contain full network ID: %s", tt.name, tt.got) - } - }) - } - - // Verify expected formats - if names.TAP != "tap_a1b2c3d4" { - t.Errorf("Expected TAP name 'tap_a1b2c3d4', got %s", names.TAP) - } - if names.VethHost != "vh_a1b2c3d4" { - t.Errorf("Expected VethHost name 'vh_a1b2c3d4', got %s", names.VethHost) - } - if names.VethNS != "vn_a1b2c3d4" { - t.Errorf("Expected VethNS name 'vn_a1b2c3d4', got %s", names.VethNS) - } - if names.Namespace != "ns_vm_a1b2c3d4" { - t.Errorf("Expected Namespace name 'ns_vm_a1b2c3d4', got %s", names.Namespace) - } -} diff --git 
a/go/deploy/metald/internal/network/implementation.go b/go/deploy/metald/internal/network/implementation.go deleted file mode 100644 index 22510556a5..0000000000 --- a/go/deploy/metald/internal/network/implementation.go +++ /dev/null @@ -1,1668 +0,0 @@ -package network - -import ( - "context" - "fmt" - "hash/fnv" - "log/slog" - "net" - "os" - "os/exec" - "path/filepath" - "strings" - "sync" - "time" - - "github.com/unkeyed/unkey/go/deploy/metald/internal/config" - "github.com/vishvananda/netlink" - "github.com/vishvananda/netns" -) - -// Config holds network configuration -type Config struct { - BridgeName string // Default: "br-vms" - BridgeIP string // Default: "172.31.0.1/19" - VMSubnet string // Default: "172.31.0.0/19" - EnableIPv6 bool - DNSServers []string // Default: ["8.8.8.8", "8.8.4.4"] - EnableRateLimit bool - RateLimitMbps int // Per VM rate limit in Mbps - - // Port allocation configuration - PortRangeMin int // Default: 32768 - PortRangeMax int // Default: 65535 -} - -// DefaultConfig returns default network configuration -func DefaultConfig() *Config { - return &Config{ //nolint:exhaustruct // EnableIPv6 field uses zero value (false) which is appropriate for default config - BridgeName: "br-vms", - BridgeIP: "172.31.0.1/19", - VMSubnet: "172.31.0.0/19", - DNSServers: []string{"8.8.8.8", "8.8.4.4"}, - EnableRateLimit: true, - RateLimitMbps: 100, // 100 Mbps default - PortRangeMin: 32768, // Ephemeral port range start - PortRangeMax: 65535, // Ephemeral port range end - } -} - -// Manager handles VM networking -type Manager struct { - logger *slog.Logger - config *Config - allocator *IPAllocator - portAllocator *PortAllocator - idGen *IDGenerator - mu sync.RWMutex - vmNetworks map[string]*VMNetwork - - // Runtime state - hostProtection *HostProtection - metrics *NetworkMetrics - bridgeCreated bool - iptablesRules []string -} - -// NewManager creates a new network manager -func NewManager(logger *slog.Logger, netConfig *Config, mainConfig 
*config.NetworkConfig) (*Manager, error) { - if netConfig == nil { - netConfig = DefaultConfig() - } - - logger = logger.With("component", "network-manager") - logger.Info("creating network manager", - slog.String("bridge_name", netConfig.BridgeName), - slog.String("bridge_ip", netConfig.BridgeIP), - slog.String("vm_subnet", netConfig.VMSubnet), - slog.Bool("host_protection", mainConfig.EnableHostProtection), - ) - - _, subnet, err := net.ParseCIDR(netConfig.VMSubnet) - if err != nil { - return nil, fmt.Errorf("invalid subnet: %w", err) - } - - // Initialize network metrics - networkMetrics, err := NewNetworkMetrics(logger) - if err != nil { - return nil, fmt.Errorf("failed to create network metrics: %w", err) - } - - m := &Manager{ //nolint:exhaustruct // mu, bridgeCreated, and iptablesRules fields use appropriate zero values - logger: logger, - config: netConfig, - allocator: NewIPAllocator(subnet), - portAllocator: NewPortAllocator(netConfig.PortRangeMin, netConfig.PortRangeMax), - idGen: NewIDGenerator(), - hostProtection: NewHostProtection(logger, mainConfig), - metrics: networkMetrics, - vmNetworks: make(map[string]*VMNetwork), - } - - // Set bridge max VMs based on configuration - m.metrics.SetBridgeMaxVMs(netConfig.BridgeName, int64(mainConfig.MaxVMsPerBridge)) - - // Log current network state before initialization - m.logNetworkState("before initialization") - - // Initialize host networking - if err := m.initializeHost(); err != nil { - m.logger.Error("failed to initialize host networking", - slog.String("error", err.Error()), - ) - m.logNetworkState("after failed initialization") - return nil, fmt.Errorf("failed to initialize host networking: %w", err) - } - - // Start host protection system - ctx := context.Background() // Use background context for initialization - if err := m.hostProtection.Start(ctx); err != nil { - m.logger.Warn("failed to start host protection", - slog.String("error", err.Error()), - ) - // Don't fail completely - host protection 
is optional - } - - // Log network state after initialization - m.logNetworkState("after successful initialization") - - return m, nil -} - -// initializeHost sets up the host networking infrastructure -func (m *Manager) initializeHost() error { - m.logger.Info("starting host network initialization") - - // Enable IP forwarding using sysctl (now running as root) - m.logger.Info("enabling IP forwarding") - cmd := exec.Command("sysctl", "-w", "net.ipv4.ip_forward=1") - if output, err := cmd.CombinedOutput(); err != nil { - m.logger.Error("failed to enable IP forwarding", - slog.String("error", err.Error()), - slog.String("output", string(output)), - ) - return fmt.Errorf("failed to enable IP forwarding: %w", err) - } - - // Make it persistent across reboots - // AIDEV-NOTE: Creates sysctl config to persist IP forwarding - sysctlConfig := []byte("# Enable IP forwarding for metald VM networking\nnet.ipv4.ip_forward = 1\n") - sysctlPath := "/etc/sysctl.d/99-metald.conf" - - if err := os.WriteFile(sysctlPath, sysctlConfig, 0600); err != nil { - m.logger.Warn("failed to create persistent sysctl config", - slog.String("path", sysctlPath), - slog.String("error", err.Error()), - ) - // Not fatal - IP forwarding is enabled for this session - } - - m.logger.Info("IP forwarding enabled successfully") - - // Create bridge if it doesn't exist - if err := m.ensureBridge(); err != nil { - return fmt.Errorf("failed to create bridge: %w", err) - } - - // Setup NAT rules (best effort - may fail without root or if already configured) - m.logNetworkState("before NAT setup") - if err := m.setupNAT(); err != nil { - m.logger.Warn("failed to setup NAT (may already be configured)", - slog.String("error", err.Error()), - ) - m.logNetworkState("after failed NAT setup") - // Continue anyway - NAT might already be set up - } else { - m.logNetworkState("after successful NAT setup") - } - - m.logger.Info("host networking initialized", - slog.String("bridge", m.config.BridgeName), - 
slog.String("subnet", m.config.VMSubnet), - ) - - return nil -} - -// ensureBridge creates the bridge if it doesn't exist -func (m *Manager) ensureBridge() error { - m.logger.Info("checking if bridge exists", - slog.String("bridge", m.config.BridgeName), - ) - - // Check if bridge exists - if link, err := netlink.LinkByName(m.config.BridgeName); err == nil { - m.bridgeCreated = true - m.logger.Info("bridge already exists", - slog.String("bridge", m.config.BridgeName), - slog.String("type", link.Type()), - slog.String("state", link.Attrs().OperState.String()), - ) - return nil // Bridge already exists - } else { - m.logger.Info("bridge does not exist, will create", - slog.String("bridge", m.config.BridgeName), - slog.String("error", err.Error()), - ) - } - - // Create bridge - m.logger.Info("creating new bridge", - slog.String("bridge", m.config.BridgeName), - ) - - bridge := &netlink.Bridge{ //nolint:exhaustruct // Only setting Name field, other bridge fields use appropriate defaults - LinkAttrs: netlink.LinkAttrs{ //nolint:exhaustruct // Only setting Name field, other link attributes use appropriate defaults - Name: m.config.BridgeName, - }, - } - - m.logger.Info("CRITICAL: About to create bridge - network may be affected", - slog.String("bridge", m.config.BridgeName), - ) - - if err := netlink.LinkAdd(bridge); err != nil { - m.logger.Error("failed to create bridge", - slog.String("bridge", m.config.BridgeName), - slog.String("error", err.Error()), - ) - m.logNetworkState("after failed bridge creation") - return fmt.Errorf("failed to create bridge: %w", err) - } - m.logger.Info("bridge created successfully - checking network state", - slog.String("bridge", m.config.BridgeName), - ) - m.logNetworkState("immediately after bridge creation") - - // Get the created bridge - br, err := netlink.LinkByName(m.config.BridgeName) - if err != nil { - return fmt.Errorf("failed to get bridge: %w", err) - } - - // Add IP address to bridge - m.logger.Info("parsing bridge IP 
address", - slog.String("ip", m.config.BridgeIP), - ) - addr, err := netlink.ParseAddr(m.config.BridgeIP) - if err != nil { - return fmt.Errorf("failed to parse bridge IP: %w", err) - } - - m.logger.Info("adding IP address to bridge", - slog.String("bridge", m.config.BridgeName), - slog.String("ip", m.config.BridgeIP), - ) - if err := netlink.AddrAdd(br, addr); err != nil { - m.logger.Error("failed to add IP to bridge", - slog.String("bridge", m.config.BridgeName), - slog.String("ip", m.config.BridgeIP), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to add IP to bridge: %w", err) - } - m.logger.Info("IP address added to bridge successfully") - - // Bring bridge up - m.logger.Info("bringing bridge up", - slog.String("bridge", m.config.BridgeName), - ) - if err := netlink.LinkSetUp(br); err != nil { - m.logger.Error("failed to bring bridge up", - slog.String("bridge", m.config.BridgeName), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to bring bridge up: %w", err) - } - m.logger.Info("bridge is now up", - slog.String("bridge", m.config.BridgeName), - ) - - m.bridgeCreated = true - return nil -} - -// setupNAT configures iptables NAT rules -func (m *Manager) setupNAT() error { - m.logger.Info("setting up NAT rules") - - // Get the default route interface - m.logger.Info("listing routes to find default interface") - routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err != nil { - m.logger.Error("failed to list routes", - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to list routes: %w", err) - } - m.logger.Info("found routes", - slog.Int("count", len(routes)), - ) - - var defaultIface string - for _, route := range routes { - if route.Dst == nil { // Default route - m.logger.Info("found default route", - slog.Int("link_index", route.LinkIndex), - ) - link, err := netlink.LinkByIndex(route.LinkIndex) - if err == nil { - defaultIface = link.Attrs().Name - m.logger.Info("identified default 
interface", - slog.String("interface", defaultIface), - slog.String("type", link.Type()), - slog.String("state", link.Attrs().OperState.String()), - ) - break - } else { - m.logger.Warn("failed to get link for default route", - slog.Int("link_index", route.LinkIndex), - slog.String("error", err.Error()), - ) - } - } - } - - if defaultIface == "" { - m.logger.Error("could not find default route interface", - slog.Int("routes_checked", len(routes)), - ) - return fmt.Errorf("could not find default route interface") - } - - // Setup NAT rules - rules := [][]string{ - // Enable NAT for VM subnet - {"-t", "nat", "-A", "POSTROUTING", "-s", m.config.VMSubnet, "-o", defaultIface, "-j", "MASQUERADE"}, - - // Allow forwarding from bridge to external - {"-A", "FORWARD", "-i", m.config.BridgeName, "-o", defaultIface, "-j", "ACCEPT"}, - - // Allow established connections back - {"-A", "FORWARD", "-i", defaultIface, "-o", m.config.BridgeName, "-m", "state", "--state", "RELATED,ESTABLISHED", "-j", "ACCEPT"}, - - // Allow VM to VM communication - {"-A", "FORWARD", "-i", m.config.BridgeName, "-o", m.config.BridgeName, "-j", "ACCEPT"}, - } - - for i, rule := range rules { - ruleStr := strings.Join(rule, " ") - m.logger.Info("adding iptables rule", - slog.Int("rule_number", i+1), - slog.String("rule", ruleStr), - ) - - cmd := exec.Command("iptables", rule...) 
- if output, err := cmd.CombinedOutput(); err != nil { - m.logger.Error("failed to add iptables rule", - slog.String("rule", ruleStr), - slog.String("error", err.Error()), - slog.String("output", string(output)), - ) - // Try to clean up on failure - m.cleanupIPTables() - return fmt.Errorf("failed to add iptables rule %v: %w", rule, err) - } - m.logger.Info("iptables rule added successfully", - slog.String("rule", ruleStr), - ) - m.iptablesRules = append(m.iptablesRules, ruleStr) - } - - return nil -} - -// CreateVMNetwork sets up networking for a VM -func (m *Manager) CreateVMNetwork(ctx context.Context, vmID string) (*VMNetwork, error) { - // Default namespace name - will be overridden in CreateVMNetworkWithNamespace - // if empty to use consistent device naming - return m.CreateVMNetworkWithNamespace(ctx, vmID, "") -} - -// CreateVMNetworkWithNamespace sets up networking for a VM with a specific namespace name -func (m *Manager) CreateVMNetworkWithNamespace(ctx context.Context, vmID, nsName string) (*VMNetwork, error) { - startTime := time.Now() - - m.logger.InfoContext(ctx, "creating VM network", - slog.String("vm_id", vmID), - slog.String("namespace", nsName), - ) - m.logNetworkState("before VM network creation") - - m.mu.Lock() - defer m.mu.Unlock() - - // Check if network already exists - if existing, exists := m.vmNetworks[vmID]; exists { - m.logger.WarnContext(ctx, "VM network already exists", - slog.String("vm_id", vmID), - slog.String("ip", existing.IPAddress.String()), - ) - return existing, nil - } - - // Generate internal network ID for device naming - // AIDEV-NOTE: This ensures consistent naming across all network devices - networkID, err := m.idGen.GenerateNetworkID() - if err != nil { - m.metrics.RecordVMNetworkCreate(ctx, m.config.BridgeName, false) - m.metrics.RecordNetworkSetupDuration(ctx, time.Since(startTime), m.config.BridgeName, false) - return nil, fmt.Errorf("failed to generate network ID: %w", err) - } - - // Generate device names using 
consistent naming convention - deviceNames := GenerateDeviceNames(networkID) - - // Allocate IP address - ip, err := m.allocator.AllocateIP() - if err != nil { - m.idGen.ReleaseID(networkID) - m.metrics.RecordVMNetworkCreate(ctx, m.config.BridgeName, false) - m.metrics.RecordNetworkSetupDuration(ctx, time.Since(startTime), m.config.BridgeName, false) - return nil, fmt.Errorf("failed to allocate IP: %w", err) - } - - // Generate MAC address - mac := m.generateMAC(vmID) - - // Override namespace name if provided (e.g., by jailer) - // AIDEV-NOTE: CRITICAL FIX - Use deviceNames.Namespace when nsName is empty to ensure - // namespace name matches the veth device names (vn_{networkID}). This prevents - // "no such device" errors when configuring veth inside the namespace. - actualNsName := nsName - if actualNsName == "" { - actualNsName = deviceNames.Namespace - } - - // Create network namespace if it doesn't exist - // It might have been pre-created by the jailer - if err := m.createNamespace(actualNsName); err != nil { - m.allocator.ReleaseIP(ip) - m.idGen.ReleaseID(networkID) - return nil, fmt.Errorf("failed to create namespace: %w", err) - } - - // Create TAP device and configure networking - if err := m.setupVMNetworking(actualNsName, deviceNames, ip, mac); err != nil { - m.allocator.ReleaseIP(ip) - m.idGen.ReleaseID(networkID) - m.deleteNamespace(actualNsName) - return nil, fmt.Errorf("failed to setup VM networking: %w", err) - } - - // Create VM network info - _, subnet, _ := net.ParseCIDR(m.config.VMSubnet) - gateway := make(net.IP, len(subnet.IP)) - copy(gateway, subnet.IP) - gateway[len(gateway)-1] = 1 - - vmNet := &VMNetwork{ //nolint:exhaustruct // VLANID, IPv6Address, and Routes fields use appropriate zero values - VMID: vmID, - NetworkID: networkID, - Namespace: actualNsName, - TapDevice: deviceNames.TAP, - IPAddress: ip, - Netmask: net.IPv4Mask(255, 255, 0, 0), // /16 to match subnet - Gateway: gateway, - MacAddress: mac, - DNSServers: 
m.config.DNSServers, - CreatedAt: time.Now(), - } - - m.vmNetworks[vmID] = vmNet - - // Record successful network creation metrics - duration := time.Since(startTime) - m.metrics.RecordVMNetworkCreate(ctx, m.config.BridgeName, true) - m.metrics.RecordNetworkSetupDuration(ctx, duration, m.config.BridgeName, true) - - m.logger.InfoContext(ctx, "created VM network", - slog.String("vm_id", vmID), - slog.String("ip", ip.String()), - slog.String("mac", mac), - slog.String("tap", deviceNames.TAP), - slog.String("namespace", actualNsName), - slog.String("network_id", networkID), - slog.Duration("setup_duration", duration), - ) - - return vmNet, nil -} - -// setupVMNetworking configures the network namespace and TAP device -func (m *Manager) setupVMNetworking(nsName string, deviceNames *NetworkDeviceNames, ip net.IP, mac string) error { - // AIDEV-NOTE: Now running as root, no need for nsenter workarounds - - // Use device names from the consistent naming convention - vethHost := deviceNames.VethHost - vethNS := deviceNames.VethNS - - // Create veth pair using netlink (preferred when running as root) - veth := &netlink.Veth{ //nolint:exhaustruct // Only setting required fields, other veth fields use appropriate defaults - LinkAttrs: netlink.LinkAttrs{Name: vethHost}, //nolint:exhaustruct // Only setting Name field, other link attributes use appropriate defaults - PeerName: vethNS, - } - - m.logger.Info("creating veth pair", - slog.String("host_end", vethHost), - slog.String("ns_end", vethNS), - slog.String("namespace", nsName), - slog.Time("timestamp", time.Now()), - ) - - if err := netlink.LinkAdd(veth); err != nil { - m.logger.Error("failed to create veth pair", - slog.String("host_end", vethHost), - slog.String("ns_end", vethNS), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to create veth pair: %w", err) - } - - m.logger.Info("veth pair created successfully", - slog.String("host_end", vethHost), - 
slog.String("ns_end", vethNS), - slog.Time("timestamp", time.Now()), - ) - - // AIDEV-NOTE: Ensure cleanup on any error after veth creation - cleanupVeth := true - defer func() { - if cleanupVeth { - if link, err := netlink.LinkByName(vethHost); err == nil { - if delErr := netlink.LinkDel(link); delErr != nil { - m.logger.Warn("Failed to cleanup veth pair on error", "device", vethHost, "error", delErr) - } - } - } - }() - - // Get the namespace - ns, err := netns.GetFromName(nsName) - if err != nil { - // Clean up veth pair - if vethLink, err2 := netlink.LinkByName(vethHost); err2 == nil { - if delErr := netlink.LinkDel(vethLink); delErr != nil { - m.logger.Warn("Failed to cleanup veth link", "link", vethHost, "error", delErr) - } - } - return fmt.Errorf("failed to get namespace: %w", err) - } - defer ns.Close() - - // Move veth peer to namespace - // Sometimes the link takes a moment to appear, retry a few times - m.logger.Info("looking for veth peer to move to namespace", - slog.String("device", vethNS), - slog.Time("timestamp", time.Now()), - ) - - var vethNSLink netlink.Link - for i := 0; i < 3; i++ { - vethNSLink, err = netlink.LinkByName(vethNS) - if err == nil { - m.logger.Info("found veth peer", - slog.String("device", vethNS), - slog.Int("attempt", i+1), - slog.Time("timestamp", time.Now()), - ) - break - } - m.logger.Warn("veth peer not found, retrying", - slog.String("device", vethNS), - slog.Int("attempt", i+1), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - if i < 2 { - time.Sleep(100 * time.Millisecond) - } - } - if err != nil { - // Clean up veth pair - if vethLink, err2 := netlink.LinkByName(vethHost); err2 == nil { - if delErr := netlink.LinkDel(vethLink); delErr != nil { - m.logger.Warn("Failed to cleanup veth link", "link", vethHost, "error", delErr) - } - } - return fmt.Errorf("failed to get veth peer %s: %w", vethNS, err) - } - - // Check if both veth ends exist before moving - hostLink, err := 
netlink.LinkByName(vethHost) - if err != nil { - m.logger.Error("veth host side missing before move", - slog.String("device", vethHost), - slog.String("error", err.Error()), - ) - return fmt.Errorf("veth host side missing: %w", err) - } - m.logger.Debug("veth host side exists before move", - slog.String("device", vethHost), - slog.Int("index", hostLink.Attrs().Index), - ) - - m.logger.Info("moving veth to namespace", - slog.String("device", vethNS), - slog.String("namespace", nsName), - slog.Time("timestamp", time.Now()), - ) - - if err := netlink.LinkSetNsFd(vethNSLink, int(ns)); err != nil { - m.logger.Error("failed to move veth to namespace", - slog.String("device", vethNS), - slog.String("namespace", nsName), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - // Clean up veth pair - if vethLink, err2 := netlink.LinkByName(vethHost); err2 == nil { - if delErr := netlink.LinkDel(vethLink); delErr != nil { - m.logger.Warn("Failed to cleanup veth link", "link", vethHost, "error", delErr) - } - } - return fmt.Errorf("failed to move veth to namespace: %w", err) - } - - m.logger.Info("veth moved to namespace successfully", - slog.String("device", vethNS), - slog.String("namespace", nsName), - slog.Time("timestamp", time.Now()), - ) - - // Check if host side still exists after move - if _, err := netlink.LinkByName(vethHost); err != nil { - m.logger.Error("veth host side disappeared after moving peer to namespace!", - slog.String("device", vethHost), - slog.String("error", err.Error()), - ) - // List all interfaces to debug - links, _ := netlink.LinkList() - linkNames := make([]string, 0, len(links)) - for _, link := range links { - linkNames = append(linkNames, link.Attrs().Name) - } - m.logger.Error("available interfaces after move", - slog.Any("interfaces", linkNames), - ) - return fmt.Errorf("veth host side disappeared: %w", err) - } - - // Attach host end to bridge - m.logger.Info("attaching veth to bridge", - slog.String("veth", 
vethHost), - slog.String("bridge", m.config.BridgeName), - slog.Time("timestamp", time.Now()), - ) - - // List all interfaces before trying to get veth host - beforeLinks, _ := netlink.LinkList() - beforeNames := make([]string, 0, len(beforeLinks)) - for _, link := range beforeLinks { - beforeNames = append(beforeNames, link.Attrs().Name) - } - m.logger.Debug("interfaces before getting veth host", - slog.Any("interfaces", beforeNames), - ) - - vethHostLink, err2 := netlink.LinkByName(vethHost) - if err2 != nil { - m.logger.Error("failed to get veth host", - slog.String("device", vethHost), - slog.String("error", err2.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to get veth host: %w", err2) - } - - bridge, err2 := netlink.LinkByName(m.config.BridgeName) - if err2 != nil { - m.logger.Error("failed to get bridge", - slog.String("bridge", m.config.BridgeName), - slog.String("error", err2.Error()), - slog.Time("timestamp", time.Now()), - ) - // List all links to debug - links, _ := netlink.LinkList() - linkNames := make([]string, 0, len(links)) - for _, link := range links { - linkNames = append(linkNames, link.Attrs().Name) - } - m.logger.Error("available network interfaces", - slog.Any("interfaces", linkNames), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to get bridge: %w", err2) - } - - if err2 := netlink.LinkSetMaster(vethHostLink, bridge); err2 != nil { - m.logger.Error("failed to attach veth to bridge", - slog.String("veth", vethHost), - slog.String("bridge", m.config.BridgeName), - slog.String("error", err2.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to attach veth to bridge: %w", err2) - } - - m.logger.Info("veth attached to bridge successfully", - slog.String("veth", vethHost), - slog.String("bridge", m.config.BridgeName), - slog.Time("timestamp", time.Now()), - ) - - // Bring up the veth host interface - if err := netlink.LinkSetUp(vethHostLink); err != nil { - 
return fmt.Errorf("failed to bring up veth host: %w", err) - } - - // Create TAP device in host namespace (so firecracker can access it) - if err := m.createTAPDevice(deviceNames.TAP, mac); err != nil { - return fmt.Errorf("failed to create TAP device: %w", err) - } - - // Configure inside namespace - if err := m.configureNamespace(ns, vethNS, ip); err != nil { - return err - } - - // Success - don't cleanup veth - cleanupVeth = false - return nil -} - -// createTAPDevice creates a TAP device in the host namespace -func (m *Manager) createTAPDevice(tapName, mac string) error { - // Create TAP device - tap := &netlink.Tuntap{ //nolint:exhaustruct // Only setting required fields, other tap fields use appropriate defaults - LinkAttrs: netlink.LinkAttrs{ //nolint:exhaustruct // Only setting Name field, other link attributes use appropriate defaults - Name: tapName, - }, - Mode: netlink.TUNTAP_MODE_TAP, - } - - m.logger.Info("creating TAP device", - slog.String("tap", tapName), - slog.Time("timestamp", time.Now()), - ) - - if err := netlink.LinkAdd(tap); err != nil { - m.logger.Error("failed to create tap device", - slog.String("tap", tapName), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to create tap device: %w", err) - } - - m.logger.Info("TAP device created successfully", - slog.String("tap", tapName), - slog.Time("timestamp", time.Now()), - ) - - // Set MAC address on TAP - m.logger.Info("getting tap link to set MAC", - slog.String("tap", tapName), - slog.Time("timestamp", time.Now()), - ) - - tapLink, err := netlink.LinkByName(tapName) - if err != nil { - m.logger.Error("failed to get tap link", - slog.String("tap", tapName), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to get tap link: %w", err) - } - - hwAddr, _ := net.ParseMAC(mac) - m.logger.Info("setting MAC on tap device", - slog.String("tap", tapName), - slog.String("mac", mac), - 
slog.Time("timestamp", time.Now()), - ) - - if err := netlink.LinkSetHardwareAddr(tapLink, hwAddr); err != nil { - m.logger.Error("failed to set MAC on tap device", - slog.String("tap", tapName), - slog.String("mac", mac), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to set MAC on tap device: %w", err) - } - - // Bring TAP device up - if err := netlink.LinkSetUp(tapLink); err != nil { - m.logger.Error("failed to bring up tap device", - slog.String("tap", tapName), - slog.String("error", err.Error()), - slog.Time("timestamp", time.Now()), - ) - return fmt.Errorf("failed to bring up tap device: %w", err) - } - - m.logger.Info("TAP device configured successfully", - slog.String("tap", tapName), - slog.String("mac", mac), - slog.Time("timestamp", time.Now()), - ) - - return nil -} - -// configureNamespace sets up networking inside the namespace (veth only) -func (m *Manager) configureNamespace(ns netns.NsHandle, vethName string, ip net.IP) error { - // Save current namespace - origNS, err := netns.Get() - if err != nil { - return fmt.Errorf("failed to get current namespace: %w", err) - } - defer origNS.Close() - - // Switch to target namespace - if setErr := netns.Set(ns); setErr != nil { - return fmt.Errorf("failed to set namespace: %w", setErr) - } - defer func() { - if setErr := netns.Set(origNS); setErr != nil { - slog.Error("Failed to restore namespace", "error", setErr) - } - }() - - // Get veth link - vethLink, err := netlink.LinkByName(vethName) - if err != nil { - return fmt.Errorf("failed to get veth link: %w", err) - } - - // AIDEV-NOTE: Simplified networking - no bridge needed inside namespace - // The veth device will handle routing between host and VM - // The TAP device is created in the host namespace for firecracker access - - // Bring up veth interface - if err := netlink.LinkSetUp(vethLink); err != nil { - return fmt.Errorf("failed to bring up veth: %w", err) - } - - // Add IP directly to 
veth interface - // AIDEV-NOTE: The veth acts as the default gateway for the VM - // Using /16 to match the host bridge subnet - addr := &netlink.Addr{ //nolint:exhaustruct // Only setting IPNet field, other address fields use appropriate defaults - IPNet: &net.IPNet{ - IP: ip, - Mask: net.CIDRMask(16, 32), // Use /16 to match the bridge subnet - }, - } - - // AIDEV-NOTE: Retry adding IP to handle race conditions where veth might not be immediately ready - var addErr error - for i := 0; i < 5; i++ { - addErr = netlink.AddrAdd(vethLink, addr) - if addErr == nil { - break - } - - // Check if it's a "no such device" error specifically - if strings.Contains(addErr.Error(), "no such device") { - m.logger.Warn("veth device not ready for IP assignment, retrying", - slog.String("veth", vethName), - slog.Int("attempt", i+1), - slog.String("error", addErr.Error()), - ) - time.Sleep(50 * time.Millisecond) - - // Re-get the veth link in the current namespace context (we're already in the target namespace) - vethLink, err = netlink.LinkByName(vethName) - if err != nil { - // Log available interfaces for debugging - if links, listErr := netlink.LinkList(); listErr == nil { - var linkNames []string - for _, link := range links { - linkNames = append(linkNames, link.Attrs().Name) - } - m.logger.Error("available interfaces in namespace during retry", - slog.String("veth", vethName), - slog.Int("attempt", i+1), - slog.Any("interfaces", linkNames), - ) - } - return fmt.Errorf("failed to re-get veth link on retry %d: %w", i+1, err) - } - continue - } - - // For other errors, don't retry - break - } - - if addErr != nil { - return fmt.Errorf("failed to add IP to veth after retries: %w", addErr) - } - - // Enable proxy ARP on veth so it responds to ARP requests for the VM - // AIDEV-NOTE: This is necessary when not using a bridge - proxyARPPath := fmt.Sprintf("/proc/sys/net/ipv4/conf/%s/proxy_arp", vethName) - if err := os.WriteFile(proxyARPPath, []byte("1\n"), 0600); err != nil { - 
m.logger.Warn("failed to enable proxy ARP on veth", - slog.String("veth", vethName), - slog.String("error", err.Error()), - ) - // Non-fatal - continue anyway - } - - // Add default route - _, subnet, _ := net.ParseCIDR(m.config.VMSubnet) - gateway := make(net.IP, len(subnet.IP)) - copy(gateway, subnet.IP) - gateway[len(gateway)-1] = 1 - - route := &netlink.Route{ //nolint:exhaustruct // Only setting Dst and Gw fields for default route, other route fields use appropriate defaults - Dst: nil, // default route - Gw: gateway, - } - if err := netlink.RouteAdd(route); err != nil && !strings.Contains(err.Error(), "exists") { - return fmt.Errorf("failed to add default route: %w", err) - } - - return nil -} - -// applyRateLimit applies traffic shaping to the interface -// -//nolint:unused // Reserved for future rate limiting implementation -func (m *Manager) applyRateLimit(link netlink.Link, mbps int) { - // Use tc (traffic control) to limit bandwidth - // This is a simplified example - production would use netlink directly - - // Validate interface name to prevent command injection - ifaceName := link.Attrs().Name - if !isValidInterfaceName(ifaceName) { - m.logger.Error("invalid interface name", - slog.String("interface", ifaceName), - ) - return - } - - // Delete any existing qdisc (ignore errors as it might not exist) - _ = exec.Command("tc", "qdisc", "del", "dev", ifaceName, "root").Run() //nolint:gosec // Interface name validated - - // Add HTB qdisc - cmd := exec.Command("tc", "qdisc", "add", "dev", ifaceName, "root", "handle", "1:", "htb") //nolint:gosec // Interface name validated - if err := cmd.Run(); err != nil { - m.logger.Warn("failed to add HTB qdisc", - slog.String("interface", ifaceName), - slog.String("error", err.Error()), - ) - return // Non-fatal - } - - // Add rate limit class - rate := fmt.Sprintf("%dmbit", mbps) - cmd = exec.Command("tc", "class", "add", "dev", ifaceName, - "parent", "1:", "classid", "1:1", "htb", "rate", rate) //nolint:gosec // 
Interface name validated - if err := cmd.Run(); err != nil { - m.logger.Warn("failed to add rate limit", - slog.String("interface", ifaceName), - slog.String("error", err.Error()), - ) - } -} - -// DeleteVMNetwork removes networking for a VM -func (m *Manager) DeleteVMNetwork(ctx context.Context, vmID string) error { - startTime := time.Now() - - m.logger.InfoContext(ctx, "deleting VM network", - slog.String("vm_id", vmID), - ) - - m.mu.Lock() - defer m.mu.Unlock() - - vmNet, exists := m.vmNetworks[vmID] - if !exists { - m.logger.InfoContext(ctx, "VM network already deleted", - slog.String("vm_id", vmID), - ) - return nil // Already deleted - } - - // Release IP - m.allocator.ReleaseIP(vmNet.IPAddress) - - // Delete network namespace FIRST - // AIDEV-NOTE: Deleting namespace automatically cleans up all interfaces inside it - // This prevents issues with trying to delete interfaces that no longer exist - m.deleteNamespace(vmNet.Namespace) - - // Delete veth pair (if it still exists on host) - // AIDEV-NOTE: After namespace deletion, only the host side of veth pair remains - deviceNames := GenerateDeviceNames(vmNet.NetworkID) - if link, err := netlink.LinkByName(deviceNames.VethHost); err == nil { - if delErr := netlink.LinkDel(link); delErr != nil { - m.logger.WarnContext(ctx, "Failed to delete veth pair", "device", deviceNames.VethHost, "error", delErr) - } else { - m.logger.InfoContext(ctx, "Deleted veth pair", "device", deviceNames.VethHost) - } - } - - // AIDEV-NOTE: Delete TAP device (CRITICAL FIX - this was missing!) 
- // TAP devices are created in host namespace for Firecracker access and must be explicitly cleaned up - if link, err := netlink.LinkByName(deviceNames.TAP); err == nil { - if delErr := netlink.LinkDel(link); delErr != nil { - m.logger.WarnContext(ctx, "Failed to delete TAP device", - "device", deviceNames.TAP, "error", delErr) - } else { - m.logger.InfoContext(ctx, "Deleted TAP device", "device", deviceNames.TAP) - } - } - - // Verify cleanup completed successfully - if err := m.verifyNetworkCleanup(ctx, vmID, deviceNames); err != nil { - m.logger.WarnContext(ctx, "Network cleanup verification failed", - "vm_id", vmID, "error", err) - } - - // Release the network ID for reuse - m.idGen.ReleaseID(vmNet.NetworkID) - - delete(m.vmNetworks, vmID) - - // Record successful network deletion metrics - duration := time.Since(startTime) - m.metrics.RecordVMNetworkDelete(ctx, m.config.BridgeName, true) - m.metrics.RecordNetworkCleanupDuration(ctx, duration, m.config.BridgeName, true) - - m.logger.InfoContext(ctx, "deleted VM network", - slog.String("vm_id", vmID), - slog.String("network_id", vmNet.NetworkID), - slog.String("ip", vmNet.IPAddress.String()), - slog.Duration("cleanup_duration", duration), - ) - - return nil -} - -// verifyNetworkCleanup verifies that all network resources for a VM have been properly cleaned up -func (m *Manager) verifyNetworkCleanup(ctx context.Context, vmID string, deviceNames *NetworkDeviceNames) error { - var remainingResources []string - - // Check if TAP device still exists - if _, err := netlink.LinkByName(deviceNames.TAP); err == nil { - remainingResources = append(remainingResources, fmt.Sprintf("TAP device: %s", deviceNames.TAP)) - } - - // Check if veth host device still exists - if _, err := netlink.LinkByName(deviceNames.VethHost); err == nil { - remainingResources = append(remainingResources, fmt.Sprintf("veth device: %s", deviceNames.VethHost)) - } - - // Check if namespace still exists - if 
m.namespaceExists(deviceNames.Namespace) { - remainingResources = append(remainingResources, fmt.Sprintf("namespace: %s", deviceNames.Namespace)) - } - - if len(remainingResources) > 0 { - m.logger.WarnContext(ctx, "Cleanup verification detected remaining resources", - "vm_id", vmID, - "remaining_resources", remainingResources, - ) - return fmt.Errorf("cleanup incomplete: %d resources remain: %v", len(remainingResources), remainingResources) - } - - m.logger.InfoContext(ctx, "Network cleanup verification passed", "vm_id", vmID) - return nil -} - -// namespaceExists checks if a network namespace exists -func (m *Manager) namespaceExists(namespace string) bool { - // Try to get the namespace - if it exists, this won't error - if _, err := netns.GetFromName(namespace); err != nil { - return false - } - return true -} - -// GetVMNetwork returns network information for a VM -func (m *Manager) GetVMNetwork(vmID string) (*VMNetwork, error) { - m.mu.RLock() - defer m.mu.RUnlock() - - vmNet, exists := m.vmNetworks[vmID] - if !exists { - return nil, fmt.Errorf("network not found for VM %s", vmID) - } - - return vmNet, nil -} - -// Shutdown cleans up all networking resources -func (m *Manager) Shutdown(ctx context.Context) error { - m.logger.InfoContext(ctx, "shutting down network manager") - m.logNetworkState("before shutdown") - - // Stop host protection first - if err := m.hostProtection.Stop(ctx); err != nil { - m.logger.WarnContext(ctx, "failed to stop host protection", - slog.String("error", err.Error()), - ) - } - - // Delete all VM networks - vmCount := len(m.vmNetworks) - m.logger.InfoContext(ctx, "cleaning up VM networks", - slog.Int("count", vmCount), - ) - for vmID := range m.vmNetworks { - if err := m.DeleteVMNetwork(ctx, vmID); err != nil { - m.logger.ErrorContext(ctx, "failed to delete VM network during shutdown", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - } - } - - // Clean up iptables rules - m.logger.InfoContext(ctx, "cleaning 
up iptables rules", - slog.Int("rule_count", len(m.iptablesRules)), - ) - m.cleanupIPTables() - - // AIDEV-NOTE: We intentionally keep the bridge to avoid network disruption - // Deleting the bridge can cause host network issues if there are dependencies - m.logger.InfoContext(ctx, "keeping bridge intact to avoid network disruption", - slog.String("bridge", m.config.BridgeName), - slog.Bool("bridge_created", m.bridgeCreated), - ) - - m.logNetworkState("after shutdown") - m.logger.InfoContext(ctx, "network manager shutdown complete") - - return nil -} - -// Helper functions - -func (m *Manager) createNamespace(name string) error { - // Check if namespace already exists - if _, err := netns.GetFromName(name); err == nil { - m.logger.Debug("namespace already exists", slog.String("namespace", name)) - return nil // Already exists - } - - // Save current namespace to ensure we don't accidentally switch - origNS, err := netns.Get() - if err != nil { - return fmt.Errorf("failed to get current namespace: %w", err) - } - defer origNS.Close() - - m.logger.Info("creating network namespace", slog.String("namespace", name)) - - // Create new namespace - newNS, err := netns.NewNamed(name) - if err != nil { - m.logger.Error("failed to create namespace", - slog.String("namespace", name), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to create namespace %s: %w", name, err) - } - newNS.Close() // Close the handle immediately, we don't need it - - // Ensure we're back in the original namespace - if err := netns.Set(origNS); err != nil { - m.logger.Error("failed to restore original namespace after creation", - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to restore namespace: %w", err) - } - - m.logger.Info("namespace created successfully", slog.String("namespace", name)) - - // Create namespace directory for runtime data - nsDir := filepath.Join("/var/run/netns", name) - if err := os.MkdirAll(filepath.Dir(nsDir), 0755); err != nil { - 
return fmt.Errorf("failed to create namespace directory: %w", err) - } - - return nil -} - -func (m *Manager) deleteNamespace(name string) { - if err := netns.DeleteNamed(name); err != nil { - m.logger.Warn("Failed to delete namespace", "namespace", name, "error", err) - } -} - -func (m *Manager) generateMAC(vmID string) string { - // Generate deterministic MAC from VM ID - h := fnv.New32a() - h.Write([]byte(vmID)) - hash := h.Sum32() - - // Use locally administered MAC prefix (02:xx:xx:xx:xx:xx) - return fmt.Sprintf("02:00:%02x:%02x:%02x:%02x", - (hash>>24)&0xff, - (hash>>16)&0xff, - (hash>>8)&0xff, - hash&0xff, - ) -} - -func (m *Manager) cleanupIPTables() { - m.logger.Info("starting iptables cleanup", - slog.Int("rules_to_remove", len(m.iptablesRules)), - ) - - // Remove our iptables rules in reverse order - for i := len(m.iptablesRules) - 1; i >= 0; i-- { - rule := m.iptablesRules[i] - // Convert -A to -D to delete the rule - deleteRule := strings.Replace(rule, "-A", "-D", 1) - args := strings.Fields(deleteRule) - - m.logger.Info("removing iptables rule", - slog.Int("rule_index", i), - slog.String("original_rule", rule), - slog.String("delete_command", strings.Join(args, " ")), - ) - - cmd := exec.Command("iptables", args...) 
- if output, err := cmd.CombinedOutput(); err != nil { - m.logger.Warn("failed to remove iptables rule", - slog.String("rule", rule), - slog.String("error", err.Error()), - slog.String("output", string(output)), - ) - } else { - m.logger.Info("iptables rule removed successfully", - slog.String("rule", rule), - ) - } - } - m.iptablesRules = nil - m.logger.Info("iptables cleanup completed") -} - -// GetNetworkStats returns network statistics for a VM -func (m *Manager) GetNetworkStats(vmID string) (*NetworkStats, error) { - m.mu.RLock() - vmNet, exists := m.vmNetworks[vmID] - m.mu.RUnlock() - - if !exists { - return nil, fmt.Errorf("network not found for VM %s", vmID) - } - - // Get stats from the TAP device in the namespace - ns, err := netns.GetFromName(vmNet.Namespace) - if err != nil { - return nil, fmt.Errorf("failed to get namespace: %w", err) - } - defer ns.Close() - - origNS, err := netns.Get() - if err != nil { - return nil, fmt.Errorf("failed to get current namespace: %w", err) - } - defer origNS.Close() - - if setErr := netns.Set(ns); setErr != nil { - return nil, fmt.Errorf("failed to set namespace: %w", setErr) - } - defer func() { - if setErr := netns.Set(origNS); setErr != nil { - slog.Error("Failed to restore namespace", "error", setErr) - } - }() - - // Get TAP device stats - link, err := netlink.LinkByName(vmNet.TapDevice) - if err != nil { - return nil, fmt.Errorf("failed to get tap device: %w", err) - } - - stats := link.Attrs().Statistics - if stats == nil { - return nil, fmt.Errorf("no statistics available") - } - - return &NetworkStats{ - RxBytes: stats.RxBytes, - TxBytes: stats.TxBytes, - RxPackets: stats.RxPackets, - TxPackets: stats.TxPackets, - RxDropped: stats.RxDropped, - TxDropped: stats.TxDropped, - RxErrors: stats.RxErrors, - TxErrors: stats.TxErrors, - }, nil -} - -// isValidInterfaceName validates that an interface name is safe to use in commands -// -//nolint:unused // Used by applyRateLimit function which is reserved for future 
implementation -func isValidInterfaceName(name string) bool { - // Linux interface names must be 1-15 characters - if len(name) == 0 || len(name) > 15 { - return false - } - - // Must contain only alphanumeric, dash, underscore, or dot - for _, ch := range name { - if (ch < 'a' || ch > 'z') && - (ch < 'A' || ch > 'Z') && - (ch < '0' || ch > '9') && - ch != '-' && ch != '_' && ch != '.' { - return false - } - } - - return true -} - -// logNetworkState logs the current state of network interfaces and routes -func (m *Manager) logNetworkState(context string) { - m.logger.Info("network state check", - slog.String("context", context), - ) - - // Check bridge state - if link, err := netlink.LinkByName(m.config.BridgeName); err == nil { - addrs, _ := netlink.AddrList(link, netlink.FAMILY_V4) - var addrStrs []string - for _, addr := range addrs { - addrStrs = append(addrStrs, addr.IPNet.String()) - } - m.logger.Info("bridge state", - slog.String("bridge", m.config.BridgeName), - slog.String("state", link.Attrs().OperState.String()), - slog.String("flags", link.Attrs().Flags.String()), - slog.Any("addresses", addrStrs), - ) - } else { - m.logger.Info("bridge not found", - slog.String("bridge", m.config.BridgeName), - slog.String("error", err.Error()), - ) - } - - // List all interfaces - links, err := netlink.LinkList() - if err == nil { - var interfaces []string - for _, link := range links { - interfaces = append(interfaces, fmt.Sprintf("%s(%s)", link.Attrs().Name, link.Attrs().OperState.String())) - } - m.logger.Info("all interfaces", - slog.Any("interfaces", interfaces), - ) - } - - // Check default route - routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err == nil { - for _, route := range routes { - if route.Dst == nil { - if link, err := netlink.LinkByIndex(route.LinkIndex); err == nil { - m.logger.Info("default route", - slog.String("interface", link.Attrs().Name), - slog.String("gateway", route.Gw.String()), - ) - } - } - } - } -} - -// AIDEV-NOTE: 
Port management methods for container-like networking - -// AllocatePortsForVM allocates host ports for container ports based on metadata -func (m *Manager) AllocatePortsForVM(vmID string, exposedPorts []string) ([]PortMapping, error) { - m.mu.Lock() - defer m.mu.Unlock() - - var mappings []PortMapping - - for _, portSpec := range exposedPorts { - // Parse port format: can be "80", "80/tcp", "80/udp" - parts := strings.Split(portSpec, "/") - if len(parts) == 0 { - continue - } - - var containerPort int - protocol := "tcp" // default - - if _, err := fmt.Sscanf(parts[0], "%d", &containerPort); err != nil { - m.logger.Warn("invalid port format", - slog.String("port_spec", portSpec), - slog.String("error", err.Error()), - ) - continue - } - - if len(parts) > 1 { - protocol = strings.ToLower(parts[1]) - } - - // Allocate host port - hostPort, err := m.portAllocator.AllocatePort(vmID, containerPort, protocol) - if err != nil { - // Clean up any already allocated ports - m.releaseVMPortsLocked(vmID) - return nil, fmt.Errorf("failed to allocate port %s for VM %s: %w", portSpec, vmID, err) - } - - mapping := PortMapping{ - ContainerPort: containerPort, - HostPort: hostPort, - Protocol: protocol, - VMID: vmID, - } - mappings = append(mappings, mapping) - - m.logger.Info("allocated port mapping", - slog.String("vm_id", vmID), - slog.Int("container_port", containerPort), - slog.Int("host_port", hostPort), - slog.String("protocol", protocol), - ) - } - - return mappings, nil -} - -// ReleaseVMPorts releases all ports allocated to a VM -func (m *Manager) ReleaseVMPorts(vmID string) []PortMapping { - m.mu.Lock() - defer m.mu.Unlock() - - return m.releaseVMPortsLocked(vmID) -} - -// releaseVMPortsLocked releases VM ports with lock already held -func (m *Manager) releaseVMPortsLocked(vmID string) []PortMapping { - mappings := m.portAllocator.ReleaseVMPorts(vmID) - - for _, mapping := range mappings { - m.logger.Info("released port mapping", - slog.String("vm_id", vmID), - 
slog.Int("container_port", mapping.ContainerPort), - slog.Int("host_port", mapping.HostPort), - slog.String("protocol", mapping.Protocol), - ) - } - - return mappings -} - -// GetVMPorts returns all port mappings for a VM -func (m *Manager) GetVMPorts(vmID string) []PortMapping { - m.mu.RLock() - defer m.mu.RUnlock() - - return m.portAllocator.GetVMPorts(vmID) -} - -// GetPortVM returns the VM ID that has allocated the given host port -func (m *Manager) GetPortVM(hostPort int) (string, bool) { - m.mu.RLock() - defer m.mu.RUnlock() - - return m.portAllocator.GetPortVM(hostPort) -} - -// IsPortAllocated checks if a host port is allocated -func (m *Manager) IsPortAllocated(hostPort int) bool { - m.mu.RLock() - defer m.mu.RUnlock() - - return m.portAllocator.IsPortAllocated(hostPort) -} - -// GetPortAllocationStats returns port allocation statistics -func (m *Manager) GetPortAllocationStats() (allocated, available int) { - m.mu.RLock() - defer m.mu.RUnlock() - - return m.portAllocator.GetAllocatedCount(), m.portAllocator.GetAvailableCount() -} - -// CleanupOrphanedResources performs administrative cleanup of orphaned network resources -// This function scans for and removes network interfaces that are no longer associated with active VMs -func (m *Manager) CleanupOrphanedResources(ctx context.Context, dryRun bool) (*CleanupReport, error) { - m.logger.InfoContext(ctx, "starting orphaned resource cleanup", - slog.Bool("dry_run", dryRun), - ) - - report := &CleanupReport{ - DryRun: dryRun, - } - - // Get all network links - links, err := netlink.LinkList() - if err != nil { - return nil, fmt.Errorf("failed to list network interfaces: %w", err) - } - - // Find orphaned TAP devices - for _, link := range links { - name := link.Attrs().Name - if strings.HasPrefix(name, "tap_") && len(name) == 12 { // tap_<8-char-id> - networkID := name[4:] // Extract the 8-char ID - if !m.isNetworkIDActive(networkID) { - report.OrphanedTAPs = append(report.OrphanedTAPs, name) - if !dryRun { 
- if delErr := netlink.LinkDel(link); delErr != nil { - report.Errors = append(report.Errors, fmt.Sprintf("Failed to delete TAP %s: %v", name, delErr)) - } else { - report.CleanedTAPs = append(report.CleanedTAPs, name) - } - } - } - } - } - - // Find orphaned veth pairs - for _, link := range links { - name := link.Attrs().Name - if strings.HasPrefix(name, "vh_") && len(name) == 11 { // vh_<8-char-id> - networkID := name[3:] // Extract the 8-char ID - if !m.isNetworkIDActive(networkID) { - report.OrphanedVeths = append(report.OrphanedVeths, name) - if !dryRun { - if delErr := netlink.LinkDel(link); delErr != nil { - report.Errors = append(report.Errors, fmt.Sprintf("Failed to delete veth %s: %v", name, delErr)) - } else { - report.CleanedVeths = append(report.CleanedVeths, name) - } - } - } - } - } - - // Find orphaned namespaces - // Note: This is a simplified check - in practice you'd scan /var/run/netns or use netns.ListNamed() - for vmID := range m.vmNetworks { - expectedNS := fmt.Sprintf("vm-%s", vmID) - if m.namespaceExists(expectedNS) { - // This namespace should exist, it's not orphaned - continue - } - } - - m.logger.InfoContext(ctx, "orphaned resource cleanup completed", - slog.Bool("dry_run", dryRun), - slog.Int("orphaned_taps", len(report.OrphanedTAPs)), - slog.Int("orphaned_veths", len(report.OrphanedVeths)), - slog.Int("cleaned_taps", len(report.CleanedTAPs)), - slog.Int("cleaned_veths", len(report.CleanedVeths)), - slog.Int("errors", len(report.Errors)), - ) - - return report, nil -} - -// isNetworkIDActive checks if a network ID is currently associated with an active VM -func (m *Manager) isNetworkIDActive(networkID string) bool { - m.mu.RLock() - defer m.mu.RUnlock() - - for _, vmNet := range m.vmNetworks { - if vmNet.NetworkID == networkID { - return true - } - } - return false -} - -// CleanupReport contains the results of orphaned resource cleanup -type CleanupReport struct { - DryRun bool - OrphanedTAPs []string - OrphanedVeths []string - 
OrphanedNS []string - CleanedTAPs []string - CleanedVeths []string - CleanedNS []string - Errors []string -} - -// GetBridgeCapacityStatus returns current bridge capacity and utilization -func (m *Manager) GetBridgeCapacityStatus() *BridgeCapacityStatus { - m.mu.RLock() - defer m.mu.RUnlock() - - bridgeStats := m.metrics.GetBridgeStats() - alerts := m.metrics.GetBridgeCapacityAlerts() - - // Calculate overall statistics - totalVMs := int64(0) - totalCapacity := int64(0) - bridgeCount := len(bridgeStats) - - bridgeDetails := make([]BridgeDetails, 0, bridgeCount) - for _, stats := range bridgeStats { - totalVMs += stats.VMCount - totalCapacity += stats.MaxVMs - - utilization := float64(stats.VMCount) / float64(stats.MaxVMs) - bridgeDetails = append(bridgeDetails, BridgeDetails{ - Name: stats.BridgeName, - VMCount: stats.VMCount, - MaxVMs: stats.MaxVMs, - Utilization: utilization, - IsHealthy: stats.IsHealthy, - CreatedAt: stats.CreatedAt, - LastActivity: stats.LastActivity, - }) - } - - overallUtilization := float64(0) - if totalCapacity > 0 { - overallUtilization = float64(totalVMs) / float64(totalCapacity) - } - - return &BridgeCapacityStatus{ - TotalVMs: totalVMs, - TotalCapacity: totalCapacity, - OverallUtilization: overallUtilization, - BridgeCount: int64(bridgeCount), - Bridges: bridgeDetails, - Alerts: alerts, - Timestamp: time.Now(), - } -} - -// GetNetworkMetrics returns the network metrics instance for external access -func (m *Manager) GetNetworkMetrics() *NetworkMetrics { - return m.metrics -} - -// BridgeCapacityStatus provides comprehensive bridge capacity information -type BridgeCapacityStatus struct { - TotalVMs int64 `json:"total_vms"` - TotalCapacity int64 `json:"total_capacity"` - OverallUtilization float64 `json:"overall_utilization"` - BridgeCount int64 `json:"bridge_count"` - Bridges []BridgeDetails `json:"bridges"` - Alerts []BridgeCapacityAlert `json:"alerts"` - Timestamp time.Time `json:"timestamp"` -} - -// BridgeDetails provides detailed 
information about a specific bridge -type BridgeDetails struct { - Name string `json:"name"` - VMCount int64 `json:"vm_count"` - MaxVMs int64 `json:"max_vms"` - Utilization float64 `json:"utilization"` - IsHealthy bool `json:"is_healthy"` - CreatedAt time.Time `json:"created_at"` - LastActivity time.Time `json:"last_activity"` -} diff --git a/go/deploy/metald/internal/network/manager.go b/go/deploy/metald/internal/network/manager.go new file mode 100644 index 0000000000..6ff27a8b66 --- /dev/null +++ b/go/deploy/metald/internal/network/manager.go @@ -0,0 +1,45 @@ +package network + +import ( + "fmt" + "log/slog" + "net" + "sync" +) + +// NewManager creates a new network manager to handle bridge/tap creation +func NewManager(logger *slog.Logger, config *Config) (*Manager, error) { + if config == nil { + logger.Error("creating network manager") + return nil, fmt.Errorf("network config can not be nil") + } + + logger.Info("creating network manager", + slog.String("bridge_name", config.BridgeName), + slog.String("base_network", config.BaseNetwork.String()), + ) + + m := &Manager{ //nolint:exhaustruct + logger: logger, + config: config, + } + + return m, nil +} + +// Config holds network configuration +type Config struct { + BaseNetwork *net.IPNet + BridgeName string + DNSServers []string // Default: ["8.8.8.8", "8.8.4.4"] + EnableIPv6 bool + EnableRateLimit bool + RateLimitMbps int // Per VM rate limit in Mbps +} + +type Manager struct { + logger *slog.Logger + config *Config + mu sync.RWMutex + bridgeMu sync.RWMutex +} diff --git a/go/deploy/metald/internal/network/metrics.go b/go/deploy/metald/internal/network/metrics.go deleted file mode 100644 index 6095dccc3f..0000000000 --- a/go/deploy/metald/internal/network/metrics.go +++ /dev/null @@ -1,472 +0,0 @@ -package network - -import ( - "context" - "fmt" - "log/slog" - "sync" - "time" - - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" -) - -// NetworkMetrics 
handles all network-related metrics for metald -type NetworkMetrics struct { - logger *slog.Logger - meter metric.Meter - - // Bridge capacity metrics - bridgeVMCount metric.Int64UpDownCounter - bridgeCapacityRatio metric.Float64Gauge - bridgeUtilization metric.Int64Histogram - - // VM network metrics - vmNetworkCreateTotal metric.Int64Counter - vmNetworkDeleteTotal metric.Int64Counter - vmNetworkErrors metric.Int64Counter - - // Resource leak metrics - orphanedTAPDevices metric.Int64UpDownCounter - orphanedVethDevices metric.Int64UpDownCounter - orphanedNamespaces metric.Int64UpDownCounter - - // Host protection metrics - routeHijackDetected metric.Int64Counter - routeRecoveryAttempts metric.Int64Counter - hostProtectionStatus metric.Int64UpDownCounter - - // Performance metrics - networkSetupDuration metric.Float64Histogram - networkCleanupDuration metric.Float64Histogram - - mutex sync.RWMutex - bridgeStats map[string]*BridgeStats -} - -// BridgeStats tracks statistics for a specific bridge -type BridgeStats struct { - BridgeName string - VMCount int64 - MaxVMs int64 - CreatedAt time.Time - LastActivity time.Time - IsHealthy bool - ErrorCount int64 -} - -// NewNetworkMetrics creates a new network metrics collector -func NewNetworkMetrics(logger *slog.Logger) (*NetworkMetrics, error) { - meter := otel.Meter("metald.network") - - // Initialize all metrics - bridgeVMCount, err := meter.Int64UpDownCounter( - "metald_bridge_vm_count", - metric.WithDescription("Current number of VMs per bridge"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - bridgeCapacityRatio, err := meter.Float64Gauge( - "metald_bridge_capacity_ratio", - metric.WithDescription("Ratio of current VMs to maximum VMs per bridge (0.0-1.0)"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - bridgeUtilization, err := meter.Int64Histogram( - "metald_bridge_utilization_percent", - metric.WithDescription("Bridge utilization percentage"), - 
metric.WithUnit("%"), - metric.WithExplicitBucketBoundaries(10, 25, 50, 75, 90, 95, 99), - ) - if err != nil { - return nil, err - } - - vmNetworkCreateTotal, err := meter.Int64Counter( - "metald_vm_network_create_total", - metric.WithDescription("Total number of VM network creations"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - vmNetworkDeleteTotal, err := meter.Int64Counter( - "metald_vm_network_delete_total", - metric.WithDescription("Total number of VM network deletions"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - vmNetworkErrors, err := meter.Int64Counter( - "metald_vm_network_errors_total", - metric.WithDescription("Total number of VM network errors"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - orphanedTAPDevices, err := meter.Int64UpDownCounter( - "metald_orphaned_tap_devices", - metric.WithDescription("Number of orphaned TAP devices"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - orphanedVethDevices, err := meter.Int64UpDownCounter( - "metald_orphaned_veth_devices", - metric.WithDescription("Number of orphaned veth devices"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - orphanedNamespaces, err := meter.Int64UpDownCounter( - "metald_orphaned_namespaces", - metric.WithDescription("Number of orphaned network namespaces"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - routeHijackDetected, err := meter.Int64Counter( - "metald_route_hijack_detected_total", - metric.WithDescription("Total number of route hijacking attempts detected"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - routeRecoveryAttempts, err := meter.Int64Counter( - "metald_route_recovery_attempts_total", - metric.WithDescription("Total number of route recovery attempts"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - hostProtectionStatus, err := meter.Int64UpDownCounter( - 
"metald_host_protection_status", - metric.WithDescription("Host protection status (1=active, 0=inactive)"), - metric.WithUnit("1"), - ) - if err != nil { - return nil, err - } - - networkSetupDuration, err := meter.Float64Histogram( - "metald_network_setup_duration_seconds", - metric.WithDescription("Time taken to set up VM networking"), - metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0), - ) - if err != nil { - return nil, err - } - - networkCleanupDuration, err := meter.Float64Histogram( - "metald_network_cleanup_duration_seconds", - metric.WithDescription("Time taken to clean up VM networking"), - metric.WithUnit("s"), - metric.WithExplicitBucketBoundaries(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0), - ) - if err != nil { - return nil, err - } - - return &NetworkMetrics{ - logger: logger.With("component", "network-metrics"), - meter: meter, - bridgeVMCount: bridgeVMCount, - bridgeCapacityRatio: bridgeCapacityRatio, - bridgeUtilization: bridgeUtilization, - vmNetworkCreateTotal: vmNetworkCreateTotal, - vmNetworkDeleteTotal: vmNetworkDeleteTotal, - vmNetworkErrors: vmNetworkErrors, - orphanedTAPDevices: orphanedTAPDevices, - orphanedVethDevices: orphanedVethDevices, - orphanedNamespaces: orphanedNamespaces, - routeHijackDetected: routeHijackDetected, - routeRecoveryAttempts: routeRecoveryAttempts, - hostProtectionStatus: hostProtectionStatus, - networkSetupDuration: networkSetupDuration, - networkCleanupDuration: networkCleanupDuration, - bridgeStats: make(map[string]*BridgeStats), - }, nil -} - -// RecordVMNetworkCreate records a VM network creation -func (m *NetworkMetrics) RecordVMNetworkCreate(ctx context.Context, bridgeName string, success bool) { - m.vmNetworkCreateTotal.Add(ctx, 1, metric.WithAttributes( - attribute.String("bridge", bridgeName), - attribute.Bool("success", success), - )) - - if success { - m.updateBridgeStats(bridgeName, 1) - } else { - m.vmNetworkErrors.Add(ctx, 1, metric.WithAttributes( 
- attribute.String("operation", "create"), - attribute.String("bridge", bridgeName), - )) - } -} - -// RecordVMNetworkDelete records a VM network deletion -func (m *NetworkMetrics) RecordVMNetworkDelete(ctx context.Context, bridgeName string, success bool) { - m.vmNetworkDeleteTotal.Add(ctx, 1, metric.WithAttributes( - attribute.String("bridge", bridgeName), - attribute.Bool("success", success), - )) - - if success { - m.updateBridgeStats(bridgeName, -1) - } else { - m.vmNetworkErrors.Add(ctx, 1, metric.WithAttributes( - attribute.String("operation", "delete"), - attribute.String("bridge", bridgeName), - )) - } -} - -// RecordNetworkSetupDuration records the time taken for network setup -func (m *NetworkMetrics) RecordNetworkSetupDuration(ctx context.Context, duration time.Duration, bridgeName string, success bool) { - m.networkSetupDuration.Record(ctx, duration.Seconds(), metric.WithAttributes( - attribute.String("bridge", bridgeName), - attribute.Bool("success", success), - )) -} - -// RecordNetworkCleanupDuration records the time taken for network cleanup -func (m *NetworkMetrics) RecordNetworkCleanupDuration(ctx context.Context, duration time.Duration, bridgeName string, success bool) { - m.networkCleanupDuration.Record(ctx, duration.Seconds(), metric.WithAttributes( - attribute.String("bridge", bridgeName), - attribute.Bool("success", success), - )) -} - -// RecordOrphanedResources records counts of orphaned network resources -func (m *NetworkMetrics) RecordOrphanedResources(ctx context.Context, taps, veths, namespaces int64) { - m.orphanedTAPDevices.Add(ctx, taps) - m.orphanedVethDevices.Add(ctx, veths) - m.orphanedNamespaces.Add(ctx, namespaces) -} - -// RecordRouteHijackDetected records a route hijacking detection -func (m *NetworkMetrics) RecordRouteHijackDetected(ctx context.Context, hijackedInterface, expectedInterface string) { - m.routeHijackDetected.Add(ctx, 1, metric.WithAttributes( - attribute.String("hijacked_interface", hijackedInterface), - 
attribute.String("expected_interface", expectedInterface), - )) -} - -// RecordRouteRecoveryAttempt records a route recovery attempt -func (m *NetworkMetrics) RecordRouteRecoveryAttempt(ctx context.Context, success bool) { - m.routeRecoveryAttempts.Add(ctx, 1, metric.WithAttributes( - attribute.Bool("success", success), - )) -} - -// SetHostProtectionStatus sets the host protection status -func (m *NetworkMetrics) SetHostProtectionStatus(ctx context.Context, active bool) { - status := int64(0) - if active { - status = 1 - } - m.hostProtectionStatus.Add(ctx, status) -} - -// updateBridgeStats updates bridge statistics and capacity metrics -func (m *NetworkMetrics) updateBridgeStats(bridgeName string, vmCountDelta int64) { - m.mutex.Lock() - defer m.mutex.Unlock() - - stats, exists := m.bridgeStats[bridgeName] - if !exists { - stats = &BridgeStats{ - BridgeName: bridgeName, - VMCount: 0, - MaxVMs: 1000, // Default max VMs per bridge - CreatedAt: time.Now(), - LastActivity: time.Now(), - IsHealthy: true, - } - m.bridgeStats[bridgeName] = stats - } - - stats.VMCount += vmCountDelta - stats.LastActivity = time.Now() - - // Ensure VM count doesn't go negative - if stats.VMCount < 0 { - stats.VMCount = 0 - } - - // Update metrics - ctx := context.Background() - m.bridgeVMCount.Add(ctx, vmCountDelta, metric.WithAttributes( - attribute.String("bridge", bridgeName), - )) - - // Calculate and record capacity ratio - ratio := float64(stats.VMCount) / float64(stats.MaxVMs) - m.bridgeCapacityRatio.Record(ctx, ratio, metric.WithAttributes( - attribute.String("bridge", bridgeName), - )) - - // Calculate and record utilization percentage - utilizationPercent := int64(ratio * 100) - m.bridgeUtilization.Record(ctx, utilizationPercent, metric.WithAttributes( - attribute.String("bridge", bridgeName), - )) - - // Log warnings for high utilization - if ratio >= 0.9 { - m.logger.Warn("bridge approaching capacity", - slog.String("bridge", bridgeName), - slog.Int64("current_vms", 
stats.VMCount), - slog.Int64("max_vms", stats.MaxVMs), - slog.Float64("utilization", ratio), - ) - } -} - -// SetBridgeMaxVMs sets the maximum VMs for a bridge -func (m *NetworkMetrics) SetBridgeMaxVMs(bridgeName string, maxVMs int64) { - m.mutex.Lock() - defer m.mutex.Unlock() - - stats, exists := m.bridgeStats[bridgeName] - if !exists { - stats = &BridgeStats{ - BridgeName: bridgeName, - VMCount: 0, - MaxVMs: maxVMs, - CreatedAt: time.Now(), - LastActivity: time.Now(), - IsHealthy: true, - } - m.bridgeStats[bridgeName] = stats - } else { - stats.MaxVMs = maxVMs - } -} - -// GetBridgeStats returns current bridge statistics -func (m *NetworkMetrics) GetBridgeStats() map[string]*BridgeStats { - m.mutex.RLock() - defer m.mutex.RUnlock() - - // Return a copy to avoid concurrent access issues - statsCopy := make(map[string]*BridgeStats) - for name, stats := range m.bridgeStats { - statsCopy[name] = &BridgeStats{ - BridgeName: stats.BridgeName, - VMCount: stats.VMCount, - MaxVMs: stats.MaxVMs, - CreatedAt: stats.CreatedAt, - LastActivity: stats.LastActivity, - IsHealthy: stats.IsHealthy, - ErrorCount: stats.ErrorCount, - } - } - - return statsCopy -} - -// GetBridgeCapacityAlerts returns bridges that are approaching capacity -func (m *NetworkMetrics) GetBridgeCapacityAlerts() []BridgeCapacityAlert { - m.mutex.RLock() - defer m.mutex.RUnlock() - - var alerts []BridgeCapacityAlert - - for _, stats := range m.bridgeStats { - ratio := float64(stats.VMCount) / float64(stats.MaxVMs) - - var severity AlertSeverity - var threshold float64 - - switch { - case ratio >= 0.95: - severity = AlertCritical - threshold = 0.95 - case ratio >= 0.90: - severity = AlertWarning - threshold = 0.90 - case ratio >= 0.80: - severity = AlertInfo - threshold = 0.80 - default: - continue // No alert needed - } - - alerts = append(alerts, BridgeCapacityAlert{ - BridgeName: stats.BridgeName, - CurrentVMs: stats.VMCount, - MaxVMs: stats.MaxVMs, - UtilizationRatio: ratio, - Severity: severity, - 
Threshold: threshold, - Message: m.formatCapacityAlertMessage(stats, ratio, severity), - }) - } - - return alerts -} - -// formatCapacityAlertMessage creates a human-readable alert message -func (m *NetworkMetrics) formatCapacityAlertMessage(stats *BridgeStats, ratio float64, severity AlertSeverity) string { - utilizationPercent := int(ratio * 100) - - switch severity { - case AlertCritical: - return fmt.Sprintf("CRITICAL: Bridge %s is at %d%% capacity (%d/%d VMs). Immediate action required!", - stats.BridgeName, utilizationPercent, stats.VMCount, stats.MaxVMs) - case AlertWarning: - return fmt.Sprintf("WARNING: Bridge %s is at %d%% capacity (%d/%d VMs). Consider load balancing or scaling.", - stats.BridgeName, utilizationPercent, stats.VMCount, stats.MaxVMs) - case AlertInfo: - return fmt.Sprintf("INFO: Bridge %s utilization is %d%% (%d/%d VMs). Monitor for continued growth.", - stats.BridgeName, utilizationPercent, stats.VMCount, stats.MaxVMs) - default: - return fmt.Sprintf("Bridge %s utilization: %d%% (%d/%d VMs)", - stats.BridgeName, utilizationPercent, stats.VMCount, stats.MaxVMs) - } -} - -// BridgeCapacityAlert represents a bridge capacity alert -type BridgeCapacityAlert struct { - BridgeName string `json:"bridge_name"` - CurrentVMs int64 `json:"current_vms"` - MaxVMs int64 `json:"max_vms"` - UtilizationRatio float64 `json:"utilization_ratio"` - Severity AlertSeverity `json:"severity"` - Threshold float64 `json:"threshold"` - Message string `json:"message"` -} - -// AlertSeverity represents the severity level of an alert -type AlertSeverity string - -const ( - AlertInfo AlertSeverity = "info" - AlertWarning AlertSeverity = "warning" - AlertCritical AlertSeverity = "critical" -) diff --git a/go/deploy/metald/internal/network/network.go b/go/deploy/metald/internal/network/network.go new file mode 100644 index 0000000000..034342814c --- /dev/null +++ b/go/deploy/metald/internal/network/network.go @@ -0,0 +1,117 @@ +package network + +import ( + "fmt" + 
"hash/fnv" +) + +const ( + // BaseNetwork is the "root" network we're partitioning + BaseNetwork = "172.16.0.0/12" + + // SubnetPrefix is the size of each subnet + SubnetPrefix = 28 + + // BasePrefix is the size of the base network + BasePrefix = 12 + + // TotalSubnets is the total number of /28 subnets in a /12 + TotalSubnets = 1 << (SubnetPrefix - BasePrefix) // 65536 = 1 << (28 - 12) + + // IPsPerSubnet is the number of IPs in each /28 subnet + IPsPerSubnet = 1 << (32 - SubnetPrefix) // 16 +) + +// SubnetInfo contains all the network information for a subnet +type SubnetInfo struct { + Index uint32 // 0-based index (0-65535) + Network string // CIDR notation (e.g., "172.16.0.0/28") + Gateway string // Gateway IP (e.g., "172.16.0.1") + Broadcast string // Broadcast IP (e.g., "172.16.0.15") + UsableRange string // Usable IP range (e.g., "172.16.0.2-172.16.0.14") + UsableIPs int // Number of usable IPs (13 for /28) +} + +// CalculateIndex returns a subnet index (0-65535) for a given workspace ID +func CalculateIndex(identifier string) uint32 { + hash := fnv.New32a() + hash.Write([]byte(identifier)) + return hash.Sum32() & 0xFFFF // Masks to keep only lower 16 bits +} + +// CalculateIndexOneBased returns a subnet index (1-65536) for a given workspace ID +func CalculateIndexOneBased(identifier string) uint32 { + return CalculateIndex(identifier) + 1 +} + +// GetSubnetInfo returns complete subnet information for a workspace ID +func GetSubnetInfo(identifier string) SubnetInfo { + index := CalculateIndex(identifier) + return GetSubnetInfoByIndex(index) +} + +// GetSubnetInfoByIndex returns complete subnet information for a given index +func GetSubnetInfoByIndex(index uint32) SubnetInfo { + if index >= TotalSubnets { + panic(fmt.Sprintf("index %d exceeds maximum subnet count %d", index, TotalSubnets)) + } + + // Calculate the base IP for this subnet + subnetOffset := index * IPsPerSubnet + + // Calculate octets for 172.16.0.0/12 base + octet4 := subnetOffset % 256 + 
octet3 := (subnetOffset / 256) % 256 + octet2 := 16 + (subnetOffset / 65536) + + // Build the subnet info + info := SubnetInfo{ + Index: index, + Network: fmt.Sprintf("172.%d.%d.%d/%d", octet2, octet3, octet4, SubnetPrefix), + Gateway: fmt.Sprintf("172.%d.%d.%d", octet2, octet3, octet4+1), + Broadcast: fmt.Sprintf("172.%d.%d.%d", octet2, octet3, octet4+15), + UsableIPs: IPsPerSubnet - 3, // Exclude network, gateway, and broadcast + } + + // Calculate usable range + usableStart := fmt.Sprintf("172.%d.%d.%d", octet2, octet3, octet4+2) + usableEnd := fmt.Sprintf("172.%d.%d.%d", octet2, octet3, octet4+14) + info.UsableRange = fmt.Sprintf("%s-%s", usableStart, usableEnd) + + return info +} + +// GetNetwork returns just the CIDR notation for a workspace ID +func GetNetwork(identifier string) string { + info := GetSubnetInfo(identifier) + return info.Network +} + +// GetGateway returns the gateway IP for a workspace ID +func GetGateway(identifier string) string { + info := GetSubnetInfo(identifier) + return info.Gateway +} + +// ValidateIdentifier checks if a workspace ID is valid (non-empty) +func ValidateIdentifier(identifier string) error { + if identifier == "" { + return fmt.Errorf("workspace ID cannot be empty") + } + return nil +} + +// GetAllSubnetsInRange returns a slice of all possible subnet indices +// Useful for iteration or validation +func GetAllSubnetsInRange() []uint32 { + subnets := make([]uint32, TotalSubnets) + for i := range TotalSubnets { + subnets[i] = uint32(i) + } + return subnets +} + +// IsValidIndex checks if an index is within the valid range +func IsValidIndex(index uint32) bool { + return index < TotalSubnets +} diff --git a/go/deploy/metald/internal/network/port_allocator.go b/go/deploy/metald/internal/network/port_allocator.go index 5e0c657e48..646845e3ab 100644 --- a/go/deploy/metald/internal/network/port_allocator.go +++ b/go/deploy/metald/internal/network/port_allocator.go @@ -1,21 +1,18 @@ package network import ( + "crypto/rand" "fmt" 
- "math/rand" + "math/big" "sync" - "time" ) -// AIDEV-NOTE: Port allocator manages host port allocation for container port mapping -// This prevents port conflicts between VMs and provides dynamic port allocation - -// PortMapping represents a mapping from container port to host port +// PortMapping represents a mapping from VM port to host port type PortMapping struct { - ContainerPort int `json:"container_port"` - HostPort int `json:"host_port"` - Protocol string `json:"protocol"` // tcp or udp - VMID string `json:"vm_id"` + VMPort int `json:"vm_port"` + HostPort int `json:"host_port"` + Protocol string `json:"protocol"` // tcp or udp + VMID string `json:"vm_id"` } // PortAllocator manages host port allocation for VMs @@ -29,9 +26,6 @@ type PortAllocator struct { vmPorts map[string][]PortMapping // VM ID -> port mappings portToVM map[int]string // host port -> VM ID - // Random number generator for port selection - rng *rand.Rand - mu sync.Mutex } @@ -50,7 +44,6 @@ func NewPortAllocator(minPort, maxPort int) *PortAllocator { allocated: make(map[int]bool), vmPorts: make(map[string][]PortMapping), portToVM: make(map[int]string), - rng: rand.New(rand.NewSource(time.Now().UnixNano())), } } @@ -71,9 +64,14 @@ func (p *PortAllocator) AllocatePort(vmID string, containerPort int, protocol st maxAttempts = 1000 // Limit attempts to avoid long search times } - // Try random ports first + // Try random ports first using crypto/rand for security for attempt := 0; attempt < maxAttempts; attempt++ { - hostPort := p.minPort + p.rng.Intn(portRange) + randomOffset, err := rand.Int(rand.Reader, big.NewInt(int64(portRange))) + if err != nil { + // If crypto/rand fails, fall through to sequential search + break + } + hostPort := p.minPort + int(randomOffset.Int64()) if !p.allocated[hostPort] { return p.doAllocatePort(vmID, hostPort, containerPort, protocol) } @@ -114,12 +112,12 @@ func (p *PortAllocator) AllocateSpecificPort(vmID string, hostPort, containerPor } // doAllocatePort 
performs the actual port allocation (internal helper) -func (p *PortAllocator) doAllocatePort(vmID string, hostPort, containerPort int, protocol string) (int, error) { +func (p *PortAllocator) doAllocatePort(vmID string, hostPort, vmPort int, protocol string) (int, error) { // Check for conflicting mapping for same VM if mappings, exists := p.vmPorts[vmID]; exists { for _, mapping := range mappings { - if mapping.ContainerPort == containerPort && mapping.Protocol == protocol { - return 0, fmt.Errorf("VM %s already has mapping for %s:%d", vmID, protocol, containerPort) + if mapping.VMPort == vmPort && mapping.Protocol == protocol { + return 0, fmt.Errorf("VM %s already has mapping for %s:%d", vmID, protocol, vmPort) } } } @@ -130,10 +128,10 @@ func (p *PortAllocator) doAllocatePort(vmID string, hostPort, containerPort int, // Create mapping mapping := PortMapping{ - ContainerPort: containerPort, - HostPort: hostPort, - Protocol: protocol, - VMID: vmID, + VMPort: vmPort, + HostPort: hostPort, + Protocol: protocol, + VMID: vmID, } // Add to VM's port list diff --git a/go/deploy/metald/internal/network/protection.go b/go/deploy/metald/internal/network/protection.go deleted file mode 100644 index d64f7df48b..0000000000 --- a/go/deploy/metald/internal/network/protection.go +++ /dev/null @@ -1,376 +0,0 @@ -package network - -import ( - "context" - "fmt" - "log/slog" - "net" - "os/exec" - "strings" - "sync" - "time" - - "github.com/unkeyed/unkey/go/deploy/metald/internal/config" - "github.com/vishvananda/netlink" -) - -// HostProtection monitors and protects the host's primary network interface -// from being hijacked by metald bridges -type HostProtection struct { - logger *slog.Logger - config *config.NetworkConfig - primaryIface string - originalRoutes []netlink.Route - originalDNS []string - monitorActive bool - mutex sync.RWMutex - stopChan chan struct{} -} - -// NewHostProtection creates a new host protection system -func NewHostProtection(logger *slog.Logger, 
netConfig *config.NetworkConfig) *HostProtection { - return &HostProtection{ - logger: logger.With("component", "host-protection"), - config: netConfig, - stopChan: make(chan struct{}), - } -} - -// Start initializes and starts the host protection system -func (p *HostProtection) Start(ctx context.Context) error { - if !p.config.EnableHostProtection { - p.logger.InfoContext(ctx, "host protection disabled") - return nil - } - - p.logger.InfoContext(ctx, "starting host network protection") - - // 1. Detect primary interface - if err := p.detectPrimaryInterface(); err != nil { - return fmt.Errorf("failed to detect primary interface: %w", err) - } - - // 2. Snapshot current network state - if err := p.snapshotNetworkState(); err != nil { - return fmt.Errorf("failed to snapshot network state: %w", err) - } - - // 3. Install protective iptables rules - if err := p.installProtectiveRules(); err != nil { - return fmt.Errorf("failed to install protective rules: %w", err) - } - - // 4. Start monitoring - go p.monitorNetworkChanges(ctx) - - p.logger.InfoContext(ctx, "host protection started successfully", - slog.String("primary_interface", p.primaryIface), - slog.Int("protected_routes", len(p.originalRoutes)), - ) - - return nil -} - -// Stop shuts down the host protection system -func (p *HostProtection) Stop(ctx context.Context) error { - if !p.config.EnableHostProtection { - return nil - } - - p.logger.InfoContext(ctx, "stopping host protection") - - p.mutex.Lock() - p.monitorActive = false - p.mutex.Unlock() - - close(p.stopChan) - - // Clean up protective iptables rules - if err := p.removeProtectiveRules(); err != nil { - p.logger.WarnContext(ctx, "failed to remove protective rules", "error", err) - } - - p.logger.InfoContext(ctx, "host protection stopped") - return nil -} - -// detectPrimaryInterface finds the primary network interface -func (p *HostProtection) detectPrimaryInterface() error { - if p.config.PrimaryInterface != "" { - p.primaryIface = 
p.config.PrimaryInterface - p.logger.Info("using configured primary interface", - slog.String("interface", p.primaryIface)) - return nil - } - - // Find default route interface - routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err != nil { - return fmt.Errorf("failed to list routes: %w", err) - } - - for _, route := range routes { - if route.Dst == nil { // Default route - link, err := netlink.LinkByIndex(route.LinkIndex) - if err == nil { - // Skip virtual interfaces - ifaceName := link.Attrs().Name - if !p.isVirtualInterface(ifaceName) { - p.primaryIface = ifaceName - p.logger.Info("detected primary interface", - slog.String("interface", p.primaryIface), - slog.String("type", link.Type()), - ) - return nil - } - } - } - } - - return fmt.Errorf("could not detect primary interface") -} - -// isVirtualInterface checks if an interface is virtual (should be ignored) -func (p *HostProtection) isVirtualInterface(name string) bool { - virtualPrefixes := []string{ - "lo", "docker", "br-", "virbr", "veth", "tap_", "vh_", "vn_", - "metald-", "tun", "bridge", "dummy", "bond", "team", - } - - for _, prefix := range virtualPrefixes { - if strings.HasPrefix(name, prefix) { - return true - } - } - - return false -} - -// snapshotNetworkState captures the current network configuration -func (p *HostProtection) snapshotNetworkState() error { - // Capture routes - routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err != nil { - return fmt.Errorf("failed to capture routes: %w", err) - } - - // Filter routes for primary interface - for _, route := range routes { - if link, err := netlink.LinkByIndex(route.LinkIndex); err == nil { - if link.Attrs().Name == p.primaryIface { - p.originalRoutes = append(p.originalRoutes, route) - } - } - } - - p.logger.Info("captured network state snapshot", - slog.Int("routes", len(p.originalRoutes)), - slog.String("primary_interface", p.primaryIface), - ) - - return nil -} - -// installProtectiveRules installs iptables rules 
to prevent bridge hijacking -func (p *HostProtection) installProtectiveRules() error { - rules := [][]string{ - // Mark traffic from metald bridges - {"-t", "mangle", "-I", "POSTROUTING", "1", "-o", "br-vms", "-j", "MARK", "--set-mark", "0x100"}, - {"-t", "mangle", "-I", "POSTROUTING", "1", "-o", "metald-br+", "-j", "MARK", "--set-mark", "0x100"}, - - // Ensure host traffic uses primary interface (higher priority) - {"-t", "mangle", "-I", "OUTPUT", "1", "-o", p.primaryIface, "-j", "MARK", "--set-mark", "0x200"}, - - // Protect against bridge route hijacking - {"-t", "mangle", "-I", "PREROUTING", "1", "-i", "br-vms", "-j", "MARK", "--set-mark", "0x100"}, - {"-t", "mangle", "-I", "PREROUTING", "1", "-i", "metald-br+", "-j", "MARK", "--set-mark", "0x100"}, - } - - for _, rule := range rules { - cmd := exec.Command("iptables", rule...) - if err := cmd.Run(); err != nil { - p.logger.Warn("failed to install protective rule", - slog.Any("rule", rule), - slog.String("error", err.Error()), - ) - // Don't fail completely - some rules might work - } - } - - p.logger.Info("installed protective iptables rules") - return nil -} - -// removeProtectiveRules removes the protective iptables rules -func (p *HostProtection) removeProtectiveRules() error { - rules := [][]string{ - // Remove in reverse order - {"-t", "mangle", "-D", "PREROUTING", "-i", "metald-br+", "-j", "MARK", "--set-mark", "0x100"}, - {"-t", "mangle", "-D", "PREROUTING", "-i", "br-vms", "-j", "MARK", "--set-mark", "0x100"}, - {"-t", "mangle", "-D", "OUTPUT", "-o", p.primaryIface, "-j", "MARK", "--set-mark", "0x200"}, - {"-t", "mangle", "-D", "POSTROUTING", "-o", "metald-br+", "-j", "MARK", "--set-mark", "0x100"}, - {"-t", "mangle", "-D", "POSTROUTING", "-o", "br-vms", "-j", "MARK", "--set-mark", "0x100"}, - } - - for _, rule := range rules { - cmd := exec.Command("iptables", rule...) 
- _ = cmd.Run() // Ignore errors during cleanup - } - - return nil -} - -// monitorNetworkChanges monitors for network changes that could affect host connectivity -func (p *HostProtection) monitorNetworkChanges(ctx context.Context) { - p.mutex.Lock() - p.monitorActive = true - p.mutex.Unlock() - - ticker := time.NewTicker(10 * time.Second) // Check every 10 seconds - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-p.stopChan: - return - case <-ticker.C: - p.checkNetworkHealth(ctx) - } - } -} - -// checkNetworkHealth verifies that host networking is still healthy -func (p *HostProtection) checkNetworkHealth(ctx context.Context) { - p.mutex.RLock() - if !p.monitorActive { - p.mutex.RUnlock() - return - } - p.mutex.RUnlock() - - // 1. Check if primary interface still exists and is up - if err := p.checkPrimaryInterface(); err != nil { - p.logger.WarnContext(ctx, "primary interface check failed", "error", err) - return - } - - // 2. Check for route hijacking - if hijacked := p.detectRouteHijacking(); hijacked { - p.logger.ErrorContext(ctx, "CRITICAL: route hijacking detected, attempting recovery") - if err := p.recoverHostRoutes(); err != nil { - p.logger.ErrorContext(ctx, "failed to recover host routes", "error", err) - } - } - - // 3. 
Check connectivity - if err := p.checkConnectivity(); err != nil { - p.logger.WarnContext(ctx, "connectivity check failed", "error", err) - } -} - -// checkPrimaryInterface verifies the primary interface is still healthy -func (p *HostProtection) checkPrimaryInterface() error { - link, err := netlink.LinkByName(p.primaryIface) - if err != nil { - return fmt.Errorf("primary interface %s not found: %w", p.primaryIface, err) - } - - if link.Attrs().OperState != netlink.OperUp { - return fmt.Errorf("primary interface %s is not up: %s", p.primaryIface, link.Attrs().OperState) - } - - return nil -} - -// detectRouteHijacking checks if metald bridges have hijacked routing -func (p *HostProtection) detectRouteHijacking() bool { - routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err != nil { - p.logger.Warn("failed to list routes for hijacking detection", "error", err) - return false - } - - // Look for default routes pointing to metald bridges - for _, route := range routes { - if route.Dst == nil { // Default route - if link, err := netlink.LinkByIndex(route.LinkIndex); err == nil { - name := link.Attrs().Name - if strings.HasPrefix(name, "br-vms") || strings.HasPrefix(name, "metald-br") { - p.logger.Error("route hijacking detected", - slog.String("hijacked_interface", name), - slog.String("expected_interface", p.primaryIface), - ) - return true - } - } - } - } - - return false -} - -// recoverHostRoutes attempts to restore proper host routing -func (p *HostProtection) recoverHostRoutes() error { - p.logger.Info("attempting to recover host routes") - - // Get current routes - currentRoutes, err := netlink.RouteList(nil, netlink.FAMILY_V4) - if err != nil { - return fmt.Errorf("failed to list current routes: %w", err) - } - - // Remove any default routes pointing to metald bridges - for _, route := range currentRoutes { - if route.Dst == nil { // Default route - if link, err := netlink.LinkByIndex(route.LinkIndex); err == nil { - name := link.Attrs().Name - if 
strings.HasPrefix(name, "br-vms") || strings.HasPrefix(name, "metald-br") { - if delErr := netlink.RouteDel(&route); delErr != nil { - p.logger.Warn("failed to delete hijacked route", - slog.String("interface", name), - slog.String("error", delErr.Error()), - ) - } else { - p.logger.Info("removed hijacked route", slog.String("interface", name)) - } - } - } - } - } - - return nil -} - -// checkConnectivity tests basic internet connectivity -func (p *HostProtection) checkConnectivity() error { - // Try to resolve a DNS name - _, err := net.LookupHost("google.com") - if err != nil { - return fmt.Errorf("DNS resolution failed: %w", err) - } - - return nil -} - -// GetStatus returns the current status of host protection -func (p *HostProtection) GetStatus() *HostProtectionStatus { - p.mutex.RLock() - defer p.mutex.RUnlock() - - return &HostProtectionStatus{ - Enabled: p.config.EnableHostProtection, - Active: p.monitorActive, - PrimaryInterface: p.primaryIface, - ProtectedRoutes: len(p.originalRoutes), - } -} - -// HostProtectionStatus represents the current status of host protection -type HostProtectionStatus struct { - Enabled bool `json:"enabled"` - Active bool `json:"active"` - PrimaryInterface string `json:"primary_interface"` - ProtectedRoutes int `json:"protected_routes"` -} diff --git a/go/deploy/metald/internal/network/tuntap.go b/go/deploy/metald/internal/network/tuntap.go new file mode 100644 index 0000000000..9b3cf43dab --- /dev/null +++ b/go/deploy/metald/internal/network/tuntap.go @@ -0,0 +1,29 @@ +package network + +// setupVMNetworking creates and configures TAP and veth devices for a VM +func (m *Manager) setupTunTap(ip string) error { + // // tap := &netlink.Tuntap{ + // // LinkAttrs: netlink.LinkAttrs{ + // // Name: deviceNames.TAP, + // // }, + // // Mode: netlink.TUNTAP_MODE_TAP, + // // } + + // // if err := netlink.LinkAdd(tap); err != nil { + // // return fmt.Errorf("failed to create TAP device %s: %w", deviceNames.TAP, err) + // // } + + // // 
tapLink, err := netlink.LinkByName(deviceNames.TAP) + // // if err != nil { + // // return fmt.Errorf("failed to get TAP device: %w", err) + // // } + + // // Set TAP device up + // if err := netlink.LinkSetUp(tap); err != nil { + // return fmt.Errorf("failed to bring TAP device up: %w", err) + // } + + // m.logger.Info("TAP device created and up", slog.String("tap", tapLink.Attrs().Name)) + + return nil +} diff --git a/go/deploy/metald/internal/network/types.go b/go/deploy/metald/internal/network/types.go index 15260e5136..b0a608b18e 100644 --- a/go/deploy/metald/internal/network/types.go +++ b/go/deploy/metald/internal/network/types.go @@ -1,26 +1,67 @@ package network import ( - "fmt" + "log/slog" "net" + "sync" "time" ) +// BridgeManager manages workspace allocation across multiple bridges +type BridgeManager struct { + bridgeCount int // 8 or 32 bridges + bridgePrefix string // "br-vms" -> br-vms-0, br-vms-1, etc. + workspaces map[string]*WorkspaceAllocation // workspace_id -> allocation + bridgeUsage map[int]map[string]bool // bridge_num -> workspace_id -> exists + mu sync.RWMutex + statePath string // Path to state persistence file + logger *slog.Logger // Structured logger for state operations +} + +// BridgeState represents the serializable state for persistence +type BridgeState struct { + Workspaces map[string]*WorkspaceAllocation `json:"workspaces"` + BridgeUsage map[int]map[string]bool `json:"bridge_usage"` + LastSaved time.Time `json:"last_saved"` + Checksum string `json:"checksum"` // SHA256 checksum for integrity validation +} + +type MultiBridgeManager struct { + bridgeCount int // 8 or 32 bridges + bridgePrefix string // "br-vms" -> br-vms-0, br-vms-1, etc. 
+ workspaces map[string]*WorkspaceAllocation // workspace_id -> allocation + bridgeUsage map[int]map[string]bool // bridge_num -> workspace_id -> exists + mu sync.RWMutex + vlanRangeStart int // Starting VLAN ID (100) + vlanRangeEnd int // Ending VLAN ID (4000) + statePath string // Path to state persistence file + logger *slog.Logger // Structured logger for state operations +} + +// WorkspaceAllocation represents a workspace's network allocation +type WorkspaceAllocation struct { + WorkspaceID string `json:"workspace_id"` + BridgeNumber int `json:"bridge_number"` // 0-31 + BridgeName string `json:"bridge_name"` // br-vms-N + CreatedAt string `json:"created_at"` + VMCount int `json:"vm_count"` // Track VM count for IP allocation +} + // VMNetwork contains network configuration for a VM type VMNetwork struct { - VMID string `json:"vm_id"` - NetworkID string `json:"network_id"` // AIDEV-NOTE: Internal 8-char ID for network device naming - Namespace string `json:"namespace"` - TapDevice string `json:"tap_device"` - IPAddress net.IP `json:"ip_address"` - Netmask net.IPMask `json:"netmask"` - Gateway net.IP `json:"gateway"` - MacAddress string `json:"mac_address"` - DNSServers []string `json:"dns_servers"` - CreatedAt time.Time `json:"created_at"` + VMID string `json:"vm_id"` + NetworkID string `json:"network_id"` // AIDEV-NOTE: Internal 8-char ID for network device naming + WorkspaceID string `json:"workspace_id"` // AIDEV-NOTE: Track workspace for proper IP release + Namespace string `json:"namespace"` + TapDevice string `json:"tap_device"` + IPAddress net.IP `json:"ip_address"` + Netmask net.IPMask `json:"netmask"` + Gateway net.IP `json:"gateway"` + MacAddress string `json:"mac_address"` + DNSServers []string `json:"dns_servers"` + CreatedAt time.Time `json:"created_at"` - // Optional fields for advanced configurations - VLANID int `json:"vlan_id,omitempty"` + // Optional fields for advanced configuration IPv6Address net.IP `json:"ipv6_address,omitempty"` Routes 
[]Route `json:"routes,omitempty"` } @@ -64,55 +105,3 @@ type FirewallRule struct { Action string `json:"action"` // "allow" or "deny" Priority int `json:"priority"` // Lower number = higher priority } - -// GenerateCloudInitNetwork generates cloud-init network configuration -func (n *VMNetwork) GenerateCloudInitNetwork() map[string]interface{} { - // Generate network configuration for cloud-init - config := map[string]interface{}{ - "version": 2, - "ethernets": map[string]interface{}{ - "eth0": map[string]interface{}{ - "match": map[string]interface{}{ - "macaddress": n.MacAddress, - }, - "addresses": []string{ - n.IPAddress.String() + "/24", - }, - "gateway4": n.Gateway.String(), - "nameservers": map[string]interface{}{ - "addresses": n.DNSServers, - }, - }, - }, - } - - return config -} - -// GenerateNetworkMetadata generates metadata for the VM -func (n *VMNetwork) GenerateNetworkMetadata() map[string]string { - metadata := map[string]string{ - "local-ipv4": n.IPAddress.String(), - "mac": n.MacAddress, - "gateway": n.Gateway.String(), - "netmask": n.Netmask.String(), - "dns-nameservers": n.DNSServers[0], - } - - if len(n.DNSServers) > 1 { - metadata["dns-nameservers-secondary"] = n.DNSServers[1] - } - - return metadata -} - -// KernelCmdlineArgs returns kernel command line arguments for network configuration -func (n *VMNetwork) KernelCmdlineArgs() string { - // Format: ip=:::::: - // Example: ip=10.100.1.2::10.100.0.1:255.255.255.0:vm::off - return fmt.Sprintf("ip=%s::%s:%s:vm::off", - n.IPAddress.String(), - n.Gateway.String(), - n.Netmask.String(), - ) -} diff --git a/go/deploy/metald/internal/reconciler/vm_reconciler.go b/go/deploy/metald/internal/reconciler/vm_reconciler.go deleted file mode 100644 index 0e91949c3a..0000000000 --- a/go/deploy/metald/internal/reconciler/vm_reconciler.go +++ /dev/null @@ -1,476 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "log/slog" - "os" - "strconv" - "strings" - "time" - - 
"github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - "github.com/unkeyed/unkey/go/deploy/metald/internal/database" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// VMReconciler handles VM state reconciliation between database and reality -type VMReconciler struct { - logger *slog.Logger - backend types.Backend - vmRepo *database.VMRepository - interval time.Duration - stopChan chan struct{} -} - -// NewVMReconciler creates a new VM reconciler -func NewVMReconciler(logger *slog.Logger, backend types.Backend, vmRepo *database.VMRepository, interval time.Duration) *VMReconciler { - return &VMReconciler{ - logger: logger.With("component", "vm-reconciler"), - backend: backend, - vmRepo: vmRepo, - interval: interval, - stopChan: make(chan struct{}), - } -} - -// Start begins the reconciliation process -func (r *VMReconciler) Start(ctx context.Context) { - r.logger.InfoContext(ctx, "starting VM reconciler", - slog.Duration("interval", r.interval), - ) - - // Run initial reconciliation immediately - r.reconcileOnce(ctx) - - // Start periodic reconciliation - ticker := time.NewTicker(r.interval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - r.logger.InfoContext(ctx, "VM reconciler stopped due to context cancellation") - return - case <-r.stopChan: - r.logger.InfoContext(ctx, "VM reconciler stopped") - return - case <-ticker.C: - r.reconcileOnce(ctx) - } - } -} - -// Stop stops the reconciliation process -func (r *VMReconciler) Stop() { - close(r.stopChan) -} - -// ReconcileNow triggers an immediate reconciliation -func (r *VMReconciler) ReconcileNow(ctx context.Context) *ReconciliationReport { - return r.reconcileOnce(ctx) -} - -// reconcileOnce performs a single reconciliation cycle -func (r *VMReconciler) reconcileOnce(ctx context.Context) *ReconciliationReport { - startTime := time.Now() - - r.logger.InfoContext(ctx, "starting VM reconciliation cycle") - - report := &ReconciliationReport{ - StartTime: 
startTime, - } - - // 1. Get all VMs from database - dbVMs, err := r.vmRepo.ListAllVMsWithContext(ctx) - if err != nil { - r.logger.ErrorContext(ctx, "failed to list VMs from database", - slog.String("error", err.Error()), - ) - report.Errors = append(report.Errors, fmt.Sprintf("database query failed: %v", err)) - return report - } - - report.DatabaseVMCount = len(dbVMs) - r.logger.InfoContext(ctx, "found VMs in database", - slog.Int("count", len(dbVMs)), - ) - - // 2. Get all running Firecracker processes - runningProcesses, err := r.getRunningFirecrackerProcesses() - if err != nil { - r.logger.WarnContext(ctx, "failed to get running Firecracker processes", - slog.String("error", err.Error()), - ) - report.Errors = append(report.Errors, fmt.Sprintf("process scan failed: %v", err)) - } - - report.RunningProcessCount = len(runningProcesses) - r.logger.InfoContext(ctx, "found running Firecracker processes", - slog.Int("count", len(runningProcesses)), - ) - - // 3. Reconcile each VM - for _, vm := range dbVMs { - vmReport := r.reconcileVM(ctx, vm, runningProcesses) - report.VMReports = append(report.VMReports, vmReport) - - switch vmReport.Action { - case ReconcileActionMarkDead: - report.MarkedDead++ - case ReconcileActionUpdateState: - report.StateUpdated++ - case ReconcileActionDeleteOrphan: - report.OrphansDeleted++ - case ReconcileActionNoChange: - report.NoChangeNeeded++ - case ReconcileActionError: - report.ErrorCount++ - } - } - - report.Duration = time.Since(startTime) - - r.logger.InfoContext(ctx, "VM reconciliation cycle completed", - slog.Duration("duration", report.Duration), - slog.Int("database_vms", report.DatabaseVMCount), - slog.Int("running_processes", report.RunningProcessCount), - slog.Int("marked_dead", report.MarkedDead), - slog.Int("state_updated", report.StateUpdated), - slog.Int("orphans_deleted", report.OrphansDeleted), - slog.Int("no_change", report.NoChangeNeeded), - slog.Int("errors", report.ErrorCount), - ) - - return report -} - -// 
reconcileVM reconciles a single VM's state -func (r *VMReconciler) reconcileVM(ctx context.Context, vm *database.VM, runningProcesses map[string]FirecrackerProcess) VMReconciliationReport { - vmReport := VMReconciliationReport{ - VMID: vm.ID, - DatabaseState: metaldv1.VmState(vm.State), - } - - // Handle nil ProcessID safely - processID := "" - if vm.ProcessID != nil { - processID = *vm.ProcessID - vmReport.ProcessID = processID - } - - // Check if the VM process is actually running - isProcessRunning := false - if processID != "" { - if proc, exists := runningProcesses[processID]; exists { - isProcessRunning = true - vmReport.ProcessExists = true - vmReport.ProcessInfo = proc - } - } - - // Determine what action to take based on database state vs reality - switch metaldv1.VmState(vm.State) { - case metaldv1.VmState_VM_STATE_RUNNING, metaldv1.VmState_VM_STATE_CREATED: - if !isProcessRunning { - // VM is supposed to be running but process doesn't exist - r.logger.WarnContext(ctx, "VM marked as running but process not found - marking as shutdown", - slog.String("vm_id", vm.ID), - slog.String("database_state", metaldv1.VmState(vm.State).String()), - slog.String("process_id", processID), - ) - - // Mark VM as shutdown in database - if err := r.markVMDead(ctx, vm.ID, "process not found during reconciliation"); err != nil { - vmReport.Action = ReconcileActionError - vmReport.Error = fmt.Sprintf("failed to mark VM as shutdown: %v", err) - } else { - vmReport.Action = ReconcileActionMarkDead - vmReport.NewState = metaldv1.VmState_VM_STATE_SHUTDOWN - } - } else { - // VM and process both exist - state is consistent - vmReport.Action = ReconcileActionNoChange - } - - case metaldv1.VmState_VM_STATE_SHUTDOWN, metaldv1.VmState_VM_STATE_PAUSED: - if isProcessRunning { - // VM is marked as dead but process is still running - update state - r.logger.InfoContext(ctx, "VM marked as shutdown but process is running - updating state", - slog.String("vm_id", vm.ID), - 
slog.String("database_state", metaldv1.VmState(vm.State).String()), - slog.String("process_id", processID), - ) - - if err := r.updateVMState(ctx, vm.ID, metaldv1.VmState_VM_STATE_RUNNING); err != nil { - vmReport.Action = ReconcileActionError - vmReport.Error = fmt.Sprintf("failed to update VM state: %v", err) - } else { - vmReport.Action = ReconcileActionUpdateState - vmReport.NewState = metaldv1.VmState_VM_STATE_RUNNING - } - } else { - // VM and process are both shutdown - check if this is an orphaned record - if r.isOrphanedRecord(ctx, vm) { - r.logger.WarnContext(ctx, "detected orphaned database record - deleting", - slog.String("vm_id", vm.ID), - slog.Time("updated_at", vm.UpdatedAt), - slog.Duration("age", time.Since(vm.UpdatedAt)), - ) - - if err := r.deleteOrphanedVM(ctx, vm.ID); err != nil { - vmReport.Action = ReconcileActionError - vmReport.Error = fmt.Sprintf("failed to delete orphaned VM: %v", err) - } else { - vmReport.Action = ReconcileActionDeleteOrphan - } - } else { - // Valid shutdown VM - leave it alone - vmReport.Action = ReconcileActionNoChange - } - } - - default: - // Unknown state - vmReport.Action = ReconcileActionNoChange - } - - return vmReport -} - -// markVMDead marks a VM as dead in the database -func (r *VMReconciler) markVMDead(ctx context.Context, vmID, reason string) error { - return r.vmRepo.UpdateVMStateWithContextInt(ctx, vmID, int(metaldv1.VmState_VM_STATE_SHUTDOWN)) -} - -// updateVMState updates a VM's state in the database -func (r *VMReconciler) updateVMState(ctx context.Context, vmID string, newState metaldv1.VmState) error { - return r.vmRepo.UpdateVMStateWithContextInt(ctx, vmID, int(newState)) -} - -// isOrphanedRecord determines if a shutdown VM is actually an orphaned database record -// Uses defense-in-depth approach: age-based + validation-based + tracking-based checks -func (r *VMReconciler) isOrphanedRecord(ctx context.Context, vm *database.VM) bool { - now := time.Now() - - // Defense 1: Age-based check - very 
conservative threshold - shutdownAge := now.Sub(vm.UpdatedAt) - if shutdownAge < OrphanedRecordAgeThreshold { - r.logger.DebugContext(ctx, "VM not old enough to be considered orphaned", - slog.String("vm_id", vm.ID), - slog.Duration("age", shutdownAge), - slog.Duration("threshold", OrphanedRecordAgeThreshold), - ) - return false - } - - // Defense 2: Validation-based check - verify VM resources don't exist - if r.vmResourcesExist(ctx, vm) { - r.logger.DebugContext(ctx, "VM resources still exist - not orphaned", - slog.String("vm_id", vm.ID), - ) - return false - } - - // Defense 3: Tracking-based check - look for signs of improper shutdown - if r.hasProperShutdownMarkers(ctx, vm) { - r.logger.DebugContext(ctx, "VM has proper shutdown markers - not orphaned", - slog.String("vm_id", vm.ID), - ) - return false - } - - // All checks passed - this appears to be an orphaned record - r.logger.InfoContext(ctx, "VM identified as orphaned record", - slog.String("vm_id", vm.ID), - slog.Duration("age", shutdownAge), - ) - - return true -} - -// vmResourcesExist checks if VM-related resources still exist (network, storage, etc.) 
-func (r *VMReconciler) vmResourcesExist(ctx context.Context, vm *database.VM) bool { - // AIDEV-TODO: Implement resource validation checks - // For now, we'll assume resources don't exist if no process is running - // Future enhancements could check: - // - Network namespace existence - // - TAP device existence - // - Storage file existence - // - Jailer chroot directory existence - - return false -} - -// hasProperShutdownMarkers checks for evidence of proper VM shutdown -func (r *VMReconciler) hasProperShutdownMarkers(ctx context.Context, vm *database.VM) bool { - // AIDEV-TODO: Implement shutdown tracking - // For now, we'll assume VMs without proper markers are orphaned - // Future enhancements could check: - // - Shutdown reason metadata - // - Graceful shutdown logs - // - Process exit code tracking - - return false -} - -// deleteOrphanedVM safely deletes an orphaned VM record from the database -func (r *VMReconciler) deleteOrphanedVM(ctx context.Context, vmID string) error { - r.logger.InfoContext(ctx, "deleting orphaned VM record", - slog.String("vm_id", vmID), - ) - - // Use soft delete to maintain audit trail - if err := r.vmRepo.DeleteVMWithContext(ctx, vmID); err != nil { - r.logger.ErrorContext(ctx, "failed to delete orphaned VM", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - return fmt.Errorf("failed to delete orphaned VM %s: %w", vmID, err) - } - - r.logger.InfoContext(ctx, "successfully deleted orphaned VM record", - slog.String("vm_id", vmID), - ) - - return nil -} - -// getRunningFirecrackerProcesses scans for running Firecracker processes -func (r *VMReconciler) getRunningFirecrackerProcesses() (map[string]FirecrackerProcess, error) { - processes := make(map[string]FirecrackerProcess) - - // Use procfs to find Firecracker processes - entries, err := os.ReadDir("/proc") - if err != nil { - return nil, fmt.Errorf("failed to read /proc: %w", err) - } - - for _, entry := range entries { - if !entry.IsDir() { - continue - 
} - - // Check if directory name is a PID - pid, err := strconv.Atoi(entry.Name()) - if err != nil { - continue - } - - // Read process command line - cmdlinePath := fmt.Sprintf("/proc/%d/cmdline", pid) - cmdlineBytes, err := os.ReadFile(cmdlinePath) - if err != nil { - continue // Process might have disappeared - } - - cmdline := string(cmdlineBytes) - - // Check if this is a Firecracker process - if strings.Contains(cmdline, "firecracker") || strings.Contains(cmdline, "fc_vcpu") { - // Extract VM ID from command line if possible - vmID := r.extractVMIDFromCmdline(cmdline) - - process := FirecrackerProcess{ - PID: pid, - Cmdline: cmdline, - VMID: vmID, - } - - processes[strconv.Itoa(pid)] = process - } - } - - return processes, nil -} - -// extractVMIDFromCmdline attempts to extract VM ID from Firecracker command line -func (r *VMReconciler) extractVMIDFromCmdline(cmdline string) string { - // Look for VM ID patterns in the command line - // This is heuristic-based and may need adjustment - - // Pattern 1: --id vm-id or --id=vm-id - if strings.Contains(cmdline, "--id") { - parts := strings.Fields(strings.ReplaceAll(cmdline, "\x00", " ")) - for i, part := range parts { - if part == "--id" && i+1 < len(parts) { - return parts[i+1] - } - if strings.HasPrefix(part, "--id=") { - return strings.TrimPrefix(part, "--id=") - } - } - } - - // Pattern 2: VM ID in socket path - if strings.Contains(cmdline, "vm-") { - fields := strings.Fields(strings.ReplaceAll(cmdline, "\x00", " ")) - for _, field := range fields { - if strings.Contains(field, "vm-") { - // Extract VM ID from socket path or similar - parts := strings.Split(field, "/") - for _, part := range parts { - if strings.HasPrefix(part, "vm-") { - return part - } - } - } - } - } - - return "" // Could not extract VM ID -} - -// FirecrackerProcess represents a running Firecracker process -type FirecrackerProcess struct { - PID int `json:"pid"` - Cmdline string `json:"cmdline"` - VMID string `json:"vm_id,omitempty"` -} - 
-// ReconciliationReport contains the results of a reconciliation cycle -type ReconciliationReport struct { - StartTime time.Time `json:"start_time"` - Duration time.Duration `json:"duration"` - DatabaseVMCount int `json:"database_vm_count"` - RunningProcessCount int `json:"running_process_count"` - MarkedDead int `json:"marked_dead"` - StateUpdated int `json:"state_updated"` - OrphansDeleted int `json:"orphans_deleted"` - NoChangeNeeded int `json:"no_change_needed"` - ErrorCount int `json:"error_count"` - VMReports []VMReconciliationReport `json:"vm_reports"` - Errors []string `json:"errors"` -} - -// VMReconciliationReport contains the results for a specific VM -type VMReconciliationReport struct { - VMID string `json:"vm_id"` - DatabaseState metaldv1.VmState `json:"database_state"` - ProcessID string `json:"process_id"` - ProcessExists bool `json:"process_exists"` - ProcessInfo FirecrackerProcess `json:"process_info,omitempty"` - Action ReconcileAction `json:"action"` - NewState metaldv1.VmState `json:"new_state,omitempty"` - Error string `json:"error,omitempty"` -} - -// ReconcileAction represents the action taken during reconciliation -type ReconcileAction string - -const ( - ReconcileActionNoChange ReconcileAction = "no_change" - ReconcileActionMarkDead ReconcileAction = "mark_dead" - ReconcileActionUpdateState ReconcileAction = "update_state" - ReconcileActionDeleteOrphan ReconcileAction = "delete_orphan" - ReconcileActionError ReconcileAction = "error" -) - -// AIDEV-BUSINESS_RULE: Orphaned record cleanup thresholds - conservative to protect customer VMs -const ( - // Only consider VMs orphaned after being shutdown for a very long time - OrphanedRecordAgeThreshold = 7 * 24 * time.Hour // 1 week - conservative - - // Maximum time a VM should reasonably be shutdown before cleanup consideration - MaxReasonableShutdownTime = 30 * 24 * time.Hour // 30 days - very conservative -) diff --git a/go/deploy/metald/internal/service/CLEANUP_BENCHMARKS.md 
b/go/deploy/metald/internal/service/CLEANUP_BENCHMARKS.md deleted file mode 100644 index fb6da23326..0000000000 --- a/go/deploy/metald/internal/service/CLEANUP_BENCHMARKS.md +++ /dev/null @@ -1,252 +0,0 @@ -# VM Cleanup Performance Benchmarks - -This document describes the performance benchmarks for the `performVMCleanup()` method and how to interpret the results. - -## Overview - -The VM cleanup process is critical for preventing resource leaks when VM creation fails after backend operations succeed. These benchmarks test various scenarios to ensure the cleanup mechanism performs well under load. - -## Running Benchmarks - -### Basic Benchmark Run - -```bash -# Run all cleanup benchmarks -go test -bench=BenchmarkCleanup -benchmem ./internal/service/ - -# Run specific benchmark -go test -bench=BenchmarkCleanupSuccess -benchmem ./internal/service/ - -# Run with verbose output -go test -bench=BenchmarkCleanup -benchmem -v ./internal/service/ -``` - -### Extended Benchmark Run - -```bash -# Run for longer duration to get stable results -go test -bench=BenchmarkCleanup -benchtime=10s -benchmem ./internal/service/ - -# Run with CPU profiling -go test -bench=BenchmarkCleanupConcurrent -cpuprofile=cleanup.prof ./internal/service/ - -# Run with memory profiling -go test -bench=BenchmarkCleanupMemoryUsage -memprofile=cleanup_mem.prof ./internal/service/ -``` - -## Benchmark Scenarios - -### 1. BenchmarkCleanupSuccess -**Purpose**: Tests optimal performance with fast, successful backend operations. -**Conditions**: 10ms backend latency, 0% failure rate -**Key Metrics**: -- Operations per second -- Memory allocations per operation -- Backend calls (should equal number of operations) - -### 2. BenchmarkCleanupWithRetries -**Purpose**: Tests performance when retries are frequently needed. 
-**Conditions**: 5ms backend latency, 40% failure rate -**Key Metrics**: -- Operations per second (lower than success case) -- Backend calls (should be 1.6x operations due to retries) -- Memory allocations (higher due to retry logic) - -### 3. BenchmarkCleanupConcurrent -**Purpose**: Tests scalability with concurrent cleanup operations. -**Conditions**: Variable concurrency levels (1, 10, 50, 100, 200) -**Key Metrics**: -- Throughput scaling with concurrency -- Maximum concurrent backend calls -- Memory allocation patterns - -### 4. BenchmarkCleanupSlowBackend -**Purpose**: Tests performance with slow backend responses. -**Conditions**: Variable latencies (50ms to 1s) -**Key Metrics**: -- Impact of backend latency on overall performance -- Context timeout behavior -- Resource usage during waiting - -### 5. BenchmarkCleanupContextCancellation -**Purpose**: Tests grace period context functionality. -**Conditions**: Original context cancelled before operation completes -**Key Metrics**: -- Success rate (should be 100% due to grace period) -- Grace period effectiveness -- Resource cleanup behavior - -### 6. BenchmarkCleanupMemoryUsage -**Purpose**: Measures memory allocation patterns in detail. -**Conditions**: Fast operations with some failures -**Key Metrics**: -- Bytes allocated per operation -- Number of allocations per operation -- Memory allocation efficiency - -### 7. BenchmarkCleanupStressTest -**Purpose**: Tests burst scenarios with many concurrent cleanups. -**Conditions**: Burst sizes from 10 to 500 concurrent operations -**Key Metrics**: -- Burst completion time -- Maximum concurrent backend calls -- System resource usage - -### 8. BenchmarkCleanupFailureRecovery -**Purpose**: Tests behavior under total backend failure. 
-**Conditions**: 100% backend failure rate -**Key Metrics**: -- Retry behavior (should see exactly 3x backend calls) -- Failure detection speed -- Resource cleanup after failures - -## Interpreting Results - -### Sample Output Explanation - -``` -BenchmarkCleanupSuccess-8 1000 1205834 ns/op 328 B/op 12 allocs/op - backend_calls: 1000 - max_concurrent: 8 -``` - -**Breakdown**: -- `1000`: Number of iterations completed -- `1205834 ns/op`: Average time per operation (1.2ms) -- `328 B/op`: Bytes allocated per operation -- `12 allocs/op`: Number of memory allocations per operation -- `backend_calls: 1000`: Total backend calls made -- `max_concurrent: 8`: Maximum concurrent backend operations - -### Performance Targets - -| Metric | Target | Rationale | -|--------|--------|-----------| -| **Successful Cleanup** | < 50ms/op | Fast cleanup prevents request delays | -| **With Retries** | < 150ms/op | 3 retries with backoff should complete quickly | -| **Memory Usage** | < 1KB/op | Low allocation prevents GC pressure | -| **Concurrent Scaling** | Linear to 100 ops | Should scale well on multi-core systems | -| **Failure Recovery** | < 5s total | Quick failure detection and reporting | - -### Warning Signs - -🚨 **Performance Issues to Watch For**: - -1. **High Memory Allocation** - ``` - 10000 B/op 500 allocs/op - ``` - - Indicates potential memory leaks or inefficient allocation patterns - -2. **Poor Concurrency Scaling** - ``` - BenchmarkCleanupConcurrent/concurrency-1-8 1000 1000000 ns/op - BenchmarkCleanupConcurrent/concurrency-100-8 10 100000000 ns/op # 100x slower! - ``` - - Should scale roughly linearly with concurrency - -3. **Excessive Backend Calls** - ``` - backend_calls: 5000 # For 1000 operations - indicates retry storms - ``` - -4. 
**Context Grace Period Failures** - ``` - BenchmarkCleanupContextCancellation: 50% success rate - ``` - - Should be nearly 100% due to grace period context - -## Performance Analysis Tools - -### CPU Profiling - -```bash -# Generate CPU profile -go test -bench=BenchmarkCleanupConcurrent -cpuprofile=cpu.prof ./internal/service/ - -# Analyze with pprof -go tool pprof cpu.prof -(pprof) top10 -(pprof) web -``` - -### Memory Profiling - -```bash -# Generate memory profile -go test -bench=BenchmarkCleanupMemoryUsage -memprofile=mem.prof ./internal/service/ - -# Analyze memory usage -go tool pprof mem.prof -(pprof) top10 -(pprof) list performVMCleanup -``` - -### Trace Analysis - -```bash -# Generate execution trace -go test -bench=BenchmarkCleanupStressTest -trace=trace.out ./internal/service/ - -# View trace -go tool trace trace.out -``` - -## Production Monitoring - -Based on benchmark results, configure production monitoring: - -### Metrics to Track - -```yaml -# Prometheus metrics -- metald_vm_cleanup_duration_seconds -- metald_vm_cleanup_attempts_total -- metald_vm_cleanup_failures_total -- metald_vm_cleanup_concurrent_operations - -# Alerts -- alert: VMCleanupSlow - expr: histogram_quantile(0.95, metald_vm_cleanup_duration_seconds) > 0.1 - for: 5m - -- alert: VMCleanupHighFailureRate - expr: rate(metald_vm_cleanup_failures_total[5m]) > 0.05 - for: 10m -``` - -### Performance Baselines - -Use benchmark results to establish baselines: - -```bash -# Save baseline performance -go test -bench=BenchmarkCleanup -benchmem ./internal/service/ > baseline.txt - -# Compare against baseline -go test -bench=BenchmarkCleanup -benchmem ./internal/service/ > current.txt -benchcmp baseline.txt current.txt -``` - -## Continuous Integration - -Add performance regression testing: - -```yaml -# .github/workflows/performance.yml -- name: Run Cleanup Benchmarks - run: | - go test -bench=BenchmarkCleanup -benchmem ./internal/service/ > bench.txt - # Store results and compare against 
previous runs -``` - -## Optimization Guidelines - -Based on benchmark results: - -1. **If memory usage is high**: Look for unnecessary allocations in retry logic -2. **If concurrency doesn't scale**: Check for lock contention or blocking operations -3. **If retries are excessive**: Tune failure detection or backend timeouts -4. **If grace period fails**: Increase timeout or optimize critical path - -These benchmarks provide comprehensive coverage of the cleanup performance characteristics and help ensure the system remains stable under various load conditions. \ No newline at end of file diff --git a/go/deploy/metald/internal/service/auth.go b/go/deploy/metald/internal/service/auth.go index 0bba3c87f4..af9e9b7608 100644 --- a/go/deploy/metald/internal/service/auth.go +++ b/go/deploy/metald/internal/service/auth.go @@ -12,10 +12,10 @@ import ( // CustomerContext holds customer information extracted from authentication type CustomerContext struct { - CustomerID string - TenantID string - UserID string - WorkspaceID string + UserID string + TenantID string + ProjectID string + EnvironmentID string } // AuthenticationInterceptor validates API requests and enforces customer isolation @@ -56,10 +56,14 @@ func AuthenticationInterceptor(logger *slog.Logger) connect.UnaryInterceptorFunc // Extract requested tenant ID from header and validate access requestedTenantID := req.Header().Get("X-Tenant-ID") + requestedProjectID := req.Header().Get("X-Project-ID") + requestedEnvironmentID := req.Header().Get("X-Environment-ID") logger.LogAttrs(ctx, slog.LevelInfo, "checking tenant access", slog.String("procedure", req.Spec().Procedure), - slog.String("user_id", customerCtx.CustomerID), + slog.String("user_id", customerCtx.UserID), slog.String("requested_tenant", requestedTenantID), + slog.String("requested_project", requestedProjectID), + slog.String("requested_environment", requestedEnvironmentID), ) if requestedTenantID != "" { @@ -67,7 +71,7 @@ func 
AuthenticationInterceptor(logger *slog.Logger) connect.UnaryInterceptorFunc if err := validateTenantAccess(ctx, customerCtx, requestedTenantID); err != nil { logger.LogAttrs(ctx, slog.LevelWarn, "tenant access denied", slog.String("procedure", req.Spec().Procedure), - slog.String("user_id", customerCtx.CustomerID), + slog.String("user_id", customerCtx.UserID), slog.String("requested_tenant", requestedTenantID), slog.String("error", err.Error()), ) @@ -75,7 +79,7 @@ func AuthenticationInterceptor(logger *slog.Logger) connect.UnaryInterceptorFunc } logger.LogAttrs(ctx, slog.LevelInfo, "tenant access granted", slog.String("procedure", req.Spec().Procedure), - slog.String("user_id", customerCtx.CustomerID), + slog.String("user_id", customerCtx.UserID), slog.String("requested_tenant", requestedTenantID), ) } @@ -86,7 +90,7 @@ func AuthenticationInterceptor(logger *slog.Logger) connect.UnaryInterceptorFunc // Log authenticated request logger.LogAttrs(ctx, slog.LevelDebug, "authenticated request", slog.String("procedure", req.Spec().Procedure), - slog.String("customer_id", customerCtx.CustomerID), + slog.String("user_id", customerCtx.UserID), slog.String("tenant_id", customerCtx.TenantID), ) @@ -98,10 +102,8 @@ func AuthenticationInterceptor(logger *slog.Logger) connect.UnaryInterceptorFunc // validateToken validates the API token and returns customer context // TODO: Replace with your actual authentication mechanism (JWT, API keys, etc.) 
func validateToken(ctx context.Context, token string) (*CustomerContext, error) { - _ = ctx // Will be used for auth service calls in production - // DEVELOPMENT MODE: Extract customer_id from token directly - // Format: "dev_customer_" for development - // Production should validate against your auth service + _ = ctx + // // Development mode: Accept simple bearer tokens if strings.HasPrefix(token, "dev_user_") { @@ -111,25 +113,10 @@ func validateToken(ctx context.Context, token string) (*CustomerContext, error) } return &CustomerContext{ - CustomerID: userID, - TenantID: "", // Tenant determined by X-Tenant-ID header - UserID: userID, - WorkspaceID: "dev_workspace", - }, nil - } - - // Legacy support for old dev_customer_ format - if strings.HasPrefix(token, "dev_customer_") { - customerID := strings.TrimPrefix(token, "dev_customer_") - if customerID == "" { - return nil, fmt.Errorf("invalid development token format") - } - - return &CustomerContext{ - CustomerID: customerID, - TenantID: customerID, // Use customer ID as tenant ID for legacy - UserID: customerID, - WorkspaceID: "dev_workspace", + UserID: userID, + TenantID: "", // Tenant determined by X-Tenant-ID header + ProjectID: userID, + EnvironmentID: userID, }, nil } @@ -142,11 +129,11 @@ func validateToken(ctx context.Context, token string) (*CustomerContext, error) func addCustomerContextToBaggage(ctx context.Context, customerCtx *CustomerContext) context.Context { // Create baggage with customer context bag, err := baggage.Parse(fmt.Sprintf( - "customer_id=%s,tenant_id=%s,user_id=%s,workspace_id=%s", - customerCtx.CustomerID, - customerCtx.TenantID, + "user_id=%s,project_id=%s,tenant_id=%s,environment_id=%s", customerCtx.UserID, - customerCtx.WorkspaceID, + customerCtx.ProjectID, + customerCtx.TenantID, + customerCtx.EnvironmentID, )) if err != nil { // Log error but continue - baggage is for observability, not security @@ -159,15 +146,48 @@ func addCustomerContextToBaggage(ctx context.Context, 
customerCtx *CustomerConte return baggage.ContextWithBaggage(ctx, bag) } -// ExtractCustomerID extracts customer ID from request context -func ExtractCustomerID(ctx context.Context) (string, error) { +// ExtractTenantID extracts tenant ID from request context +func ExtractTenantID(ctx context.Context) (string, error) { + if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { + tenantID := requestBaggage.Member("tenant_id").Value() + if tenantID != "" { + return tenantID, nil + } + } + return "", fmt.Errorf("tenant_id not found in context") +} + +// ExtractEnvironmentID extracts environment ID from request context +func ExtractEnvironmentID(ctx context.Context) (string, error) { + if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { + environmentID := requestBaggage.Member("environment_id").Value() + if environmentID != "" { + return environmentID, nil + } + } + return "", fmt.Errorf("environment_id not found in context") +} + +// ExtractProjectID extracts project ID from request context +func ExtractProjectID(ctx context.Context) (string, error) { if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { - customerID := requestBaggage.Member("customer_id").Value() - if customerID != "" { - return customerID, nil + projectID := requestBaggage.Member("project_id").Value() + if projectID != "" { + return projectID, nil } } - return "", fmt.Errorf("customer_id not found in context") + return "", fmt.Errorf("project_id not found in context") +} + +// ExtractUserID extracts user ID from request context +func ExtractUserID(ctx context.Context) (string, error) { + if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { + userID := requestBaggage.Member("user_id").Value() + if userID != "" { + return userID, nil + } + } + return "", fmt.Errorf("user_id not found in context") } // validateTenantAccess validates that the authenticated user can access the requested 
tenant @@ -176,7 +196,8 @@ func validateTenantAccess(ctx context.Context, customerCtx *CustomerContext, req // In development mode, allow any authenticated user to access any tenant // TODO: In production, implement proper tenant-user relationship checks - // This should query a tenant membership service or database + // This should query a tenant membership service or database using ctx for timeouts/tracing + _ = ctx // Will be used for database queries in production implementation // For now, basic validation that tenant ID is not empty if requestedTenantID == "" { @@ -185,8 +206,8 @@ func validateTenantAccess(ctx context.Context, customerCtx *CustomerContext, req // Development: Simple access control for demonstration // Block access to "restricted-tenant" unless user is "admin-user" - if requestedTenantID == "restricted-tenant" && customerCtx.CustomerID != "admin-user" { - return fmt.Errorf("access denied: user %s cannot access restricted tenant", customerCtx.CustomerID) + if requestedTenantID == "restricted-tenant" && customerCtx.UserID != "admin-user" { + return fmt.Errorf("access denied: user %s cannot access restricted tenant", customerCtx.UserID) } // In production, this would check: @@ -200,36 +221,3 @@ func validateTenantAccess(ctx context.Context, customerCtx *CustomerContext, req return nil // Allow all other access in development } - -// validateVMOwnership validates that the customer owns the specified VM -func (s *VMService) validateVMOwnership(ctx context.Context, vmID string) error { - // Extract customer ID from authenticated context - customerID, err := ExtractCustomerID(ctx) - if err != nil { - return connect.NewError(connect.CodeUnauthenticated, err) - } - - // Get VM from database - vm, err := s.vmRepo.GetVMWithContext(ctx, vmID) - if err != nil { - s.logger.LogAttrs(ctx, slog.LevelWarn, "vm not found during ownership validation", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - ) - return 
connect.NewError(connect.CodeNotFound, fmt.Errorf("VM not found: %s", vmID)) - } - - // Validate ownership - if vm.CustomerID != customerID { - s.logger.LogAttrs(ctx, slog.LevelWarn, "SECURITY: unauthorized vm access attempt", - slog.String("vm_id", vmID), - slog.String("requesting_customer", customerID), - slog.String("vm_owner", vm.CustomerID), - slog.String("action", "access_denied"), - ) - return connect.NewError(connect.CodePermissionDenied, - fmt.Errorf("access denied: VM not owned by customer")) - } - - return nil -} diff --git a/go/deploy/metald/internal/service/vm.go b/go/deploy/metald/internal/service/vm.go index f3317ae10d..6de248551b 100644 --- a/go/deploy/metald/internal/service/vm.go +++ b/go/deploy/metald/internal/service/vm.go @@ -5,20 +5,18 @@ import ( "encoding/json" "fmt" "log/slog" - "math" "time" "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" "github.com/unkeyed/unkey/go/deploy/metald/internal/billing" "github.com/unkeyed/unkey/go/deploy/metald/internal/database" "github.com/unkeyed/unkey/go/deploy/metald/internal/observability" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" - "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1/vmprovisionerv1connect" + metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metald/v1" + "github.com/unkeyed/unkey/go/gen/proto/metald/v1/metaldv1connect" "connectrpc.com/connect" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/baggage" "go.opentelemetry.io/otel/trace" ) @@ -28,24 +26,31 @@ type VMService struct { logger *slog.Logger metricsCollector *billing.MetricsCollector vmMetrics *observability.VMMetrics - vmRepo *database.VMRepository tracer trace.Tracer - vmprovisionerv1connect.UnimplementedVmServiceHandler + queries database.Querier + metaldv1connect.UnimplementedVmServiceHandler } // NewVMService creates a new VM service instance -func NewVMService(backend types.Backend, logger *slog.Logger, metricsCollector 
*billing.MetricsCollector, vmMetrics *observability.VMMetrics, vmRepo *database.VMRepository) *VMService { +func NewVMService(backend types.Backend, logger *slog.Logger, metricsCollector *billing.MetricsCollector, vmMetrics *observability.VMMetrics, queries database.Querier) *VMService { tracer := otel.Tracer("metald.service.vm") return &VMService{ //nolint:exhaustruct // UnimplementedVmServiceHandler is embedded and provides default implementations backend: backend, logger: logger.With("service", "vm"), metricsCollector: metricsCollector, vmMetrics: vmMetrics, - vmRepo: vmRepo, + queries: queries, tracer: tracer, } } +// CreateDeployment allocates a network, generates IDs etc +func (s *VMService) CreateDeployment(ctx context.Context, req *connect.Request[metaldv1.CreateDeploymentRequest]) (*connect.Response[metaldv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&metaldv1.CreateDeploymentResponse{ + VmIds: []string{"a"}, + }), nil +} + // CreateVm creates a new VM instance func (s *VMService) CreateVm(ctx context.Context, req *connect.Request[metaldv1.CreateVmRequest]) (*connect.Response[metaldv1.CreateVmResponse], error) { ctx, span := s.tracer.Start(ctx, "metald.vm.create", @@ -66,11 +71,11 @@ func (s *VMService) CreateVm(ctx context.Context, req *connect.Request[metaldv1. } config := req.Msg.GetConfig() - + // DEBUG: Log full request config for debugging if config != nil { configJSON, _ := json.Marshal(config) - s.logger.LogAttrs(ctx, slog.LevelInfo, "DEBUG: Full VM config received", + s.logger.LogAttrs(ctx, slog.LevelDebug, "full VM config received", slog.String("config_json", string(configJSON)), ) } @@ -84,43 +89,36 @@ func (s *VMService) CreateVm(ctx context.Context, req *connect.Request[metaldv1. 
return nil, connect.NewError(connect.CodeInvalidArgument, err) } - // Extract authenticated customer ID from context - customerID, err := ExtractCustomerID(ctx) - if err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "missing authenticated customer context") - if s.vmMetrics != nil { - s.vmMetrics.RecordVMCreateFailure(ctx, s.getBackendType(), "missing_customer_context") - } - return nil, connect.NewError(connect.CodeUnauthenticated, fmt.Errorf("customer authentication required")) - } - - // Validate that request customer_id matches authenticated customer (if provided) - if req.Msg.GetCustomerId() != "" && req.Msg.GetCustomerId() != customerID { - s.logger.LogAttrs(ctx, slog.LevelWarn, "SECURITY: customer_id mismatch in request", - slog.String("authenticated_customer", customerID), - slog.String("request_customer", req.Msg.GetCustomerId()), - ) - return nil, connect.NewError(connect.CodePermissionDenied, fmt.Errorf("customer_id mismatch")) - } - // Validate required fields - if validateErr := s.validateVMConfig(config); validateErr != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "invalid vm config", - slog.String("error", validateErr.Error()), + // if validateErr := s.validateVMConfig(config); validateErr != nil { + // s.logger.LogAttrs(ctx, slog.LevelError, "invalid vm config", + // slog.String("error", validateErr.Error()), + // ) + // if s.vmMetrics != nil { + // s.vmMetrics.RecordVMCreateFailure(ctx, s.getBackendType(), "invalid_config") + // } + // return nil, connect.NewError(connect.CodeInvalidArgument, validateErr) + // } + + network, netErr := s.queries.AllocateNetwork(ctx) + if netErr != nil { + s.logger.Info("failed to allocate network", + slog.String("error", netErr.Error()), ) - if s.vmMetrics != nil { - s.vmMetrics.RecordVMCreateFailure(ctx, s.getBackendType(), "invalid_config") - } - return nil, connect.NewError(connect.CodeInvalidArgument, validateErr) + return nil, connect.NewError(connect.CodeInternal, netErr) } - // Add tenant context to 
logs for audit trail - // AIDEV-NOTE: In multi-tenant systems, all VM operations should be logged with tenant context - s.logWithTenantContext(ctx, slog.LevelInfo, "creating vm", - slog.Int("vcpus", int(config.GetCpu().GetVcpuCount())), - slog.Int64("memory_bytes", config.GetMemory().GetSizeBytes()), + s.logger.Info("network allocated", + slog.String("network_cidr", network.BaseNetwork), ) + ip, ipErr := s.queries.AllocateIP(ctx, database.AllocateIPParams{ + VmID: req.Msg.GetVmId(), + }) + if ipErr != nil { + return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to allocate IP for vm: %w", ipErr)) + } + // Create VM using backend (config is already in unified format) start := time.Now() vmID, err := s.backend.CreateVM(ctx, config) @@ -131,44 +129,15 @@ func (s *VMService) CreateVm(ctx context.Context, req *connect.Request[metaldv1. attribute.String("error.type", "backend_error"), attribute.String("error.message", err.Error()), ) - s.logWithTenantContext(ctx, slog.LevelError, "failed to create vm", - slog.String("error", err.Error()), - ) if s.vmMetrics != nil { s.vmMetrics.RecordVMCreateFailure(ctx, s.getBackendType(), "backend_error") } return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to create vm: %w", err)) } - // Persist VM to database - critical for state consistency - if err := s.vmRepo.CreateVMWithContext(ctx, vmID, customerID, config, metaldv1.VmState_VM_STATE_CREATED); err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to persist vm to database", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("error", err.Error()), - ) - - // Attempt robust cleanup with retries to prevent resource leaks - cleanupSuccess := s.performVMCleanup(ctx, vmID, "database_persistence_failure") - if !cleanupSuccess { - // Log critical error - this VM is now orphaned and requires manual intervention - s.logger.LogAttrs(ctx, slog.LevelError, "CRITICAL: vm cleanup failed after database error - 
orphaned vm detected", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("action_required", "manual_cleanup_needed"), - ) - } - - if s.vmMetrics != nil { - s.vmMetrics.RecordVMCreateFailure(ctx, s.getBackendType(), "database_error") - } - return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to persist vm: %w", err)) - } - // Record success attributes span.SetAttributes( attribute.String("vm_id", vmID), - attribute.String("customer_id", customerID), attribute.Int64("duration_ms", duration.Milliseconds()), attribute.Bool("success", true), ) @@ -184,7 +153,10 @@ func (s *VMService) CreateVm(ctx context.Context, req *connect.Request[metaldv1. } return connect.NewResponse(&metaldv1.CreateVmResponse{ - VmId: vmID, + Endpoint: &metaldv1.Endpoint{ + Host: ip.IpAddr, + Port: 35428, + }, State: metaldv1.VmState_VM_STATE_CREATED, }), nil } @@ -211,14 +183,6 @@ func (s *VMService) DeleteVm(ctx context.Context, req *connect.Request[metaldv1. return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - if s.vmMetrics != nil { - s.vmMetrics.RecordVMDeleteFailure(ctx, vmID, s.getBackendType(), "ownership_validation_failed") - } - return nil, err - } - // AIDEV-NOTE: Metrics collection re-enabled - metald now reads from Firecracker stats sockets // Stop metrics collection before deletion if s.metricsCollector != nil { @@ -242,28 +206,6 @@ func (s *VMService) DeleteVm(ctx context.Context, req *connect.Request[metaldv1. 
return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to delete vm: %w", err)) } - // Soft delete VM in database - required for state consistency - if err := s.vmRepo.DeleteVMWithContext(ctx, vmID); err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to delete vm from database", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - - // Database state consistency is critical - record as partial failure - if s.vmMetrics != nil { - s.vmMetrics.RecordVMDeleteFailure(ctx, vmID, s.getBackendType(), "database_error") - } - - // Log warning about state inconsistency but don't fail the operation - // since backend deletion was successful - s.logger.LogAttrs(ctx, slog.LevelWarn, "vm delete succeeded in backend but failed in database - state inconsistency detected", - slog.String("vm_id", vmID), - slog.String("backend_status", "deleted"), - slog.String("database_status", "active"), - slog.String("action_required", "manual_database_cleanup"), - ) - } - s.logger.LogAttrs(ctx, slog.LevelInfo, "vm deleted successfully", slog.String("vm_id", vmID), slog.Duration("duration", duration), @@ -310,14 +252,6 @@ func (s *VMService) BootVm(ctx context.Context, req *connect.Request[metaldv1.Bo return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - if s.vmMetrics != nil { - s.vmMetrics.RecordVMBootFailure(ctx, vmID, s.getBackendType(), "ownership_validation_failed") - } - return nil, err - } - start := time.Now() err := s.backend.BootVM(ctx, vmID) duration := time.Since(start) @@ -337,41 +271,6 @@ func (s *VMService) BootVm(ctx context.Context, req *connect.Request[metaldv1.Bo return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to boot vm: %w", err)) } - // Update VM state in database - required for state consistency - if err := s.vmRepo.UpdateVMStateWithContext(ctx, vmID, 
metaldv1.VmState_VM_STATE_RUNNING, nil); err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to update vm state in database", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - - // Log warning about state inconsistency - s.logger.LogAttrs(ctx, slog.LevelWarn, "vm boot succeeded in backend but state update failed in database - state inconsistency detected", - slog.String("vm_id", vmID), - slog.String("backend_status", "running"), - slog.String("database_status", "unknown"), - slog.String("action_required", "manual_state_sync"), - ) - } - - // AIDEV-NOTE: Metrics collection re-enabled - metald now reads from Firecracker stats sockets - // Start metrics collection for billing - if s.metricsCollector != nil { - customerID := s.extractCustomerID(ctx, vmID) - if err := s.metricsCollector.StartCollection(vmID, customerID); err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to start metrics collection", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - slog.String("error", err.Error()), - ) - // Don't fail VM boot if metrics collection fails - } else { - s.logger.LogAttrs(ctx, slog.LevelInfo, "started metrics collection", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - ) - } - } - // Record success attributes span.SetAttributes( attribute.String("vm_id", vmID), @@ -390,8 +289,7 @@ func (s *VMService) BootVm(ctx context.Context, req *connect.Request[metaldv1.Bo } return connect.NewResponse(&metaldv1.BootVmResponse{ - Success: true, - State: metaldv1.VmState_VM_STATE_RUNNING, + State: metaldv1.VmState_VM_STATE_RUNNING, }), nil } @@ -422,14 +320,6 @@ func (s *VMService) ShutdownVm(ctx context.Context, req *connect.Request[metaldv return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - if s.vmMetrics != nil { - s.vmMetrics.RecordVMShutdownFailure(ctx, 
vmID, s.getBackendType(), force, "ownership_validation_failed") - } - return nil, err - } - // AIDEV-NOTE: Metrics collection re-enabled - metald now reads from Firecracker stats sockets // Stop metrics collection before shutdown if s.metricsCollector != nil { @@ -453,22 +343,6 @@ func (s *VMService) ShutdownVm(ctx context.Context, req *connect.Request[metaldv return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to shutdown vm: %w", err)) } - // Update VM state in database - required for state consistency - if err := s.vmRepo.UpdateVMStateWithContext(ctx, vmID, metaldv1.VmState_VM_STATE_SHUTDOWN, nil); err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to update vm state in database", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - - // Log warning about state inconsistency - s.logger.LogAttrs(ctx, slog.LevelWarn, "vm shutdown succeeded in backend but state update failed in database - state inconsistency detected", - slog.String("vm_id", vmID), - slog.String("backend_status", "shutdown"), - slog.String("database_status", "unknown"), - slog.String("action_required", "manual_state_sync"), - ) - } - s.logger.LogAttrs(ctx, slog.LevelInfo, "vm shutdown successfully", slog.String("vm_id", vmID), slog.Duration("duration", duration), @@ -480,8 +354,7 @@ func (s *VMService) ShutdownVm(ctx context.Context, req *connect.Request[metaldv } return connect.NewResponse(&metaldv1.ShutdownVmResponse{ - Success: true, - State: metaldv1.VmState_VM_STATE_SHUTDOWN, + State: metaldv1.VmState_VM_STATE_SHUTDOWN, }), nil } @@ -499,11 +372,6 @@ func (s *VMService) PauseVm(ctx context.Context, req *connect.Request[metaldv1.P return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - return nil, err - } - if err := s.backend.PauseVM(ctx, vmID); err != nil { s.logger.LogAttrs(ctx, slog.LevelError, "failed to pause 
vm", slog.String("vm_id", vmID), @@ -517,8 +385,7 @@ func (s *VMService) PauseVm(ctx context.Context, req *connect.Request[metaldv1.P ) return connect.NewResponse(&metaldv1.PauseVmResponse{ - Success: true, - State: metaldv1.VmState_VM_STATE_PAUSED, + State: metaldv1.VmState_VM_STATE_PAUSED, }), nil } @@ -536,11 +403,6 @@ func (s *VMService) ResumeVm(ctx context.Context, req *connect.Request[metaldv1. return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - return nil, err - } - if err := s.backend.ResumeVM(ctx, vmID); err != nil { s.logger.LogAttrs(ctx, slog.LevelError, "failed to resume vm", slog.String("vm_id", vmID), @@ -554,8 +416,7 @@ func (s *VMService) ResumeVm(ctx context.Context, req *connect.Request[metaldv1. ) return connect.NewResponse(&metaldv1.ResumeVmResponse{ - Success: true, - State: metaldv1.VmState_VM_STATE_RUNNING, + State: metaldv1.VmState_VM_STATE_RUNNING, }), nil } @@ -573,11 +434,6 @@ func (s *VMService) RebootVm(ctx context.Context, req *connect.Request[metaldv1. return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - return nil, err - } - if err := s.backend.RebootVM(ctx, vmID); err != nil { s.logger.LogAttrs(ctx, slog.LevelError, "failed to reboot vm", slog.String("vm_id", vmID), @@ -591,8 +447,7 @@ func (s *VMService) RebootVm(ctx context.Context, req *connect.Request[metaldv1. 
) return connect.NewResponse(&metaldv1.RebootVmResponse{ - Success: true, - State: metaldv1.VmState_VM_STATE_RUNNING, + State: metaldv1.VmState_VM_STATE_RUNNING, }), nil } @@ -615,11 +470,6 @@ func (s *VMService) GetVmInfo(ctx context.Context, req *connect.Request[metaldv1 return nil, connect.NewError(connect.CodeInvalidArgument, fmt.Errorf("vm_id is required")) } - // Validate customer ownership - if err := s.validateVMOwnership(ctx, vmID); err != nil { - return nil, err - } - info, err := s.backend.GetVMInfo(ctx, vmID) if err != nil { s.logger.LogAttrs(ctx, slog.LevelError, "failed to get vm info", @@ -635,288 +485,71 @@ func (s *VMService) GetVmInfo(ctx context.Context, req *connect.Request[metaldv1 ) return connect.NewResponse(&metaldv1.GetVmInfoResponse{ //nolint:exhaustruct // Metrics and BackendInfo fields are optional and not populated in this response - VmId: vmID, - Config: info.Config, - State: info.State, - NetworkInfo: info.NetworkInfo, - }), nil -} - -// ListVms lists all VMs managed by this service for the authenticated customer -func (s *VMService) ListVms(ctx context.Context, req *connect.Request[metaldv1.ListVmsRequest]) (*connect.Response[metaldv1.ListVmsResponse], error) { - s.logger.LogAttrs(ctx, slog.LevelInfo, "listing vms", - slog.String("method", "ListVms"), - ) - - // Record VM list request metric - if s.vmMetrics != nil { - s.vmMetrics.RecordVMListRequest(ctx, s.getBackendType()) - } - - // Extract authenticated customer ID for filtering - customerID, err := ExtractCustomerID(ctx) - if err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "missing authenticated customer context") - return nil, connect.NewError(connect.CodeUnauthenticated, fmt.Errorf("customer authentication required")) - } - - // Get VMs from database filtered by customer - dbVMs, err := s.vmRepo.ListVMsByCustomerWithContext(ctx, customerID) - if err != nil { - s.logger.LogAttrs(ctx, slog.LevelError, "failed to list vms from database", - slog.String("customer_id", 
customerID), - slog.String("error", err.Error()), - ) - return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("failed to list vms: %w", err)) - } - - var vms []*metaldv1.VmInfo - // Check for overflow before conversion - if len(dbVMs) > math.MaxInt32 { - s.logger.LogAttrs(ctx, slog.LevelError, "too many VMs to list", - slog.Int("count", len(dbVMs)), - ) - return nil, connect.NewError(connect.CodeResourceExhausted, fmt.Errorf("too many VMs to list: %d", len(dbVMs))) - } - totalCount := int32(len(dbVMs)) //nolint:gosec // Overflow check performed above - - // Convert database VMs to protobuf format - for _, vm := range dbVMs { - vmInfo := &metaldv1.VmInfo{ //nolint:exhaustruct // Optional fields are populated conditionally below based on available data - VmId: vm.ID, - State: vm.State, - CustomerId: vm.CustomerID, - } - - // Add CPU and memory info if available - if vm.ParsedConfig != nil { - if vm.ParsedConfig.GetCpu() != nil { - vmInfo.VcpuCount = vm.ParsedConfig.GetCpu().GetVcpuCount() - } - if vm.ParsedConfig.GetMemory() != nil { - vmInfo.MemorySizeBytes = vm.ParsedConfig.GetMemory().GetSizeBytes() - } - if vm.ParsedConfig.GetMetadata() != nil { - vmInfo.Metadata = vm.ParsedConfig.GetMetadata() - } - } - - // Set timestamps from database - vmInfo.CreatedTimestamp = vm.CreatedAt.Unix() - vmInfo.ModifiedTimestamp = vm.UpdatedAt.Unix() - - vms = append(vms, vmInfo) - } - - s.logger.LogAttrs(ctx, slog.LevelInfo, "vm listing completed", - slog.Int("count", int(totalCount)), - ) - - return connect.NewResponse(&metaldv1.ListVmsResponse{ //nolint:exhaustruct // NextPageToken field not used as pagination is not implemented yet - Vms: vms, - TotalCount: totalCount, + VmId: vmID, + Config: info.Config, + State: info.State, }), nil } // validateVMConfig validates the VM configuration -func (s *VMService) validateVMConfig(config *metaldv1.VmConfig) error { - // AIDEV-BUSINESS_RULE: VM configuration must have CPU, memory, and boot settings - if config.GetCpu() == nil { 
- return fmt.Errorf("cpu configuration is required") - } - - if config.GetMemory() == nil { - return fmt.Errorf("memory configuration is required") - } - - if config.GetBoot() == nil { - return fmt.Errorf("boot configuration is required") - } - - // Validate CPU configuration - cpu := config.GetCpu() - if cpu.GetVcpuCount() <= 0 { - return fmt.Errorf("vcpu_count must be greater than 0") - } - - if cpu.GetMaxVcpuCount() > 0 && cpu.GetMaxVcpuCount() < cpu.GetVcpuCount() { - return fmt.Errorf("max_vcpu_count must be greater than or equal to vcpu_count") - } - - // Validate memory configuration - memory := config.GetMemory() - if memory.GetSizeBytes() <= 0 { - return fmt.Errorf("memory size_bytes must be greater than 0") - } - - // Validate boot configuration - boot := config.GetBoot() - if boot.GetKernelPath() == "" { - return fmt.Errorf("kernel_path is required") - } - - // Validate storage configuration - ensure at least one storage device exists - if len(config.GetStorage()) == 0 { - return fmt.Errorf("at least one storage device is required") - } - - // Validate that we have a root device - hasRootDevice := false - for i, storage := range config.GetStorage() { - if storage.GetPath() == "" { - return fmt.Errorf("storage device %d path is required", i) - } - if storage.GetIsRootDevice() || i == 0 { - hasRootDevice = true - } - } - if !hasRootDevice { - return fmt.Errorf("at least one storage device must be marked as root device") - } - - return nil -} - -// extractCustomerID extracts the customer ID for billing from VM database record -// Falls back to baggage context and finally to default customer ID -func (s *VMService) extractCustomerID(ctx context.Context, vmID string) string { - // First try to get from database (preferred source) - if vm, err := s.vmRepo.GetVMWithContext(ctx, vmID); err == nil { - s.logger.LogAttrs(ctx, slog.LevelDebug, "extracted customer ID from database", - slog.String("vm_id", vmID), - slog.String("customer_id", vm.CustomerID), - ) - 
return vm.CustomerID - } else { - s.logger.LogAttrs(ctx, slog.LevelWarn, "failed to get customer ID from database, trying fallback methods", - slog.String("vm_id", vmID), - slog.String("error", err.Error()), - ) - } - - // Fallback to baggage extraction (for compatibility with existing multi-tenant systems) - if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { - if tenantID := requestBaggage.Member("tenant_id").Value(); tenantID != "" { - s.logger.LogAttrs(ctx, slog.LevelDebug, "extracted customer ID from baggage as fallback", - slog.String("vm_id", vmID), - slog.String("customer_id", tenantID), - ) - return tenantID - } - } - - // Final fallback to default customer ID - customerID := "default-customer" - s.logger.LogAttrs(ctx, slog.LevelWarn, "using default customer ID for billing", - slog.String("vm_id", vmID), - slog.String("customer_id", customerID), - ) - - return customerID -} - -// performVMCleanup attempts robust cleanup of a backend VM with retries -// Returns true if cleanup was successful, false if cleanup failed and VM is orphaned -func (s *VMService) performVMCleanup(ctx context.Context, vmID, reason string) bool { - const maxRetries = 3 - const retryDelay = time.Second - const cleanupGracePeriod = 30 * time.Second - - // Create a cleanup context with grace period to ensure critical cleanup completes - // even if the original context is cancelled - cleanupCtx, cancel := context.WithTimeout(context.Background(), cleanupGracePeriod) - defer cancel() - - s.logger.LogAttrs(ctx, slog.LevelInfo, "attempting vm cleanup", - slog.String("vm_id", vmID), - slog.String("reason", reason), - slog.Int("max_retries", maxRetries), - slog.Duration("grace_period", cleanupGracePeriod), - ) - - for attempt := 1; attempt <= maxRetries; attempt++ { - if attempt > 1 { - // Wait before retry using cleanup context - select { - case <-cleanupCtx.Done(): - s.logger.LogAttrs(ctx, slog.LevelError, "vm cleanup cancelled due to grace period timeout", - 
slog.String("vm_id", vmID), - slog.Int("attempt", attempt), - slog.Duration("grace_period", cleanupGracePeriod), - ) - return false - case <-time.After(retryDelay): - } - } - - s.logger.LogAttrs(ctx, slog.LevelDebug, "attempting vm cleanup", - slog.String("vm_id", vmID), - slog.Int("attempt", attempt), - ) - - if err := s.backend.DeleteVM(cleanupCtx, vmID); err != nil { - s.logger.LogAttrs(ctx, slog.LevelWarn, "vm cleanup attempt failed", - slog.String("vm_id", vmID), - slog.Int("attempt", attempt), - slog.String("error", err.Error()), - ) - - if attempt == maxRetries { - s.logger.LogAttrs(ctx, slog.LevelError, "vm cleanup failed after all retries", - slog.String("vm_id", vmID), - slog.String("final_error", err.Error()), - ) - return false - } - continue - } - - s.logger.LogAttrs(ctx, slog.LevelInfo, "vm cleanup successful", - slog.String("vm_id", vmID), - slog.Int("attempt", attempt), - ) - return true - } - - return false -} - -// logWithTenantContext logs a message with tenant context from baggage for audit trails -// AIDEV-NOTE: Multi-tenant systems require all operations to be logged with tenant context -func (s *VMService) logWithTenantContext(ctx context.Context, level slog.Level, msg string, attrs ...slog.Attr) { - // Extract tenant context from baggage - if requestBaggage := baggage.FromContext(ctx); len(requestBaggage.Members()) > 0 { - tenantID := requestBaggage.Member("tenant_id").Value() - userID := requestBaggage.Member("user_id").Value() - workspaceID := requestBaggage.Member("workspace_id").Value() - - // Add tenant attributes to log - allAttrs := make([]slog.Attr, 0, len(attrs)+3) - if tenantID != "" { - allAttrs = append(allAttrs, slog.String("tenant_id", tenantID)) - } - if userID != "" { - allAttrs = append(allAttrs, slog.String("user_id", userID)) - } - if workspaceID != "" { - allAttrs = append(allAttrs, slog.String("workspace_id", workspaceID)) - } - allAttrs = append(allAttrs, attrs...) - - s.logger.LogAttrs(ctx, level, msg, allAttrs...) 
- } else { - // Fallback to regular logging if no baggage - s.logger.LogAttrs(ctx, level, msg, attrs...) - } -} +// func (s *VMService) validateVMConfig(config *metaldv1.VmConfig) error { +// if config.GetCpu() == nil { +// return fmt.Errorf("cpu configuration is required") +// } + +// if config.GetMemory() == nil { +// return fmt.Errorf("memory configuration is required") +// } + +// if config.GetBoot() == nil { +// return fmt.Errorf("boot configuration is required") +// } + +// // Validate CPU configuration +// cpu := config.GetCpu() +// if cpu.GetVcpuCount() <= 0 { +// return fmt.Errorf("vcpu_count must be greater than 0") +// } + +// if cpu.GetMaxVcpuCount() > 0 && cpu.GetMaxVcpuCount() < cpu.GetVcpuCount() { +// return fmt.Errorf("max_vcpu_count must be greater than or equal to vcpu_count") +// } + +// // Validate memory configuration +// memory := config.GetMemory() +// if memory.GetSizeBytes() <= 0 { +// return fmt.Errorf("memory size_bytes must be greater than 0") +// } + +// // Validate boot configuration +// boot := config.GetBoot() +// if boot.GetKernelPath() == "" { +// return fmt.Errorf("kernel_path is required") +// } + +// // Validate storage configuration - ensure at least one storage device exists +// if len(config.GetStorage()) == 0 { +// return fmt.Errorf("at least one storage device is required") +// } + +// // Validate that we have a root device +// hasRootDevice := false +// for i, storage := range config.GetStorage() { +// if storage.GetPath() == "" { +// return fmt.Errorf("storage device %d path is required", i) +// } +// if storage.GetIsRootDevice() || i == 0 { +// hasRootDevice = true +// } +// } +// if !hasRootDevice { +// return fmt.Errorf("at least one storage device must be marked as root device") +// } + +// return nil +// } // getBackendType returns the backend type as a string for metrics func (s *VMService) getBackendType() string { - // Try to determine backend type from the backend implementation - switch s.backend.(type) { - 
case interface{ GetProcessInfo() map[string]interface{} }: - return "firecracker" - default: - return "cloudhypervisor" - } + return "firecracker" } diff --git a/go/deploy/metald/internal/service/vm_cleanup_bench_test.go b/go/deploy/metald/internal/service/vm_cleanup_bench_test.go deleted file mode 100644 index b88040ed11..0000000000 --- a/go/deploy/metald/internal/service/vm_cleanup_bench_test.go +++ /dev/null @@ -1,358 +0,0 @@ -package service - -import ( - "context" - "errors" - "fmt" - "io" - "log/slog" - "math/rand" - "sync" - "sync/atomic" - "testing" - "time" - - "github.com/unkeyed/unkey/go/deploy/metald/internal/backend/types" - metaldv1 "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1" -) - -// Mock backend for cleanup benchmarks -type mockCleanupBackend struct { - deleteLatency time.Duration - failureRate float64 // 0.0 = never fail, 1.0 = always fail - callCount int64 - concurrentCalls int64 - maxConcurrent int64 - mu sync.Mutex - rng *rand.Rand -} - -func (m *mockCleanupBackend) CreateVM(ctx context.Context, config *metaldv1.VmConfig) (string, error) { - return "test-vm", nil -} - -func (m *mockCleanupBackend) DeleteVM(ctx context.Context, vmID string) error { - // Track concurrent calls - current := atomic.AddInt64(&m.concurrentCalls, 1) - defer atomic.AddInt64(&m.concurrentCalls, -1) - - // Update max concurrent if needed - for { - maxConcurrent := atomic.LoadInt64(&m.maxConcurrent) - if current <= maxConcurrent || atomic.CompareAndSwapInt64(&m.maxConcurrent, maxConcurrent, current) { - break - } - } - - // Increment total call count - atomic.AddInt64(&m.callCount, 1) - - // Simulate latency - if m.deleteLatency > 0 { - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(m.deleteLatency): - } - } - - // Simulate failure rate with proper random distribution - if m.failureRate > 0 { - m.mu.Lock() - // Initialize RNG if not already done - if m.rng == nil { - m.rng = rand.New(rand.NewSource(time.Now().UnixNano())) - } - // 
Generate a random float between 0 and 1 - randomValue := m.rng.Float64() - m.mu.Unlock() - - if randomValue < m.failureRate { - return errors.New("simulated backend failure") - } - } - - return nil -} - -func (m *mockCleanupBackend) BootVM(ctx context.Context, vmID string) error { return nil } -func (m *mockCleanupBackend) ShutdownVM(ctx context.Context, vmID string) error { return nil } -func (m *mockCleanupBackend) ShutdownVMWithOptions(ctx context.Context, vmID string, force bool, timeout int32) error { - return nil -} -func (m *mockCleanupBackend) PauseVM(ctx context.Context, vmID string) error { return nil } -func (m *mockCleanupBackend) ResumeVM(ctx context.Context, vmID string) error { return nil } -func (m *mockCleanupBackend) RebootVM(ctx context.Context, vmID string) error { return nil } -func (m *mockCleanupBackend) GetVMInfo(ctx context.Context, vmID string) (*types.VMInfo, error) { - // Return empty VMInfo for benchmark testing - return &types.VMInfo{}, nil -} -func (m *mockCleanupBackend) GetVMMetrics(ctx context.Context, vmID string) (*types.VMMetrics, error) { - // Return empty metrics for benchmark testing - return &types.VMMetrics{}, nil -} -func (m *mockCleanupBackend) Ping(ctx context.Context) error { return nil } - -func (m *mockCleanupBackend) GetCallCount() int64 { return atomic.LoadInt64(&m.callCount) } -func (m *mockCleanupBackend) GetMaxConcurrent() int64 { return atomic.LoadInt64(&m.maxConcurrent) } -func (m *mockCleanupBackend) Reset() { - atomic.StoreInt64(&m.callCount, 0) - atomic.StoreInt64(&m.concurrentCalls, 0) - atomic.StoreInt64(&m.maxConcurrent, 0) -} - -// createBenchmarkVMService creates a VM service for benchmarking cleanup operations -func createBenchmarkVMService(backend types.Backend) *VMService { - // Use a discarding logger for benchmarks to avoid I/O overhead - logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{ - Level: slog.LevelError, // Only log errors to reduce noise - })) - - // Use nil for 
optional components in benchmarks - return &VMService{ - backend: backend, - logger: logger, - metricsCollector: nil, - vmMetrics: nil, - vmRepo: nil, - } -} - -// BenchmarkCleanupSuccess tests successful cleanup performance -func BenchmarkCleanupSuccess(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 10 * time.Millisecond, // Realistic backend latency - failureRate: 0.0, // No failures - } - service := createBenchmarkVMService(backend) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d", i) - success := service.performVMCleanup(ctx, vmID, "benchmark_test") - if !success { - b.Errorf("cleanup failed unexpectedly for vm %s", vmID) - } - } - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") - b.ReportMetric(float64(backend.GetMaxConcurrent()), "max_concurrent") -} - -// BenchmarkCleanupWithRetries tests cleanup performance when retries are needed -func BenchmarkCleanupWithRetries(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 5 * time.Millisecond, - failureRate: 0.4, // 40% failure rate to trigger retries - } - service := createBenchmarkVMService(backend) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d", i) - service.performVMCleanup(ctx, vmID, "benchmark_test_retries") - } - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") - b.ReportMetric(float64(backend.GetMaxConcurrent()), "max_concurrent") -} - -// BenchmarkCleanupConcurrent tests concurrent cleanup performance -func BenchmarkCleanupConcurrent(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 20 * time.Millisecond, - failureRate: 0.1, // 10% failure rate - } - service := createBenchmarkVMService(backend) - - concurrencyLevels := []int{1, 10, 50, 100, 200} - - for _, concurrency := range concurrencyLevels { - b.Run(fmt.Sprintf("concurrency-%d", concurrency), func(b 
*testing.B) { - backend.Reset() - - b.ResetTimer() - b.ReportAllocs() - - b.RunParallel(func(pb *testing.PB) { - i := 0 - for pb.Next() { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d-%d", concurrency, i) - service.performVMCleanup(ctx, vmID, "benchmark_concurrent") - i++ - } - }) - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") - b.ReportMetric(float64(backend.GetMaxConcurrent()), "max_concurrent") - }) - } -} - -// BenchmarkCleanupSlowBackend tests cleanup with slow backend responses -func BenchmarkCleanupSlowBackend(b *testing.B) { - latencies := []time.Duration{ - 50 * time.Millisecond, - 100 * time.Millisecond, - 500 * time.Millisecond, - 1 * time.Second, - } - - for _, latency := range latencies { - b.Run(fmt.Sprintf("latency-%s", latency), func(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: latency, - failureRate: 0.0, - } - service := createBenchmarkVMService(backend) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d", i) - service.performVMCleanup(ctx, vmID, "benchmark_slow_backend") - } - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") - }) - } -} - -// BenchmarkCleanupContextCancellation tests cleanup behavior with context cancellation -func BenchmarkCleanupContextCancellation(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 100 * time.Millisecond, - failureRate: 0.0, - } - service := createBenchmarkVMService(backend) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - // Create context that cancels after 50ms (before operation completes) - ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) - vmID := fmt.Sprintf("vm-%d", i) - - // This should still succeed due to grace period context - success := service.performVMCleanup(ctx, vmID, "benchmark_cancellation") - cancel() - - // Even with cancelled context, cleanup should succeed due to grace 
period - if !success { - b.Errorf("cleanup failed for vm %s with cancelled context", vmID) - } - } - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") -} - -// BenchmarkCleanupMemoryUsage measures memory allocation patterns -func BenchmarkCleanupMemoryUsage(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 1 * time.Millisecond, - failureRate: 0.2, - } - service := createBenchmarkVMService(backend) - - // Pre-allocate VM IDs to avoid allocation during benchmark - vmIDs := make([]string, b.N) - for i := 0; i < b.N; i++ { - vmIDs[i] = fmt.Sprintf("vm-%d", i) - } - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - ctx := context.Background() - service.performVMCleanup(ctx, vmIDs[i], "benchmark_memory") - } -} - -// BenchmarkCleanupStressTest simulates high-load cleanup scenarios -func BenchmarkCleanupStressTest(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 25 * time.Millisecond, - failureRate: 0.15, // 15% failure rate - } - service := createBenchmarkVMService(backend) - - // Simulate burst cleanup scenarios - burstSizes := []int{10, 50, 100, 500} - - for _, burstSize := range burstSizes { - b.Run(fmt.Sprintf("burst-%d", burstSize), func(b *testing.B) { - backend.Reset() - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - var wg sync.WaitGroup - startTime := time.Now() - - // Launch burst of concurrent cleanups - for j := 0; j < burstSize; j++ { - wg.Add(1) - go func(vmIndex int) { - defer wg.Done() - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d-%d", i, vmIndex) - service.performVMCleanup(ctx, vmID, "benchmark_stress") - }(j) - } - - wg.Wait() - - // Report burst completion time - burstDuration := time.Since(startTime) - b.ReportMetric(float64(burstDuration.Nanoseconds()), "burst_duration_ns") - } - - b.ReportMetric(float64(backend.GetCallCount()), "backend_calls") - b.ReportMetric(float64(backend.GetMaxConcurrent()), "max_concurrent") - }) - } -} - -// 
BenchmarkCleanupFailureRecovery tests cleanup behavior under total backend failure -func BenchmarkCleanupFailureRecovery(b *testing.B) { - backend := &mockCleanupBackend{ - deleteLatency: 10 * time.Millisecond, - failureRate: 1.0, // 100% failure rate - } - service := createBenchmarkVMService(backend) - - b.ResetTimer() - b.ReportAllocs() - - for i := 0; i < b.N; i++ { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d", i) - success := service.performVMCleanup(ctx, vmID, "benchmark_failure") - - // Should fail since backend always fails - if success { - b.Errorf("cleanup succeeded unexpectedly for vm %s", vmID) - } - } - - // Should see exactly 3 attempts per VM (3 retries) - expectedCalls := int64(b.N * 3) - actualCalls := backend.GetCallCount() - if actualCalls != expectedCalls { - b.Errorf("expected %d backend calls, got %d", expectedCalls, actualCalls) - } - - b.ReportMetric(float64(actualCalls), "backend_calls") -} diff --git a/go/deploy/metald/internal/service/vm_cleanup_test.go b/go/deploy/metald/internal/service/vm_cleanup_test.go deleted file mode 100644 index 077443e5e0..0000000000 --- a/go/deploy/metald/internal/service/vm_cleanup_test.go +++ /dev/null @@ -1,152 +0,0 @@ -package service - -import ( - "context" - "fmt" - "testing" - "time" -) - -// TestPerformVMCleanupBasic validates the basic cleanup functionality -func TestPerformVMCleanupBasic(t *testing.T) { - backend := &mockCleanupBackend{ - deleteLatency: 1 * time.Millisecond, - failureRate: 0.0, - } - service := createBenchmarkVMService(backend) - - ctx := context.Background() - success := service.performVMCleanup(ctx, "test-vm", "test_reason") - - if !success { - t.Error("cleanup should have succeeded") - } - - if backend.GetCallCount() != 1 { - t.Errorf("expected 1 backend call, got %d", backend.GetCallCount()) - } -} - -// TestPerformVMCleanupWithRetries validates retry logic -func TestPerformVMCleanupWithRetries(t *testing.T) { - backend := &mockCleanupBackend{ - deleteLatency: 1 * 
time.Millisecond, - failureRate: 0.3, // Lower failure rate to ensure eventual success - } - service := createBenchmarkVMService(backend) - - ctx := context.Background() - - // Run multiple attempts to test retry logic - successCount := 0 - totalAttempts := 10 - - for i := 0; i < totalAttempts; i++ { - backend.Reset() - vmID := fmt.Sprintf("test-vm-%d", i) - success := service.performVMCleanup(ctx, vmID, "test_retries") - if success { - successCount++ - } - } - - // With 30% failure rate and 3 retries, we should see high success rate - successRate := float64(successCount) / float64(totalAttempts) - if successRate < 0.8 { // Expect at least 80% success - t.Errorf("success rate too low: %.2f (expected >= 0.8)", successRate) - } - - t.Logf("Retry test: %d/%d succeeded (%.1f%%)", successCount, totalAttempts, successRate*100) -} - -// TestPerformVMCleanupFailure validates failure handling -func TestPerformVMCleanupFailure(t *testing.T) { - backend := &mockCleanupBackend{ - deleteLatency: 1 * time.Millisecond, - failureRate: 1.0, // Always fail - } - service := createBenchmarkVMService(backend) - - ctx := context.Background() - success := service.performVMCleanup(ctx, "test-vm", "test_failure") - - // Should fail after all retries - if success { - t.Error("cleanup should have failed") - } - - // Should have made 3 attempts - if backend.GetCallCount() != 3 { - t.Errorf("expected 3 backend calls, got %d", backend.GetCallCount()) - } -} - -// TestPerformVMCleanupGracePeriod validates context handling -func TestPerformVMCleanupGracePeriod(t *testing.T) { - backend := &mockCleanupBackend{ - deleteLatency: 50 * time.Millisecond, // Longer than cancellation - failureRate: 0.0, - } - service := createBenchmarkVMService(backend) - - // Create context that cancels quickly - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) - defer cancel() - - success := service.performVMCleanup(ctx, "test-vm", "test_grace_period") - - // Should still succeed due to 
grace period context - if !success { - t.Error("cleanup should have succeeded despite context cancellation") - } - - if backend.GetCallCount() != 1 { - t.Errorf("expected 1 backend call, got %d", backend.GetCallCount()) - } -} - -// TestPerformVMCleanupConcurrent validates concurrent cleanup safety -func TestPerformVMCleanupConcurrent(t *testing.T) { - backend := &mockCleanupBackend{ - deleteLatency: 10 * time.Millisecond, - failureRate: 0.1, - } - service := createBenchmarkVMService(backend) - - const numGoroutines = 20 - const cleanupPerGoroutine = 5 - - results := make(chan bool, numGoroutines*cleanupPerGoroutine) - - // Launch concurrent cleanups - for i := 0; i < numGoroutines; i++ { - go func(goroutineID int) { - for j := 0; j < cleanupPerGoroutine; j++ { - ctx := context.Background() - vmID := fmt.Sprintf("vm-%d-%d", goroutineID, j) - success := service.performVMCleanup(ctx, vmID, "test_concurrent") - results <- success - } - }(i) - } - - // Collect results - successCount := 0 - totalCount := 0 - for totalCount < numGoroutines*cleanupPerGoroutine { - success := <-results - if success { - successCount++ - } - totalCount++ - } - - // Should have high success rate despite some failures - successRate := float64(successCount) / float64(totalCount) - if successRate < 0.8 { // Allow for some failures due to 10% failure rate - t.Errorf("success rate too low: %.2f", successRate) - } - - t.Logf("Concurrent cleanup test: %d/%d succeeded (%.1f%%), max concurrent: %d", - successCount, totalCount, successRate*100, backend.GetMaxConcurrent()) -} diff --git a/go/deploy/metald/scripts/_schema.sql.preamble b/go/deploy/metald/scripts/_schema.sql.preamble new file mode 100644 index 0000000000..f8320801b8 --- /dev/null +++ b/go/deploy/metald/scripts/_schema.sql.preamble @@ -0,0 +1,37 @@ +-- schema.sql + +CREATE TABLE networks ( + id INTEGER PRIMARY KEY, + base_network TEXT UNIQUE NOT NULL, -- CIDR notation: "10.0.0.16/28" + is_allocated INTEGER NOT NULL DEFAULT 0 +); + +CREATE 
#!/usr/bin/env bash
# Regenerates metald's sqlc inputs: copies the checked-in schema preamble to
# sqlc/schema.sql and generates the network seed SQL via netcalc. Output is
# mirrored to stdout (tee) so the result can be reviewed as the script runs.

set -euo pipefail

# Directory containing this script; quoted so paths with spaces survive.
BASE="$(dirname "$(realpath "$0")")"

# Schema: copy the preamble verbatim.
tee "${BASE}/../sqlc/schema.sql" <"${BASE}/_schema.sql.preamble"

# Seed: split 10.0.0.0/8 into /26 networks. Overwrite rather than append
# (the original used `tee -a`) so rerunning the script cannot accumulate
# duplicate INSERT statements that would violate the UNIQUE constraint on
# networks.base_network.
go run "${BASE}/netcalc.go" 10.0.0.0/8 /26 | tee "${BASE}/../sqlc/networks-seed.sql"
os.Args[0]) + fmt.Fprintf(os.Stderr, " %s 192.168.0.0/16 /24 # Split a /16 into /24 subnets\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " %s 10.0.0.0/22 /27 # Split a /22 into /27 subnets\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, "Output can be piped directly to sqlite3:\n") + fmt.Fprintf(os.Stderr, " %s 10.0.0.0/24 /28 | sqlite3 network.db\n", os.Args[0]) + } + + flag.Parse() + + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + rootCIDR := flag.Arg(0) + subnetSize := flag.Arg(1) + + // Parse the root network + _, rootNet, err := net.ParseCIDR(rootCIDR) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Invalid CIDR notation '%s': %v\n", rootCIDR, err) + os.Exit(1) + } + + // Parse desired subnet size (e.g., "/28" -> 28) + subnetSize = strings.TrimPrefix(subnetSize, "/") + newPrefix, err := strconv.Atoi(subnetSize) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: Invalid subnet size '%s'. Use format like '/28' or '28'\n", flag.Arg(1)) + os.Exit(1) + } + + // Validate subnet size + ones, bits := rootNet.Mask.Size() + if newPrefix <= ones { + fmt.Fprintf(os.Stderr, "Error: Subnet /%d must be smaller than root network /%d\n", newPrefix, ones) + os.Exit(1) + } + if newPrefix > 32 { + fmt.Fprintf(os.Stderr, "Error: Invalid subnet size /%d (maximum is /32)\n", newPrefix) + os.Exit(1) + } + + // Calculate how many subnets we'll create + subnetCount := 1 << (newPrefix - ones) + + // Print info as SQL comments + fmt.Printf("-- Generated network seed\n") + fmt.Printf("-- Splitting %s into %d x /%d subnets\n", rootCIDR, subnetCount, newPrefix) + fmt.Printf("-- Each /%d subnet has %d total IPs (%d usable)\n", + newPrefix, + 1<<(32-newPrefix), + (1<<(32-newPrefix))-2) + fmt.Println() + + // Generate the INSERT statement + fmt.Println("INSERT INTO networks (base_network) VALUES") + + subnets := []string{} + for ip := rootNet.IP.Mask(rootNet.Mask); rootNet.Contains(ip); { + subnet := &net.IPNet{ + IP: ip, + Mask: net.CIDRMask(newPrefix, bits), + } + subnets = 
// ipToInt converts an IPv4 address to its 32-bit integer form in network
// (big-endian) byte order. Non-IPv4 addresses (including nil) map to 0,
// matching the original best-effort behavior; this tool only handles IPv4.
func ipToInt(ip net.IP) uint32 {
	v4 := ip.To4()
	if v4 == nil {
		return 0
	}
	return uint32(v4[0])<<24 | uint32(v4[1])<<16 | uint32(v4[2])<<8 | uint32(v4[3])
}

// intToIP is the inverse of ipToInt: it expands a 32-bit integer into a
// 4-byte net.IP in network (big-endian) order.
func intToIP(n uint32) net.IP {
	return net.IP{byte(n >> 24), byte(n >> 16), byte(n >> 8), byte(n)}
}
# extract_vm_id <output>
# Pulls the first metald VM identifier (pattern: ud-<hex>) out of raw CLI
# output. Prints nothing (and still exits 0, despite pipefail) when no ID
# is present.
extract_vm_id() {
    local raw="$1"
    grep -oE 'ud-[a-f0-9]+' <<<"$raw" | head -n 1 || true
}
# shutdown_vm <vm_id>
# Issues a shutdown via metald-cli; on success records the state transition
# and bumps the operation counter.
shutdown_vm() {
    local vm_id="$1"
    log_info "Shutting down VM: $vm_id"

    # Guard clause: bail out early on CLI failure.
    if ! execute_metald_command $METALD_CLI -docker-image="$DOCKER_IMAGE" shutdown "$vm_id" >/dev/null; then
        log_error "Failed to shutdown VM: $vm_id"
        return 1
    fi

    vm_states["$vm_id"]="shutdown"
    ((total_operations+=1))
    log_success "Shutdown VM: $vm_id"
    return 0
}
# perform_random_operation
# Applies one random lifecycle action (info/shutdown/resume/delete) to a
# randomly chosen active VM. When no VMs are active it creates one instead.
# State-dependent actions are skipped if the VM is in the wrong state.
perform_random_operation() {
    # No active VMs: the only sensible action is to create one.
    if [ ${#active_vms[@]} -eq 0 ]; then
        create_and_boot_vm
        return
    fi

    local ops=("info" "shutdown" "resume" "delete")
    local op="${ops[RANDOM % ${#ops[@]}]}"
    local vm_id="${active_vms[RANDOM % ${#active_vms[@]}]}"
    local current_state="${vm_states[$vm_id]:-unknown}"

    case "$op" in
        "info")
            get_vm_info "$vm_id"
            ;;
        "shutdown")
            # Only a running VM can be shut down.
            if [ "$current_state" = "running" ]; then
                shutdown_vm "$vm_id"
            else
                log_info "VM $vm_id is not running (state: $current_state), skipping shutdown"
            fi
            ;;
        "resume")
            # Only a shutdown VM can be resumed.
            if [ "$current_state" = "shutdown" ]; then
                resume_vm "$vm_id"
            else
                log_info "VM $vm_id is not shutdown (state: $current_state), skipping resume"
            fi
            ;;
        "delete")
            delete_vm "$vm_id"
            ;;
    esac
}
# print_status
# Dumps a summary of the stress test so far: aggregate counters plus the
# tracked state of every active VM.
print_status() {
    log_info "=== VM Stress Test Status ==="
    log_info "Active VMs: ${#active_vms[@]}"
    log_info "Total VMs created: $total_created"
    log_info "Total operations: $total_operations"
    log_info "Errors: $errors"

    if [ ${#active_vms[@]} -gt 0 ]; then
        log_info "Active VM states:"
        local vm_id
        for vm_id in "${active_vms[@]}"; do
            log_info "  $vm_id: ${vm_states[$vm_id]:-unknown}"
        done
    fi
    log_info "=========================="
}
+ cleanup_all_vms + print_status + exit 0 +} + +# Set up signal handlers +trap cleanup_and_exit SIGINT SIGTERM + +# Test VM persistence across metald restarts +# AIDEV-BUSINESS_RULE: VMs should survive metald service restarts like VMware/VirtualBox +run_persistence_test() { + local test_duration="${1:-120}" # Default 2 minutes per phase + log_info "Starting VM persistence test" + log_info "This test will create VMs, restart metald, and verify VMs persist" + + # Phase 1: Create and manage some VMs + log_info "Phase 1: Creating test VMs" + local test_vms=() + local shutdown_vms=() + local running_vms=() + + # Create 5 VMs for testing + for i in {1..5}; do + log_info "Creating test VM $i/5" + local output + if output=$(execute_metald_command $METALD_CLI -docker-image="$DOCKER_IMAGE" create-and-boot); then + local vm_id + vm_id=$(extract_vm_id "$output") + if [ -n "$vm_id" ]; then + test_vms+=("$vm_id") + running_vms+=("$vm_id") + log_success "Created test VM: $vm_id" + sleep 2 + fi + else + log_error "Failed to create test VM $i" + fi + done + + if [ ${#test_vms[@]} -eq 0 ]; then + log_error "No test VMs created, aborting persistence test" + return 1 + fi + + # Shutdown half of the VMs to test SHUTDOWN persistence + local shutdown_count=$((${#test_vms[@]} / 2)) + log_info "Phase 2: Shutting down $shutdown_count VMs for persistence testing" + + for ((i=0; i/dev/null 2>&1; then + log_error "metald-cli not found at: $METALD_CLI" + log_error "Please install metald-cli or set METALD_CLI environment variable" + exit 1 + fi + + if [ "$cleanup_only" = true ]; then + cleanup_all_vms + exit 0 + fi + + if [ "$status_only" = true ]; then + local existing_vms + readarray -t existing_vms < <(list_vms) + log_info "Current VMs:" + for vm_id in "${existing_vms[@]}"; do + if [ -n "$vm_id" ]; then + get_vm_info "$vm_id" || true + fi + done + exit 0 + fi + + if [ "$persistence_test" = true ]; then + run_persistence_test + exit $? 
-- name: PopAvailableIPJSON :one
-- Atomically removes the head of the deployment's available_ips JSON array,
-- provided the array is non-empty.
--
-- NOTE(review): in SQLite, RETURNING reports the row's POST-update values,
-- so json_extract(available_ips, '$[0]') below yields the IP that is *next*
-- in line after the pop — not the IP that was just removed. If callers
-- expect the popped IP, the old head must be read with a separate SELECT
-- before this UPDATE; it cannot be recovered from the updated row. Confirm
-- against the generated-code call sites.
UPDATE network_allocations
SET available_ips = json_remove(available_ips, '$[0]')
WHERE deployment_id = ?
AND json_array_length(available_ips) > 0
RETURNING json_extract(available_ips, '$[0]') as allocated_ip, id;
-- name: GetAvailableIPCount :one
-- Number of IPs still unassigned for a deployment (length of the JSON array).
SELECT json_array_length(available_ips) as count
FROM network_allocations
WHERE deployment_id = ?;

-- name: ReleaseNetwork :exec
-- Marks a base network as free so AllocateNetwork can hand it out again.
UPDATE networks
SET is_allocated = 0
WHERE id = ?;

-- name: DeleteIPAllocationsForNetwork :exec
-- Removes every per-VM IP row tied to one network allocation; run before
-- DeleteNetworkAllocation to satisfy the foreign key reference.
DELETE FROM ip_allocations
WHERE network_allocation_id = ?;

-- name: DeleteNetworkAllocation :exec
-- Drops the deployment's network allocation record itself.
DELETE FROM network_allocations
WHERE deployment_id = ?;

-- name: GetNetworkStats :one
-- One-row snapshot of pool usage: total/free networks, live deployments,
-- and VM-level IP assignments.
SELECT
    (SELECT COUNT(*) FROM networks) as total_networks,
    (SELECT COUNT(*) FROM networks WHERE is_allocated = 0) as available_networks,
    (SELECT COUNT(*) FROM network_allocations) as active_deployments,
    (SELECT COUNT(*) FROM ip_allocations) as allocated_ips;
# sqlc code-generation config for metald's network allocation store.
version: "2"
sql:
  - engine: "sqlite"
    queries: "queries.sql"
    schema: "schema.sql"
    gen:
      go:
        # Generated code lands in the internal/database package.
        package: "database"
        out: "../internal/database"
        emit_db_tags: true # add `db:"..."` struct tags
        emit_json_tags: true # add `json:"..."` struct tags
        emit_interface: true # emit a Querier interface (useful for mocking)
@@ func NewClientTenantForwardingInterceptor(logger *slog.Logger) connect.UnaryInte logger.LogAttrs(ctx, slog.LevelDebug, "forwarding tenant context", slog.String("tenant_id", tenantCtx.TenantID), - slog.String("customer_id", tenantCtx.CustomerID), + slog.String("project_id", tenantCtx.ProjectID), + slog.String("environment_id", tenantCtx.EnvironmentID), slog.String("procedure", req.Spec().Procedure), ) } @@ -105,7 +109,8 @@ func NewClientMetricsInterceptor(serviceName string, logger *slog.Logger) connec if tenantCtx, ok := TenantFromContext(ctx); ok && tenantCtx.TenantID != "" { span.SetAttributes( attribute.String("tenant.id", tenantCtx.TenantID), - attribute.String("tenant.customer_id", tenantCtx.CustomerID), + attribute.String("tenant.project_id", tenantCtx.ProjectID), + attribute.String("tenant.environment_id", tenantCtx.EnvironmentID), ) } diff --git a/go/deploy/pkg/observability/interceptors/logging.go b/go/deploy/pkg/observability/interceptors/logging.go index 073163d77b..2dbbb8e5a3 100644 --- a/go/deploy/pkg/observability/interceptors/logging.go +++ b/go/deploy/pkg/observability/interceptors/logging.go @@ -15,8 +15,6 @@ import ( // NewLoggingInterceptor creates a ConnectRPC interceptor that provides structured logging // for all RPC calls, including request/response details, timing, and error information. 
-// -// AIDEV-NOTE: This interceptor provides consistent logging across all Unkey services func NewLoggingInterceptor(opts ...Option) connect.UnaryInterceptorFunc { options := applyOptions(opts) @@ -28,7 +26,6 @@ func NewLoggingInterceptor(opts ...Option) connect.UnaryInterceptorFunc { return func(next connect.UnaryFunc) connect.UnaryFunc { return func(ctx context.Context, req connect.AnyRequest) (resp connect.AnyResponse, err error) { - // AIDEV-NOTE: Panic recovery in logging interceptor for defense in depth // Preserves existing errors and logs panic details for debugging defer func() { if r := recover(); r != nil { @@ -66,20 +63,15 @@ func NewLoggingInterceptor(opts ...Option) connect.UnaryInterceptorFunc { slog.String("procedure", procedure), slog.String("method", methodName), slog.String("protocol", req.Peer().Protocol), - slog.String("peer_addr", req.Peer().Addr), slog.String("trace_id", traceID), } - // Add user agent if present - if userAgent := req.Header().Get("User-Agent"); userAgent != "" { - requestAttrs = append(requestAttrs, slog.String("user_agent", userAgent)) - } - // Add tenant info if available if tenantCtx, ok := TenantFromContext(ctx); ok && tenantCtx.TenantID != "" { requestAttrs = append(requestAttrs, slog.String("tenant_id", tenantCtx.TenantID), - slog.String("customer_id", tenantCtx.CustomerID), + // slog.String("project_id", tenantCtx.ProjectID), + // slog.String("environment_id", tenantCtx.EnvironmentID), ) } @@ -90,7 +82,7 @@ func NewLoggingInterceptor(opts ...Option) connect.UnaryInterceptorFunc { func() { defer func() { if r := recover(); r != nil { - err = connect.NewError(connect.CodeInternal, fmt.Errorf("handler panic: %v", r)) + err = connect.NewError(connect.CodeInternal, fmt.Errorf("logging handler panic: %v", r)) } }() resp, err = next(ctx, req) @@ -111,6 +103,8 @@ func NewLoggingInterceptor(opts ...Option) connect.UnaryInterceptorFunc { if tenantCtx, ok := TenantFromContext(ctx); ok && tenantCtx.TenantID != "" { responseAttrs 
= append(responseAttrs, slog.String("tenant_id", tenantCtx.TenantID), + // slog.String("project_id", tenantCtx.ProjectID), + // slog.String("environment_id", tenantCtx.EnvironmentID), ) } diff --git a/go/deploy/pkg/observability/interceptors/metrics.go b/go/deploy/pkg/observability/interceptors/metrics.go index 0d98d8478e..58b158a671 100644 --- a/go/deploy/pkg/observability/interceptors/metrics.go +++ b/go/deploy/pkg/observability/interceptors/metrics.go @@ -127,7 +127,8 @@ func NewMetricsInterceptor(opts ...Option) connect.UnaryInterceptorFunc { if tenantCtx, ok := TenantFromContext(ctx); ok && tenantCtx.TenantID != "" { span.SetAttributes( attribute.String("tenant.id", tenantCtx.TenantID), - attribute.String("tenant.customer_id", tenantCtx.CustomerID), + // attribute.String("tenant.project_id", tenantCtx.ProjectID), + // attribute.String("tenant.environment_id", tenantCtx.EnvironmentID), ) } @@ -181,7 +182,7 @@ func NewMetricsInterceptor(opts ...Option) connect.UnaryInterceptorFunc { func() { defer func() { if r := recover(); r != nil { - err = connect.NewError(connect.CodeInternal, fmt.Errorf("handler panic: %v", r)) + err = connect.NewError(connect.CodeInternal, fmt.Errorf("metrics handler panic: %v", r)) span.RecordError(err) } }() diff --git a/go/deploy/pkg/observability/interceptors/tenant.go b/go/deploy/pkg/observability/interceptors/tenant.go index 311f079a12..8995676b94 100644 --- a/go/deploy/pkg/observability/interceptors/tenant.go +++ b/go/deploy/pkg/observability/interceptors/tenant.go @@ -13,10 +13,14 @@ import ( // TenantContext holds tenant authentication information extracted from request headers. type TenantContext struct { - // TenantID is the unique identifier for the tenant. + // TenantID is the unique identifier for the tenant/customer. TenantID string - // CustomerID is the unique identifier for the customer. - CustomerID string + // ProjectID is the unique identifier for the project. 
+ ProjectID string + // EnvironmentID is the environment within the project + EnvironmentID string + // UserID is the user ID making the request + UserID string // AuthToken is the authentication token provided in the request. AuthToken string } @@ -24,7 +28,12 @@ type TenantContext struct { // contextKey is a private type for context keys to avoid collisions. type contextKey string -const tenantContextKey contextKey = "tenant_auth" +const ( + tenantContextKey contextKey = "tenant_id" + projectContextKey contextKey = "project_id" + environmentContextKey contextKey = "environment_id" + userContextKey contextKey = "user_id" +) // WithTenantContext adds tenant authentication context to the context. func WithTenantContext(ctx context.Context, auth TenantContext) context.Context { @@ -48,7 +57,7 @@ func NewTenantAuthInterceptor(opts ...Option) connect.UnaryInterceptorFunc { return func(next connect.UnaryFunc) connect.UnaryFunc { return func(ctx context.Context, req connect.AnyRequest) (resp connect.AnyResponse, err error) { - // AIDEV-NOTE: Panic recovery in tenant auth interceptor prevents auth failures from crashing the service + // Panic recovery in tenant auth interceptor prevents auth failures from crashing the service defer func() { if r := recover(); r != nil { if options.Logger != nil { @@ -68,7 +77,9 @@ func NewTenantAuthInterceptor(opts ...Option) connect.UnaryInterceptorFunc { // Extract tenant information from headers tenantID := req.Header().Get("X-Tenant-ID") - customerID := req.Header().Get("X-Customer-ID") + projectID := req.Header().Get("X-Project-ID") + environmentID := req.Header().Get("X-Environment-ID") + userID := req.Header().Get("X-User-ID") authToken := req.Header().Get("Authorization") // Log request with tenant info if logger is available @@ -77,16 +88,20 @@ func NewTenantAuthInterceptor(opts ...Option) connect.UnaryInterceptorFunc { slog.String("service", options.ServiceName), slog.String("procedure", req.Spec().Procedure), 
slog.String("tenant_id", tenantID), - slog.String("customer_id", customerID), + slog.String("project_id", projectID), + slog.String("environment_id", environmentID), + slog.String("user_id", userID), slog.Bool("has_auth_token", authToken != ""), ) } // Add tenant context to the request context tenantCtx := TenantContext{ - TenantID: tenantID, - CustomerID: customerID, - AuthToken: authToken, + TenantID: tenantID, + ProjectID: projectID, + EnvironmentID: environmentID, + UserID: userID, + AuthToken: authToken, } ctx = WithTenantContext(ctx, tenantCtx) @@ -94,7 +109,9 @@ func NewTenantAuthInterceptor(opts ...Option) connect.UnaryInterceptorFunc { if span := trace.SpanFromContext(ctx); span.SpanContext().IsValid() { span.SetAttributes( attribute.String("tenant.id", tenantID), - attribute.String("tenant.customer_id", customerID), + attribute.String("tenant.project_id", projectID), + attribute.String("tenant.environment_id", environmentID), + attribute.String("user.id", userID), attribute.Bool("tenant.authenticated", tenantID != ""), ) } @@ -114,22 +131,20 @@ func NewTenantAuthInterceptor(opts ...Option) connect.UnaryInterceptorFunc { } } + // THIS IS PROBABLY WHERE UNKEY OR SOMETHING DOES AUTH LOL + // Log successful tenant authentication if options.Logger != nil && tenantID != "" { options.Logger.LogAttrs(ctx, slog.LevelDebug, "tenant authenticated", slog.String("service", options.ServiceName), slog.String("tenant_id", tenantID), - slog.String("customer_id", customerID), + slog.String("project_id", projectID), + slog.String("environment_id", environmentID), + slog.String("user_id", userID), slog.String("procedure", req.Spec().Procedure), ) } - // AIDEV-TODO: Add actual token validation logic here when auth service is available - // This would involve: - // 1. Validating the auth token with an auth service - // 2. Checking tenant permissions for the requested procedure - // 3. 
// GetSubnetForWorkspace deterministically maps a workspace ID onto one of
// the 65536 possible /28 subnets inside 172.16.0.0/12 and returns it in
// CIDR notation, e.g. "172.23.30.80/28". The mapping is the FNV-1a hash of
// the ID masked to 16 bits (the same value GetSubnetIndex computes, inlined
// here), then scaled by the 16 addresses each /28 occupies.
func GetSubnetForWorkspace(workspaceID string) string {
	// 16-bit subnet index derived from the workspace ID (0-65535).
	h := fnv.New32a()
	h.Write([]byte(workspaceID))
	idx := h.Sum32() & 0xFFFF

	// Byte offset of this subnet from 172.16.0.0: each /28 spans 16 IPs.
	off := idx * 16

	// Decompose the offset into dotted-quad components. The /12 base gives
	// 20 variable bits: 4 land in the second octet (16-31), 8 in the third,
	// and 8 in the fourth.
	fourth := off & 0xFF
	third := (off >> 8) & 0xFF
	second := 16 + (off >> 16)

	return fmt.Sprintf("172.%d.%d.%d/28", second, third, fourth)
}
/etc/os-release not found.${NC}" - exit 1 - fi -} - -# Print check result -check_result() { - local check_name=$1 - local result=$2 - local message=$3 - - if [ "$result" -eq 0 ]; then - echo -e "${GREEN}✓${NC} $check_name: $message" - else - echo -e "${RED}✗${NC} $check_name: $message" - ((ERRORS++)) - fi -} - -# Print warning -check_warning() { - local check_name=$1 - local message=$2 - echo -e "${YELLOW}⚠${NC} $check_name: $message" - ((WARNINGS++)) -} - -# Check if running as root or with sudo -check_sudo() { - if [ "$EUID" -ne 0 ] && ! sudo -n true 2>/dev/null; then - check_result "Sudo Access" 1 "Script must be run as root or with sudo privileges" - else - check_result "Sudo Access" 0 "Sufficient privileges available" - fi -} - -# Check systemd -check_systemd() { - if command -v systemctl &> /dev/null && systemctl --version &> /dev/null; then - check_result "systemd" 0 "systemd is installed" - else - check_result "systemd" 1 "systemd is required but not found" - fi -} - -# Check Go version -check_go() { - if command -v go &> /dev/null; then - GO_VERSION=$(go version | awk '{print $3}' | sed 's/go//') - REQUIRED_VERSION="1.24" - - if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$GO_VERSION" | sort -V | head -n1)" = "$REQUIRED_VERSION" ]; then - check_result "Go Version" 0 "Go $GO_VERSION installed (requires >= $REQUIRED_VERSION)" - else - check_result "Go Version" 1 "Go $GO_VERSION installed but version >= $REQUIRED_VERSION required" - fi - else - check_result "Go Version" 1 "Go is not installed (requires >= 1.24)" - fi -} - -# Check Make -check_make() { - if command -v make &> /dev/null; then - check_result "Make" 0 "Make is installed" - else - check_result "Make" 1 "Make is required but not found" - fi -} - -# Check Git -check_git() { - if command -v git &> /dev/null; then - check_result "Git" 0 "Git is installed" - else - check_result "Git" 1 "Git is required but not found" - fi -} - -# Check Docker/Podman (for builderd and observability) 
-check_container_runtime() { - local docker_found=false - local podman_found=false - - if command -v docker &> /dev/null; then - docker_found=true - if docker info &> /dev/null; then - check_result "Docker" 0 "Docker is installed and running" - # Check for docker compose - if docker compose version &> /dev/null; then - check_result "Docker Compose" 0 "Docker Compose plugin is available" - else - check_warning "Docker Compose" "Docker Compose plugin not found (required for SPIRE quickstart)" - fi - else - check_warning "Docker" "Docker is installed but not running or accessible" - fi - fi - - if command -v podman &> /dev/null; then - podman_found=true - if podman info &> /dev/null; then - check_result "Podman" 0 "Podman is installed and running" - else - check_warning "Podman" "Podman is installed but not running or accessible" - fi - fi - - if [ "$docker_found" = false ] && [ "$podman_found" = false ]; then - check_warning "Container Runtime" "Neither Docker nor Podman found (required for builderd service and observability stack)" - fi -} - -# Check Firecracker (for metald) -check_firecracker() { - local fc_found=false - - if command -v firecracker &> /dev/null; then - echo "nope!!" - fc_found=true - check_result "Firecracker" 0 "Firecracker is installed" - fi -} - -# Check KVM support -check_kvm() { - if [ -e /dev/kvm ]; then - if [ -r /dev/kvm ] && [ -w /dev/kvm ]; then - check_result "KVM" 0 "KVM is available and accessible" - else - check_warning "KVM" "KVM exists but may not be accessible to current user (required for metald)" - fi - else - check_warning "KVM" "/dev/kvm not found - virtualization may not be enabled (required for metald)" - fi -} - -# Check required tools for the build process -check_build_tools() { - local tools=("curl" "wget" "tar" "gzip") - local missing=() - - for tool in "${tools[@]}"; do - if ! 
command -v "$tool" &> /dev/null; then - missing+=("$tool") - fi - done - - if [ ${#missing[@]} -eq 0 ]; then - check_result "Build Tools" 0 "All build tools are installed" - else - check_result "Build Tools" 1 "Missing tools: ${missing[*]}" - fi -} - -# Check buf for protobuf generation -check_buf() { - if command -v buf &> /dev/null; then - check_result "Buf" 0 "Buf is installed ($(buf --version))" - else - check_result "Buf" 1 "Buf is required for protobuf generation but not found" - echo " To install buf:" - echo " # Using the install script (recommended):" - echo " curl -sSL https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64 -o /tmp/buf" - echo " sudo install -m 755 /tmp/buf /usr/local/bin/buf" - echo "" - echo " # Or via Go:" - echo " go install github.com/bufbuild/buf/cmd/buf@latest" - fi -} - -# Check disk space (at least 5GB free) -check_disk_space() { - AVAILABLE_SPACE=$(df -BG . | awk 'NR==2 {print $4}' | sed 's/G//') - if [ "$AVAILABLE_SPACE" -ge 5 ]; then - check_result "Disk Space" 0 "${AVAILABLE_SPACE}GB available (requires >= 5GB)" - else - check_result "Disk Space" 1 "${AVAILABLE_SPACE}GB available (requires >= 5GB)" - fi -} - -# Check network connectivity -check_network() { - if ping -c 1 -W 2 github.com &> /dev/null; then - check_result "Network" 0 "Network connectivity confirmed" - else - check_warning "Network" "Cannot reach github.com - network issues may prevent dependency downloads" - fi -} - -# Check for conflicting services -check_port_availability() { - local ports=("8080" "8081" "8082" "8083" "9464" "9465" "9466") - local conflicts=() - - for port in "${ports[@]}"; do - if ss -tlnp 2>/dev/null | grep -q ":$port "; then - conflicts+=("$port") - fi - done - - if [ ${#conflicts[@]} -eq 0 ]; then - check_result "Port Availability" 0 "All required ports are available" - else - check_warning "Port Availability" "Ports already in use: ${conflicts[*]}" - fi -} - -# Check cgroup version -check_cgroup_version() { - if [ -f 
/sys/fs/cgroup/cgroup.controllers ]; then - check_result "Cgroup" 0 "cgroup v2 is active" - else - check_result "Cgroup" 1 "cgroup v2 is required but not active" - echo " To enable cgroup v2:" - echo " sudo grubby --update-kernel=ALL --args='systemd.unified_cgroup_hierarchy=1'" - echo " Then reboot your system" - fi -} - -# Main execution -main() { - echo "===================================" - echo "Unkey Services System Readiness Check" - echo "===================================" - echo - - detect_os - echo "Detected OS: $OS $VER" - echo - - # Verify supported OS - case "$OS" in - "Fedora Linux") - if [ "$VER" -lt 40 ]; then - check_warning "OS Version" "Fedora $VER detected. Fedora 42 or later recommended" - else - check_result "OS Version" 0 "Fedora $VER is supported" - fi - ;; - "Ubuntu") - if [ "${VER%%.*}" -lt 22 ]; then - check_warning "OS Version" "Ubuntu $VER detected. Ubuntu 22.04 or later recommended" - else - check_result "OS Version" 0 "Ubuntu $VER is supported" - fi - ;; - *) - check_warning "OS Version" "$OS is not officially tested. Fedora 42 or Ubuntu 22.04+ recommended" - ;; - esac - - echo - echo "Checking system requirements..." - echo "--------------------------------" - - # Core requirements - check_sudo - check_systemd - check_go - check_make - check_git - check_buf - check_build_tools - check_disk_space - check_network - - echo - echo "Checking service-specific requirements..." - echo "-----------------------------------------" - - # Service-specific requirements - check_container_runtime - check_firecracker - check_kvm - check_cgroup_version - check_port_availability - - echo - echo "===================================" - echo "Summary:" - echo "-----------------------------------" - - if [ $ERRORS -eq 0 ]; then - if [ $WARNINGS -eq 0 ]; then - echo -e "${GREEN}✓ System is ready for deployment!${NC}" - echo "All requirements are met." 
- else - echo -e "${GREEN}✓ System meets minimum requirements.${NC}" - echo -e "${YELLOW} $WARNINGS warning(s) found - some services may have limited functionality.${NC}" - fi - echo - echo "You can proceed with the installation." - exit 0 - else - echo -e "${RED}✗ System is not ready for deployment.${NC}" - echo " $ERRORS error(s) found that must be resolved." - [ $WARNINGS -gt 0 ] && echo " $WARNINGS warning(s) found." - echo - echo "Please resolve the errors before proceeding." - exit 1 - fi -} - -# Run main function -main "$@" diff --git a/go/deploy/scripts/format-go.sh b/go/deploy/scripts/format-go.sh index 2bb120f07d..7b36318b6b 100755 --- a/go/deploy/scripts/format-go.sh +++ b/go/deploy/scripts/format-go.sh @@ -25,7 +25,7 @@ echo "Processing Go file: ${file_path}" # Step 1: Format with gofmt echo "Running gofmt..." -if gofmt -w "${file_path}"; then +if gofumpt -w "${file_path}"; then echo "✓ gofmt completed" else echo "✗ gofmt failed" diff --git a/go/deploy/scripts/install-buf.sh b/go/deploy/scripts/install-buf.sh index af22228101..b09b1b66c7 100755 --- a/go/deploy/scripts/install-buf.sh +++ b/go/deploy/scripts/install-buf.sh @@ -5,7 +5,7 @@ set -euo pipefail # Configuration -BUF_VERSION="${BUF_VERSION:-v1.55.1}" +BUF_VERSION="${BUF_VERSION:-v1.57.0}" ARCH="${ARCH:-$(uname -m)}" OS="${OS:-$(uname -s)}" INSTALL_DIR="/usr/local/bin" diff --git a/go/deploy/scripts/install-firecracker.sh b/go/deploy/scripts/install-firecracker.sh index 3919fc77a7..3da2282b27 100755 --- a/go/deploy/scripts/install-firecracker.sh +++ b/go/deploy/scripts/install-firecracker.sh @@ -1,11 +1,10 @@ #!/bin/bash # Install or uninstall Firecracker and Jailer from GitHub releases -# AIDEV-NOTE: Installs both firecracker and jailer binaries which are required for production deployments set -euo pipefail # Configuration -FIRECRACKER_VERSION="${FIRECRACKER_VERSION:-v1.12.1}" +FIRECRACKER_VERSION="${FIRECRACKER_VERSION:-v1.13.0}" ARCH="${ARCH:-x86_64}" INSTALL_DIR="/usr/local/bin" diff --git 
a/go/deploy/spire/Makefile b/go/deploy/spire/Makefile index 82f4236858..8237c8352a 100644 --- a/go/deploy/spire/Makefile +++ b/go/deploy/spire/Makefile @@ -2,7 +2,7 @@ # Installs SPIRE server and agent as systemd services # Variables -SPIRE_VERSION ?= 1.12.2 +SPIRE_VERSION ?= 1.13.0 SPIRE_ARCH ?= linux-amd64-musl SPIRE_URL = https://github.com/spiffe/spire/releases/download/v$(SPIRE_VERSION)/spire-$(SPIRE_VERSION)-$(SPIRE_ARCH).tar.gz SPIRE_INSTALL_DIR = /opt/spire @@ -264,4 +264,4 @@ uninstall-server: ## Uninstall SPIRE server @sudo rm -f /etc/systemd/system/spire-server.service @sudo rm -f $(SPIRE_INSTALL_DIR)/bin/spire-server @sudo systemctl daemon-reload - @echo "✓ SPIRE server uninstalled (data preserved)" \ No newline at end of file + @echo "✓ SPIRE server uninstalled (data preserved)" diff --git a/go/gen/proto/metal/vmprovisioner/v1/vmprovisioner.pb.go b/go/gen/proto/metal/vmprovisioner/v1/vmprovisioner.pb.go new file mode 100644 index 0000000000..f349f0dc39 --- /dev/null +++ b/go/gen/proto/metal/vmprovisioner/v1/vmprovisioner.pb.go @@ -0,0 +1,2864 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.7 +// protoc (unknown) +// source: proto/metal/vmprovisioner/v1/vmprovisioner.proto + +package vmprovisionerv1 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// VM lifecycle states +type VmState int32 + +const ( + VmState_VM_STATE_UNSPECIFIED VmState = 0 + VmState_VM_STATE_CREATED VmState = 1 + VmState_VM_STATE_RUNNING VmState = 2 + VmState_VM_STATE_PAUSED VmState = 3 + VmState_VM_STATE_SHUTDOWN VmState = 4 +) + +// Enum value maps for VmState. +var ( + VmState_name = map[int32]string{ + 0: "VM_STATE_UNSPECIFIED", + 1: "VM_STATE_CREATED", + 2: "VM_STATE_RUNNING", + 3: "VM_STATE_PAUSED", + 4: "VM_STATE_SHUTDOWN", + } + VmState_value = map[string]int32{ + "VM_STATE_UNSPECIFIED": 0, + "VM_STATE_CREATED": 1, + "VM_STATE_RUNNING": 2, + "VM_STATE_PAUSED": 3, + "VM_STATE_SHUTDOWN": 4, + } +) + +func (x VmState) Enum() *VmState { + p := new(VmState) + *p = x + return p +} + +func (x VmState) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (VmState) Descriptor() protoreflect.EnumDescriptor { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes[0].Descriptor() +} + +func (VmState) Type() protoreflect.EnumType { + return &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes[0] +} + +func (x VmState) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use VmState.Descriptor instead. +func (VmState) EnumDescriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{0} +} + +// Network mode for the interface +type NetworkMode int32 + +const ( + NetworkMode_NETWORK_MODE_UNSPECIFIED NetworkMode = 0 + NetworkMode_NETWORK_MODE_DUAL_STACK NetworkMode = 1 // Both IPv4 and IPv6 + NetworkMode_NETWORK_MODE_IPV4_ONLY NetworkMode = 2 // IPv4 only + NetworkMode_NETWORK_MODE_IPV6_ONLY NetworkMode = 3 // IPv6 only +) + +// Enum value maps for NetworkMode. 
+var ( + NetworkMode_name = map[int32]string{ + 0: "NETWORK_MODE_UNSPECIFIED", + 1: "NETWORK_MODE_DUAL_STACK", + 2: "NETWORK_MODE_IPV4_ONLY", + 3: "NETWORK_MODE_IPV6_ONLY", + } + NetworkMode_value = map[string]int32{ + "NETWORK_MODE_UNSPECIFIED": 0, + "NETWORK_MODE_DUAL_STACK": 1, + "NETWORK_MODE_IPV4_ONLY": 2, + "NETWORK_MODE_IPV6_ONLY": 3, + } +) + +func (x NetworkMode) Enum() *NetworkMode { + p := new(NetworkMode) + *p = x + return p +} + +func (x NetworkMode) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (NetworkMode) Descriptor() protoreflect.EnumDescriptor { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes[1].Descriptor() +} + +func (NetworkMode) Type() protoreflect.EnumType { + return &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes[1] +} + +func (x NetworkMode) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use NetworkMode.Descriptor instead. 
+func (NetworkMode) EnumDescriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{1} +} + +// Unified VM configuration that works across different hypervisors +type VmConfig struct { + state protoimpl.MessageState `protogen:"open.v1"` + // CPU configuration + Cpu *CpuConfig `protobuf:"bytes,1,opt,name=cpu,proto3" json:"cpu,omitempty"` + // Memory configuration + Memory *MemoryConfig `protobuf:"bytes,2,opt,name=memory,proto3" json:"memory,omitempty"` + // Boot configuration + Boot *BootConfig `protobuf:"bytes,3,opt,name=boot,proto3" json:"boot,omitempty"` + // Storage devices + Storage []*StorageDevice `protobuf:"bytes,4,rep,name=storage,proto3" json:"storage,omitempty"` + // Network interfaces + Network []*NetworkInterface `protobuf:"bytes,5,rep,name=network,proto3" json:"network,omitempty"` + // Console configuration + Console *ConsoleConfig `protobuf:"bytes,6,opt,name=console,proto3" json:"console,omitempty"` + // Metadata and labels + Metadata map[string]string `protobuf:"bytes,7,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *VmConfig) Reset() { + *x = VmConfig{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *VmConfig) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*VmConfig) ProtoMessage() {} + +func (x *VmConfig) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VmConfig.ProtoReflect.Descriptor instead. 
+func (*VmConfig) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{0} +} + +func (x *VmConfig) GetCpu() *CpuConfig { + if x != nil { + return x.Cpu + } + return nil +} + +func (x *VmConfig) GetMemory() *MemoryConfig { + if x != nil { + return x.Memory + } + return nil +} + +func (x *VmConfig) GetBoot() *BootConfig { + if x != nil { + return x.Boot + } + return nil +} + +func (x *VmConfig) GetStorage() []*StorageDevice { + if x != nil { + return x.Storage + } + return nil +} + +func (x *VmConfig) GetNetwork() []*NetworkInterface { + if x != nil { + return x.Network + } + return nil +} + +func (x *VmConfig) GetConsole() *ConsoleConfig { + if x != nil { + return x.Console + } + return nil +} + +func (x *VmConfig) GetMetadata() map[string]string { + if x != nil { + return x.Metadata + } + return nil +} + +type CpuConfig struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Number of virtual CPUs to allocate at boot + VcpuCount int32 `protobuf:"varint,1,opt,name=vcpu_count,json=vcpuCount,proto3" json:"vcpu_count,omitempty"` + // Maximum number of virtual CPUs (for hotplug) + MaxVcpuCount int32 `protobuf:"varint,2,opt,name=max_vcpu_count,json=maxVcpuCount,proto3" json:"max_vcpu_count,omitempty"` + // CPU topology (optional) + Topology *CpuTopology `protobuf:"bytes,3,opt,name=topology,proto3" json:"topology,omitempty"` + // CPU features and model (backend-specific) + Features map[string]string `protobuf:"bytes,4,rep,name=features,proto3" json:"features,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CpuConfig) Reset() { + *x = CpuConfig{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CpuConfig) String() string { + return 
protoimpl.X.MessageStringOf(x) +} + +func (*CpuConfig) ProtoMessage() {} + +func (x *CpuConfig) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CpuConfig.ProtoReflect.Descriptor instead. +func (*CpuConfig) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{1} +} + +func (x *CpuConfig) GetVcpuCount() int32 { + if x != nil { + return x.VcpuCount + } + return 0 +} + +func (x *CpuConfig) GetMaxVcpuCount() int32 { + if x != nil { + return x.MaxVcpuCount + } + return 0 +} + +func (x *CpuConfig) GetTopology() *CpuTopology { + if x != nil { + return x.Topology + } + return nil +} + +func (x *CpuConfig) GetFeatures() map[string]string { + if x != nil { + return x.Features + } + return nil +} + +type CpuTopology struct { + state protoimpl.MessageState `protogen:"open.v1"` + Sockets int32 `protobuf:"varint,1,opt,name=sockets,proto3" json:"sockets,omitempty"` + CoresPerSocket int32 `protobuf:"varint,2,opt,name=cores_per_socket,json=coresPerSocket,proto3" json:"cores_per_socket,omitempty"` + ThreadsPerCore int32 `protobuf:"varint,3,opt,name=threads_per_core,json=threadsPerCore,proto3" json:"threads_per_core,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CpuTopology) Reset() { + *x = CpuTopology{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CpuTopology) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CpuTopology) ProtoMessage() {} + +func (x *CpuTopology) ProtoReflect() protoreflect.Message { + mi := 
&file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CpuTopology.ProtoReflect.Descriptor instead. +func (*CpuTopology) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{2} +} + +func (x *CpuTopology) GetSockets() int32 { + if x != nil { + return x.Sockets + } + return 0 +} + +func (x *CpuTopology) GetCoresPerSocket() int32 { + if x != nil { + return x.CoresPerSocket + } + return 0 +} + +func (x *CpuTopology) GetThreadsPerCore() int32 { + if x != nil { + return x.ThreadsPerCore + } + return 0 +} + +type MemoryConfig struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Memory size in bytes + SizeBytes int64 `protobuf:"varint,1,opt,name=size_bytes,json=sizeBytes,proto3" json:"size_bytes,omitempty"` + // Whether memory hotplug is enabled + HotplugEnabled bool `protobuf:"varint,2,opt,name=hotplug_enabled,json=hotplugEnabled,proto3" json:"hotplug_enabled,omitempty"` + // Maximum memory size for hotplug (bytes) + MaxSizeBytes int64 `protobuf:"varint,3,opt,name=max_size_bytes,json=maxSizeBytes,proto3" json:"max_size_bytes,omitempty"` + // Memory backing options (hugepages, etc.) 
+ Backing map[string]string `protobuf:"bytes,4,rep,name=backing,proto3" json:"backing,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *MemoryConfig) Reset() { + *x = MemoryConfig{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *MemoryConfig) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*MemoryConfig) ProtoMessage() {} + +func (x *MemoryConfig) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use MemoryConfig.ProtoReflect.Descriptor instead. +func (*MemoryConfig) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{3} +} + +func (x *MemoryConfig) GetSizeBytes() int64 { + if x != nil { + return x.SizeBytes + } + return 0 +} + +func (x *MemoryConfig) GetHotplugEnabled() bool { + if x != nil { + return x.HotplugEnabled + } + return false +} + +func (x *MemoryConfig) GetMaxSizeBytes() int64 { + if x != nil { + return x.MaxSizeBytes + } + return 0 +} + +func (x *MemoryConfig) GetBacking() map[string]string { + if x != nil { + return x.Backing + } + return nil +} + +type BootConfig struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Path to kernel image + KernelPath string `protobuf:"bytes,1,opt,name=kernel_path,json=kernelPath,proto3" json:"kernel_path,omitempty"` + // Path to initial ramdisk (optional) + InitrdPath string `protobuf:"bytes,2,opt,name=initrd_path,json=initrdPath,proto3" json:"initrd_path,omitempty"` + // Kernel command line arguments + KernelArgs 
string `protobuf:"bytes,3,opt,name=kernel_args,json=kernelArgs,proto3" json:"kernel_args,omitempty"` + // Boot order and options + BootOptions map[string]string `protobuf:"bytes,4,rep,name=boot_options,json=bootOptions,proto3" json:"boot_options,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BootConfig) Reset() { + *x = BootConfig{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BootConfig) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BootConfig) ProtoMessage() {} + +func (x *BootConfig) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BootConfig.ProtoReflect.Descriptor instead. 
+func (*BootConfig) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{4} +} + +func (x *BootConfig) GetKernelPath() string { + if x != nil { + return x.KernelPath + } + return "" +} + +func (x *BootConfig) GetInitrdPath() string { + if x != nil { + return x.InitrdPath + } + return "" +} + +func (x *BootConfig) GetKernelArgs() string { + if x != nil { + return x.KernelArgs + } + return "" +} + +func (x *BootConfig) GetBootOptions() map[string]string { + if x != nil { + return x.BootOptions + } + return nil +} + +type StorageDevice struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Unique identifier for this storage device + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // Path to the backing file or block device + Path string `protobuf:"bytes,2,opt,name=path,proto3" json:"path,omitempty"` + // Whether this device is read-only + ReadOnly bool `protobuf:"varint,3,opt,name=read_only,json=readOnly,proto3" json:"read_only,omitempty"` + // Whether this is the root/boot device + IsRootDevice bool `protobuf:"varint,4,opt,name=is_root_device,json=isRootDevice,proto3" json:"is_root_device,omitempty"` + // Storage interface type (virtio-blk, nvme, etc.) 
+ InterfaceType string `protobuf:"bytes,5,opt,name=interface_type,json=interfaceType,proto3" json:"interface_type,omitempty"` + // Additional storage options + Options map[string]string `protobuf:"bytes,6,rep,name=options,proto3" json:"options,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *StorageDevice) Reset() { + *x = StorageDevice{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *StorageDevice) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*StorageDevice) ProtoMessage() {} + +func (x *StorageDevice) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use StorageDevice.ProtoReflect.Descriptor instead. 
+func (*StorageDevice) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{5} +} + +func (x *StorageDevice) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *StorageDevice) GetPath() string { + if x != nil { + return x.Path + } + return "" +} + +func (x *StorageDevice) GetReadOnly() bool { + if x != nil { + return x.ReadOnly + } + return false +} + +func (x *StorageDevice) GetIsRootDevice() bool { + if x != nil { + return x.IsRootDevice + } + return false +} + +func (x *StorageDevice) GetInterfaceType() string { + if x != nil { + return x.InterfaceType + } + return "" +} + +func (x *StorageDevice) GetOptions() map[string]string { + if x != nil { + return x.Options + } + return nil +} + +type NetworkInterface struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Unique identifier for this network interface + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + // MAC address (optional, will be generated if not provided) + MacAddress string `protobuf:"bytes,2,opt,name=mac_address,json=macAddress,proto3" json:"mac_address,omitempty"` + // Host-side TAP device name + TapDevice string `protobuf:"bytes,3,opt,name=tap_device,json=tapDevice,proto3" json:"tap_device,omitempty"` + // Network interface type (virtio-net, e1000, etc.) 
+ InterfaceType string `protobuf:"bytes,4,opt,name=interface_type,json=interfaceType,proto3" json:"interface_type,omitempty"` + // Additional network options + Options map[string]string `protobuf:"bytes,5,rep,name=options,proto3" json:"options,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + // IPv4 configuration (optional) + Ipv4Config *IPv4Config `protobuf:"bytes,6,opt,name=ipv4_config,json=ipv4Config,proto3" json:"ipv4_config,omitempty"` + // IPv6 configuration (optional) + Ipv6Config *IPv6Config `protobuf:"bytes,7,opt,name=ipv6_config,json=ipv6Config,proto3" json:"ipv6_config,omitempty"` + // Network mode + Mode NetworkMode `protobuf:"varint,8,opt,name=mode,proto3,enum=metal.vmprovisioner.v1.NetworkMode" json:"mode,omitempty"` + // Rate limiting + RxRateLimit *RateLimit `protobuf:"bytes,10,opt,name=rx_rate_limit,json=rxRateLimit,proto3" json:"rx_rate_limit,omitempty"` // Receive rate limit + TxRateLimit *RateLimit `protobuf:"bytes,11,opt,name=tx_rate_limit,json=txRateLimit,proto3" json:"tx_rate_limit,omitempty"` // Transmit rate limit + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *NetworkInterface) Reset() { + *x = NetworkInterface{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *NetworkInterface) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NetworkInterface) ProtoMessage() {} + +func (x *NetworkInterface) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[6] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NetworkInterface.ProtoReflect.Descriptor instead. 
+func (*NetworkInterface) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{6} +} + +func (x *NetworkInterface) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *NetworkInterface) GetMacAddress() string { + if x != nil { + return x.MacAddress + } + return "" +} + +func (x *NetworkInterface) GetTapDevice() string { + if x != nil { + return x.TapDevice + } + return "" +} + +func (x *NetworkInterface) GetInterfaceType() string { + if x != nil { + return x.InterfaceType + } + return "" +} + +func (x *NetworkInterface) GetOptions() map[string]string { + if x != nil { + return x.Options + } + return nil +} + +func (x *NetworkInterface) GetIpv4Config() *IPv4Config { + if x != nil { + return x.Ipv4Config + } + return nil +} + +func (x *NetworkInterface) GetIpv6Config() *IPv6Config { + if x != nil { + return x.Ipv6Config + } + return nil +} + +func (x *NetworkInterface) GetMode() NetworkMode { + if x != nil { + return x.Mode + } + return NetworkMode_NETWORK_MODE_UNSPECIFIED +} + +func (x *NetworkInterface) GetRxRateLimit() *RateLimit { + if x != nil { + return x.RxRateLimit + } + return nil +} + +func (x *NetworkInterface) GetTxRateLimit() *RateLimit { + if x != nil { + return x.TxRateLimit + } + return nil +} + +// IPv4 network configuration +type IPv4Config struct { + state protoimpl.MessageState `protogen:"open.v1"` + Address string `protobuf:"bytes,1,opt,name=address,proto3" json:"address,omitempty"` // IPv4 address (e.g., "10.100.1.2") + Netmask string `protobuf:"bytes,2,opt,name=netmask,proto3" json:"netmask,omitempty"` // Network mask (e.g., "255.255.255.0") + Gateway string `protobuf:"bytes,3,opt,name=gateway,proto3" json:"gateway,omitempty"` // Default gateway + DnsServers []string `protobuf:"bytes,4,rep,name=dns_servers,json=dnsServers,proto3" json:"dns_servers,omitempty"` // DNS servers + Dhcp bool `protobuf:"varint,5,opt,name=dhcp,proto3" json:"dhcp,omitempty"` // Use 
DHCP instead of static config + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IPv4Config) Reset() { + *x = IPv4Config{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IPv4Config) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IPv4Config) ProtoMessage() {} + +func (x *IPv4Config) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[7] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IPv4Config.ProtoReflect.Descriptor instead. +func (*IPv4Config) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{7} +} + +func (x *IPv4Config) GetAddress() string { + if x != nil { + return x.Address + } + return "" +} + +func (x *IPv4Config) GetNetmask() string { + if x != nil { + return x.Netmask + } + return "" +} + +func (x *IPv4Config) GetGateway() string { + if x != nil { + return x.Gateway + } + return "" +} + +func (x *IPv4Config) GetDnsServers() []string { + if x != nil { + return x.DnsServers + } + return nil +} + +func (x *IPv4Config) GetDhcp() bool { + if x != nil { + return x.Dhcp + } + return false +} + +// IPv6 network configuration +type IPv6Config struct { + state protoimpl.MessageState `protogen:"open.v1"` + Address string `protobuf:"bytes,1,opt,name=address,proto3" json:"address,omitempty"` // IPv6 address (e.g., "fd00::1:2") + PrefixLength int32 `protobuf:"varint,2,opt,name=prefix_length,json=prefixLength,proto3" json:"prefix_length,omitempty"` // Prefix length (e.g., 64) + Gateway string `protobuf:"bytes,3,opt,name=gateway,proto3" json:"gateway,omitempty"` // Default gateway + DnsServers []string 
`protobuf:"bytes,4,rep,name=dns_servers,json=dnsServers,proto3" json:"dns_servers,omitempty"` // DNS servers (IPv6 addresses) + Slaac bool `protobuf:"varint,5,opt,name=slaac,proto3" json:"slaac,omitempty"` // Use SLAAC (Stateless Address Autoconfiguration) + PrivacyExtensions bool `protobuf:"varint,6,opt,name=privacy_extensions,json=privacyExtensions,proto3" json:"privacy_extensions,omitempty"` // Enable privacy extensions + LinkLocal string `protobuf:"bytes,7,opt,name=link_local,json=linkLocal,proto3" json:"link_local,omitempty"` // Link-local address (auto-generated if empty) + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IPv6Config) Reset() { + *x = IPv6Config{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IPv6Config) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IPv6Config) ProtoMessage() {} + +func (x *IPv6Config) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[8] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IPv6Config.ProtoReflect.Descriptor instead. 
+func (*IPv6Config) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{8} +} + +func (x *IPv6Config) GetAddress() string { + if x != nil { + return x.Address + } + return "" +} + +func (x *IPv6Config) GetPrefixLength() int32 { + if x != nil { + return x.PrefixLength + } + return 0 +} + +func (x *IPv6Config) GetGateway() string { + if x != nil { + return x.Gateway + } + return "" +} + +func (x *IPv6Config) GetDnsServers() []string { + if x != nil { + return x.DnsServers + } + return nil +} + +func (x *IPv6Config) GetSlaac() bool { + if x != nil { + return x.Slaac + } + return false +} + +func (x *IPv6Config) GetPrivacyExtensions() bool { + if x != nil { + return x.PrivacyExtensions + } + return false +} + +func (x *IPv6Config) GetLinkLocal() string { + if x != nil { + return x.LinkLocal + } + return "" +} + +// Rate limiting configuration +type RateLimit struct { + state protoimpl.MessageState `protogen:"open.v1"` + Bandwidth int64 `protobuf:"varint,1,opt,name=bandwidth,proto3" json:"bandwidth,omitempty"` // Bandwidth in bytes/second + RefillTime int64 `protobuf:"varint,2,opt,name=refill_time,json=refillTime,proto3" json:"refill_time,omitempty"` // Token bucket refill time in milliseconds + Burst int64 `protobuf:"varint,3,opt,name=burst,proto3" json:"burst,omitempty"` // Burst size in bytes + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *RateLimit) Reset() { + *x = RateLimit{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[9] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RateLimit) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RateLimit) ProtoMessage() {} + +func (x *RateLimit) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[9] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if 
ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RateLimit.ProtoReflect.Descriptor instead. +func (*RateLimit) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{9} +} + +func (x *RateLimit) GetBandwidth() int64 { + if x != nil { + return x.Bandwidth + } + return 0 +} + +func (x *RateLimit) GetRefillTime() int64 { + if x != nil { + return x.RefillTime + } + return 0 +} + +func (x *RateLimit) GetBurst() int64 { + if x != nil { + return x.Burst + } + return 0 +} + +type ConsoleConfig struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Whether console is enabled + Enabled bool `protobuf:"varint,1,opt,name=enabled,proto3" json:"enabled,omitempty"` + // Console output destination (file path, pty, etc.) + Output string `protobuf:"bytes,2,opt,name=output,proto3" json:"output,omitempty"` + // Console input source (optional) + Input string `protobuf:"bytes,3,opt,name=input,proto3" json:"input,omitempty"` + // Console type (serial, virtio-console, etc.) 
+ ConsoleType string `protobuf:"bytes,4,opt,name=console_type,json=consoleType,proto3" json:"console_type,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ConsoleConfig) Reset() { + *x = ConsoleConfig{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[10] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ConsoleConfig) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ConsoleConfig) ProtoMessage() {} + +func (x *ConsoleConfig) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[10] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ConsoleConfig.ProtoReflect.Descriptor instead. +func (*ConsoleConfig) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{10} +} + +func (x *ConsoleConfig) GetEnabled() bool { + if x != nil { + return x.Enabled + } + return false +} + +func (x *ConsoleConfig) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +func (x *ConsoleConfig) GetInput() string { + if x != nil { + return x.Input + } + return "" +} + +func (x *ConsoleConfig) GetConsoleType() string { + if x != nil { + return x.ConsoleType + } + return "" +} + +// Request/Response messages +type CreateVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Unique identifier for the VM (optional, will be generated if not provided) + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + // VM configuration + Config *VmConfig `protobuf:"bytes,2,opt,name=config,proto3" json:"config,omitempty"` + // Customer identifier for billing and isolation + CustomerId string 
`protobuf:"bytes,3,opt,name=customer_id,json=customerId,proto3" json:"customer_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CreateVmRequest) Reset() { + *x = CreateVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CreateVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CreateVmRequest) ProtoMessage() {} + +func (x *CreateVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[11] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CreateVmRequest.ProtoReflect.Descriptor instead. +func (*CreateVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{11} +} + +func (x *CreateVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *CreateVmRequest) GetConfig() *VmConfig { + if x != nil { + return x.Config + } + return nil +} + +func (x *CreateVmRequest) GetCustomerId() string { + if x != nil { + return x.CustomerId + } + return "" +} + +type CreateVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Assigned VM identifier + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + // Current VM state after creation + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CreateVmResponse) Reset() { + *x = CreateVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[12] + ms := 
protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CreateVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CreateVmResponse) ProtoMessage() {} + +func (x *CreateVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[12] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CreateVmResponse.ProtoReflect.Descriptor instead. +func (*CreateVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{12} +} + +func (x *CreateVmResponse) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *CreateVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type DeleteVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + // Whether to force deletion even if VM is running + Force bool `protobuf:"varint,2,opt,name=force,proto3" json:"force,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *DeleteVmRequest) Reset() { + *x = DeleteVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[13] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *DeleteVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeleteVmRequest) ProtoMessage() {} + +func (x *DeleteVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[13] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + 
ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeleteVmRequest.ProtoReflect.Descriptor instead. +func (*DeleteVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{13} +} + +func (x *DeleteVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *DeleteVmRequest) GetForce() bool { + if x != nil { + return x.Force + } + return false +} + +type DeleteVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *DeleteVmResponse) Reset() { + *x = DeleteVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *DeleteVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeleteVmResponse) ProtoMessage() {} + +func (x *DeleteVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[14] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeleteVmResponse.ProtoReflect.Descriptor instead. 
+func (*DeleteVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{14} +} + +func (x *DeleteVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +type BootVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BootVmRequest) Reset() { + *x = BootVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BootVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BootVmRequest) ProtoMessage() {} + +func (x *BootVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[15] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BootVmRequest.ProtoReflect.Descriptor instead. 
+func (*BootVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{15} +} + +func (x *BootVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +type BootVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BootVmResponse) Reset() { + *x = BootVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[16] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BootVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BootVmResponse) ProtoMessage() {} + +func (x *BootVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[16] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BootVmResponse.ProtoReflect.Descriptor instead. 
+func (*BootVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{16} +} + +func (x *BootVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *BootVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type ShutdownVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + // Whether to force shutdown (vs graceful) + Force bool `protobuf:"varint,2,opt,name=force,proto3" json:"force,omitempty"` + // Timeout for graceful shutdown (seconds) + TimeoutSeconds int32 `protobuf:"varint,3,opt,name=timeout_seconds,json=timeoutSeconds,proto3" json:"timeout_seconds,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ShutdownVmRequest) Reset() { + *x = ShutdownVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[17] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ShutdownVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ShutdownVmRequest) ProtoMessage() {} + +func (x *ShutdownVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[17] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ShutdownVmRequest.ProtoReflect.Descriptor instead. 
+func (*ShutdownVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{17} +} + +func (x *ShutdownVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *ShutdownVmRequest) GetForce() bool { + if x != nil { + return x.Force + } + return false +} + +func (x *ShutdownVmRequest) GetTimeoutSeconds() int32 { + if x != nil { + return x.TimeoutSeconds + } + return 0 +} + +type ShutdownVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ShutdownVmResponse) Reset() { + *x = ShutdownVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[18] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ShutdownVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ShutdownVmResponse) ProtoMessage() {} + +func (x *ShutdownVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[18] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ShutdownVmResponse.ProtoReflect.Descriptor instead. 
+func (*ShutdownVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{18} +} + +func (x *ShutdownVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *ShutdownVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type PauseVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *PauseVmRequest) Reset() { + *x = PauseVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[19] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *PauseVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PauseVmRequest) ProtoMessage() {} + +func (x *PauseVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[19] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PauseVmRequest.ProtoReflect.Descriptor instead. 
+func (*PauseVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{19} +} + +func (x *PauseVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +type PauseVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *PauseVmResponse) Reset() { + *x = PauseVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[20] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *PauseVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PauseVmResponse) ProtoMessage() {} + +func (x *PauseVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[20] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PauseVmResponse.ProtoReflect.Descriptor instead. 
+func (*PauseVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{20} +} + +func (x *PauseVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *PauseVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type ResumeVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ResumeVmRequest) Reset() { + *x = ResumeVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[21] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ResumeVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResumeVmRequest) ProtoMessage() {} + +func (x *ResumeVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[21] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResumeVmRequest.ProtoReflect.Descriptor instead. 
+func (*ResumeVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{21} +} + +func (x *ResumeVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +type ResumeVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ResumeVmResponse) Reset() { + *x = ResumeVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[22] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ResumeVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResumeVmResponse) ProtoMessage() {} + +func (x *ResumeVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[22] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResumeVmResponse.ProtoReflect.Descriptor instead. 
+func (*ResumeVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{22} +} + +func (x *ResumeVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *ResumeVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type RebootVmRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + // Whether to force reboot (vs graceful) + Force bool `protobuf:"varint,2,opt,name=force,proto3" json:"force,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *RebootVmRequest) Reset() { + *x = RebootVmRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[23] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RebootVmRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RebootVmRequest) ProtoMessage() {} + +func (x *RebootVmRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[23] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RebootVmRequest.ProtoReflect.Descriptor instead. 
+func (*RebootVmRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{23} +} + +func (x *RebootVmRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *RebootVmRequest) GetForce() bool { + if x != nil { + return x.Force + } + return false +} + +type RebootVmResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Success bool `protobuf:"varint,1,opt,name=success,proto3" json:"success,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *RebootVmResponse) Reset() { + *x = RebootVmResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[24] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RebootVmResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RebootVmResponse) ProtoMessage() {} + +func (x *RebootVmResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[24] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RebootVmResponse.ProtoReflect.Descriptor instead. 
+func (*RebootVmResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{24} +} + +func (x *RebootVmResponse) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *RebootVmResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +type GetVmInfoRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetVmInfoRequest) Reset() { + *x = GetVmInfoRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[25] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetVmInfoRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetVmInfoRequest) ProtoMessage() {} + +func (x *GetVmInfoRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[25] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetVmInfoRequest.ProtoReflect.Descriptor instead. 
+func (*GetVmInfoRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{25} +} + +func (x *GetVmInfoRequest) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +type GetVmInfoResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + Config *VmConfig `protobuf:"bytes,2,opt,name=config,proto3" json:"config,omitempty"` + State VmState `protobuf:"varint,3,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + Metrics *VmMetrics `protobuf:"bytes,4,opt,name=metrics,proto3" json:"metrics,omitempty"` + // Backend-specific information + BackendInfo map[string]string `protobuf:"bytes,5,rep,name=backend_info,json=backendInfo,proto3" json:"backend_info,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + // Network information if available + NetworkInfo *VmNetworkInfo `protobuf:"bytes,6,opt,name=network_info,json=networkInfo,proto3" json:"network_info,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetVmInfoResponse) Reset() { + *x = GetVmInfoResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[26] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetVmInfoResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetVmInfoResponse) ProtoMessage() {} + +func (x *GetVmInfoResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[26] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetVmInfoResponse.ProtoReflect.Descriptor instead. 
+func (*GetVmInfoResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{26} +} + +func (x *GetVmInfoResponse) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *GetVmInfoResponse) GetConfig() *VmConfig { + if x != nil { + return x.Config + } + return nil +} + +func (x *GetVmInfoResponse) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +func (x *GetVmInfoResponse) GetMetrics() *VmMetrics { + if x != nil { + return x.Metrics + } + return nil +} + +func (x *GetVmInfoResponse) GetBackendInfo() map[string]string { + if x != nil { + return x.BackendInfo + } + return nil +} + +func (x *GetVmInfoResponse) GetNetworkInfo() *VmNetworkInfo { + if x != nil { + return x.NetworkInfo + } + return nil +} + +// Port mapping for VM network forwarding +type PortMapping struct { + state protoimpl.MessageState `protogen:"open.v1"` + ContainerPort int32 `protobuf:"varint,1,opt,name=container_port,json=containerPort,proto3" json:"container_port,omitempty"` // Port inside the VM + HostPort int32 `protobuf:"varint,2,opt,name=host_port,json=hostPort,proto3" json:"host_port,omitempty"` // Port on the host system + Protocol string `protobuf:"bytes,3,opt,name=protocol,proto3" json:"protocol,omitempty"` // Protocol (tcp, udp) + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *PortMapping) Reset() { + *x = PortMapping{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[27] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *PortMapping) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PortMapping) ProtoMessage() {} + +func (x *PortMapping) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[27] + if x != nil { + ms := 
protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PortMapping.ProtoReflect.Descriptor instead. +func (*PortMapping) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{27} +} + +func (x *PortMapping) GetContainerPort() int32 { + if x != nil { + return x.ContainerPort + } + return 0 +} + +func (x *PortMapping) GetHostPort() int32 { + if x != nil { + return x.HostPort + } + return 0 +} + +func (x *PortMapping) GetProtocol() string { + if x != nil { + return x.Protocol + } + return "" +} + +// Network information for a VM +type VmNetworkInfo struct { + state protoimpl.MessageState `protogen:"open.v1"` + IpAddress string `protobuf:"bytes,1,opt,name=ip_address,json=ipAddress,proto3" json:"ip_address,omitempty"` + MacAddress string `protobuf:"bytes,2,opt,name=mac_address,json=macAddress,proto3" json:"mac_address,omitempty"` + TapDevice string `protobuf:"bytes,3,opt,name=tap_device,json=tapDevice,proto3" json:"tap_device,omitempty"` + NetworkNamespace string `protobuf:"bytes,4,opt,name=network_namespace,json=networkNamespace,proto3" json:"network_namespace,omitempty"` + Gateway string `protobuf:"bytes,5,opt,name=gateway,proto3" json:"gateway,omitempty"` + DnsServers []string `protobuf:"bytes,6,rep,name=dns_servers,json=dnsServers,proto3" json:"dns_servers,omitempty"` + PortMappings []*PortMapping `protobuf:"bytes,7,rep,name=port_mappings,json=portMappings,proto3" json:"port_mappings,omitempty"` // Port forwards from host to VM + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *VmNetworkInfo) Reset() { + *x = VmNetworkInfo{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[28] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *VmNetworkInfo) String() string { + return 
protoimpl.X.MessageStringOf(x) +} + +func (*VmNetworkInfo) ProtoMessage() {} + +func (x *VmNetworkInfo) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[28] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VmNetworkInfo.ProtoReflect.Descriptor instead. +func (*VmNetworkInfo) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{28} +} + +func (x *VmNetworkInfo) GetIpAddress() string { + if x != nil { + return x.IpAddress + } + return "" +} + +func (x *VmNetworkInfo) GetMacAddress() string { + if x != nil { + return x.MacAddress + } + return "" +} + +func (x *VmNetworkInfo) GetTapDevice() string { + if x != nil { + return x.TapDevice + } + return "" +} + +func (x *VmNetworkInfo) GetNetworkNamespace() string { + if x != nil { + return x.NetworkNamespace + } + return "" +} + +func (x *VmNetworkInfo) GetGateway() string { + if x != nil { + return x.Gateway + } + return "" +} + +func (x *VmNetworkInfo) GetDnsServers() []string { + if x != nil { + return x.DnsServers + } + return nil +} + +func (x *VmNetworkInfo) GetPortMappings() []*PortMapping { + if x != nil { + return x.PortMappings + } + return nil +} + +type VmMetrics struct { + state protoimpl.MessageState `protogen:"open.v1"` + // CPU usage percentage (0-100) + CpuUsagePercent float64 `protobuf:"fixed64,1,opt,name=cpu_usage_percent,json=cpuUsagePercent,proto3" json:"cpu_usage_percent,omitempty"` + // Memory usage in bytes + MemoryUsageBytes int64 `protobuf:"varint,2,opt,name=memory_usage_bytes,json=memoryUsageBytes,proto3" json:"memory_usage_bytes,omitempty"` + // Network I/O statistics + NetworkStats *NetworkStats `protobuf:"bytes,3,opt,name=network_stats,json=networkStats,proto3" json:"network_stats,omitempty"` + // Storage I/O statistics 
+ StorageStats *StorageStats `protobuf:"bytes,4,opt,name=storage_stats,json=storageStats,proto3" json:"storage_stats,omitempty"` + // VM uptime in seconds + UptimeSeconds int64 `protobuf:"varint,5,opt,name=uptime_seconds,json=uptimeSeconds,proto3" json:"uptime_seconds,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *VmMetrics) Reset() { + *x = VmMetrics{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[29] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *VmMetrics) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*VmMetrics) ProtoMessage() {} + +func (x *VmMetrics) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[29] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VmMetrics.ProtoReflect.Descriptor instead. 
+func (*VmMetrics) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{29} +} + +func (x *VmMetrics) GetCpuUsagePercent() float64 { + if x != nil { + return x.CpuUsagePercent + } + return 0 +} + +func (x *VmMetrics) GetMemoryUsageBytes() int64 { + if x != nil { + return x.MemoryUsageBytes + } + return 0 +} + +func (x *VmMetrics) GetNetworkStats() *NetworkStats { + if x != nil { + return x.NetworkStats + } + return nil +} + +func (x *VmMetrics) GetStorageStats() *StorageStats { + if x != nil { + return x.StorageStats + } + return nil +} + +func (x *VmMetrics) GetUptimeSeconds() int64 { + if x != nil { + return x.UptimeSeconds + } + return 0 +} + +type NetworkStats struct { + state protoimpl.MessageState `protogen:"open.v1"` + BytesReceived int64 `protobuf:"varint,1,opt,name=bytes_received,json=bytesReceived,proto3" json:"bytes_received,omitempty"` + BytesTransmitted int64 `protobuf:"varint,2,opt,name=bytes_transmitted,json=bytesTransmitted,proto3" json:"bytes_transmitted,omitempty"` + PacketsReceived int64 `protobuf:"varint,3,opt,name=packets_received,json=packetsReceived,proto3" json:"packets_received,omitempty"` + PacketsTransmitted int64 `protobuf:"varint,4,opt,name=packets_transmitted,json=packetsTransmitted,proto3" json:"packets_transmitted,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *NetworkStats) Reset() { + *x = NetworkStats{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[30] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *NetworkStats) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NetworkStats) ProtoMessage() {} + +func (x *NetworkStats) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[30] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if 
ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NetworkStats.ProtoReflect.Descriptor instead. +func (*NetworkStats) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{30} +} + +func (x *NetworkStats) GetBytesReceived() int64 { + if x != nil { + return x.BytesReceived + } + return 0 +} + +func (x *NetworkStats) GetBytesTransmitted() int64 { + if x != nil { + return x.BytesTransmitted + } + return 0 +} + +func (x *NetworkStats) GetPacketsReceived() int64 { + if x != nil { + return x.PacketsReceived + } + return 0 +} + +func (x *NetworkStats) GetPacketsTransmitted() int64 { + if x != nil { + return x.PacketsTransmitted + } + return 0 +} + +type StorageStats struct { + state protoimpl.MessageState `protogen:"open.v1"` + BytesRead int64 `protobuf:"varint,1,opt,name=bytes_read,json=bytesRead,proto3" json:"bytes_read,omitempty"` + BytesWritten int64 `protobuf:"varint,2,opt,name=bytes_written,json=bytesWritten,proto3" json:"bytes_written,omitempty"` + ReadOperations int64 `protobuf:"varint,3,opt,name=read_operations,json=readOperations,proto3" json:"read_operations,omitempty"` + WriteOperations int64 `protobuf:"varint,4,opt,name=write_operations,json=writeOperations,proto3" json:"write_operations,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *StorageStats) Reset() { + *x = StorageStats{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[31] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *StorageStats) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*StorageStats) ProtoMessage() {} + +func (x *StorageStats) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[31] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) 
+ if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use StorageStats.ProtoReflect.Descriptor instead. +func (*StorageStats) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{31} +} + +func (x *StorageStats) GetBytesRead() int64 { + if x != nil { + return x.BytesRead + } + return 0 +} + +func (x *StorageStats) GetBytesWritten() int64 { + if x != nil { + return x.BytesWritten + } + return 0 +} + +func (x *StorageStats) GetReadOperations() int64 { + if x != nil { + return x.ReadOperations + } + return 0 +} + +func (x *StorageStats) GetWriteOperations() int64 { + if x != nil { + return x.WriteOperations + } + return 0 +} + +type ListVmsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Optional filter by state + StateFilter []VmState `protobuf:"varint,1,rep,packed,name=state_filter,json=stateFilter,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state_filter,omitempty"` + // Pagination + PageSize int32 `protobuf:"varint,2,opt,name=page_size,json=pageSize,proto3" json:"page_size,omitempty"` + PageToken string `protobuf:"bytes,3,opt,name=page_token,json=pageToken,proto3" json:"page_token,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ListVmsRequest) Reset() { + *x = ListVmsRequest{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[32] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ListVmsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListVmsRequest) ProtoMessage() {} + +func (x *ListVmsRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[32] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms 
+ } + return mi.MessageOf(x) +} + +// Deprecated: Use ListVmsRequest.ProtoReflect.Descriptor instead. +func (*ListVmsRequest) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{32} +} + +func (x *ListVmsRequest) GetStateFilter() []VmState { + if x != nil { + return x.StateFilter + } + return nil +} + +func (x *ListVmsRequest) GetPageSize() int32 { + if x != nil { + return x.PageSize + } + return 0 +} + +func (x *ListVmsRequest) GetPageToken() string { + if x != nil { + return x.PageToken + } + return "" +} + +type ListVmsResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Vms []*VmInfo `protobuf:"bytes,1,rep,name=vms,proto3" json:"vms,omitempty"` + NextPageToken string `protobuf:"bytes,2,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"` + TotalCount int32 `protobuf:"varint,3,opt,name=total_count,json=totalCount,proto3" json:"total_count,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ListVmsResponse) Reset() { + *x = ListVmsResponse{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[33] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ListVmsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ListVmsResponse) ProtoMessage() {} + +func (x *ListVmsResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[33] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ListVmsResponse.ProtoReflect.Descriptor instead. 
+func (*ListVmsResponse) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{33} +} + +func (x *ListVmsResponse) GetVms() []*VmInfo { + if x != nil { + return x.Vms + } + return nil +} + +func (x *ListVmsResponse) GetNextPageToken() string { + if x != nil { + return x.NextPageToken + } + return "" +} + +func (x *ListVmsResponse) GetTotalCount() int32 { + if x != nil { + return x.TotalCount + } + return 0 +} + +type VmInfo struct { + state protoimpl.MessageState `protogen:"open.v1"` + VmId string `protobuf:"bytes,1,opt,name=vm_id,json=vmId,proto3" json:"vm_id,omitempty"` + State VmState `protobuf:"varint,2,opt,name=state,proto3,enum=metal.vmprovisioner.v1.VmState" json:"state,omitempty"` + // Basic config info (subset of full config) + VcpuCount int32 `protobuf:"varint,3,opt,name=vcpu_count,json=vcpuCount,proto3" json:"vcpu_count,omitempty"` + MemorySizeBytes int64 `protobuf:"varint,4,opt,name=memory_size_bytes,json=memorySizeBytes,proto3" json:"memory_size_bytes,omitempty"` + // Creation and modification timestamps + CreatedTimestamp int64 `protobuf:"varint,5,opt,name=created_timestamp,json=createdTimestamp,proto3" json:"created_timestamp,omitempty"` + ModifiedTimestamp int64 `protobuf:"varint,6,opt,name=modified_timestamp,json=modifiedTimestamp,proto3" json:"modified_timestamp,omitempty"` + // Metadata + Metadata map[string]string `protobuf:"bytes,7,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + // Customer identifier + CustomerId string `protobuf:"bytes,8,opt,name=customer_id,json=customerId,proto3" json:"customer_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *VmInfo) Reset() { + *x = VmInfo{} + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[34] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x 
*VmInfo) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*VmInfo) ProtoMessage() {} + +func (x *VmInfo) ProtoReflect() protoreflect.Message { + mi := &file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes[34] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use VmInfo.ProtoReflect.Descriptor instead. +func (*VmInfo) Descriptor() ([]byte, []int) { + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP(), []int{34} +} + +func (x *VmInfo) GetVmId() string { + if x != nil { + return x.VmId + } + return "" +} + +func (x *VmInfo) GetState() VmState { + if x != nil { + return x.State + } + return VmState_VM_STATE_UNSPECIFIED +} + +func (x *VmInfo) GetVcpuCount() int32 { + if x != nil { + return x.VcpuCount + } + return 0 +} + +func (x *VmInfo) GetMemorySizeBytes() int64 { + if x != nil { + return x.MemorySizeBytes + } + return 0 +} + +func (x *VmInfo) GetCreatedTimestamp() int64 { + if x != nil { + return x.CreatedTimestamp + } + return 0 +} + +func (x *VmInfo) GetModifiedTimestamp() int64 { + if x != nil { + return x.ModifiedTimestamp + } + return 0 +} + +func (x *VmInfo) GetMetadata() map[string]string { + if x != nil { + return x.Metadata + } + return nil +} + +func (x *VmInfo) GetCustomerId() string { + if x != nil { + return x.CustomerId + } + return "" +} + +var File_proto_metal_vmprovisioner_v1_vmprovisioner_proto protoreflect.FileDescriptor + +const file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDesc = "" + + "\n" + + "0proto/metal/vmprovisioner/v1/vmprovisioner.proto\x12\x16metal.vmprovisioner.v1\"\x84\x04\n" + + "\bVmConfig\x123\n" + + "\x03cpu\x18\x01 \x01(\v2!.metal.vmprovisioner.v1.CpuConfigR\x03cpu\x12<\n" + + "\x06memory\x18\x02 \x01(\v2$.metal.vmprovisioner.v1.MemoryConfigR\x06memory\x126\n" + + "\x04boot\x18\x03 
\x01(\v2\".metal.vmprovisioner.v1.BootConfigR\x04boot\x12?\n" + + "\astorage\x18\x04 \x03(\v2%.metal.vmprovisioner.v1.StorageDeviceR\astorage\x12B\n" + + "\anetwork\x18\x05 \x03(\v2(.metal.vmprovisioner.v1.NetworkInterfaceR\anetwork\x12?\n" + + "\aconsole\x18\x06 \x01(\v2%.metal.vmprovisioner.v1.ConsoleConfigR\aconsole\x12J\n" + + "\bmetadata\x18\a \x03(\v2..metal.vmprovisioner.v1.VmConfig.MetadataEntryR\bmetadata\x1a;\n" + + "\rMetadataEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x9b\x02\n" + + "\tCpuConfig\x12\x1d\n" + + "\n" + + "vcpu_count\x18\x01 \x01(\x05R\tvcpuCount\x12$\n" + + "\x0emax_vcpu_count\x18\x02 \x01(\x05R\fmaxVcpuCount\x12?\n" + + "\btopology\x18\x03 \x01(\v2#.metal.vmprovisioner.v1.CpuTopologyR\btopology\x12K\n" + + "\bfeatures\x18\x04 \x03(\v2/.metal.vmprovisioner.v1.CpuConfig.FeaturesEntryR\bfeatures\x1a;\n" + + "\rFeaturesEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"{\n" + + "\vCpuTopology\x12\x18\n" + + "\asockets\x18\x01 \x01(\x05R\asockets\x12(\n" + + "\x10cores_per_socket\x18\x02 \x01(\x05R\x0ecoresPerSocket\x12(\n" + + "\x10threads_per_core\x18\x03 \x01(\x05R\x0ethreadsPerCore\"\x85\x02\n" + + "\fMemoryConfig\x12\x1d\n" + + "\n" + + "size_bytes\x18\x01 \x01(\x03R\tsizeBytes\x12'\n" + + "\x0fhotplug_enabled\x18\x02 \x01(\bR\x0ehotplugEnabled\x12$\n" + + "\x0emax_size_bytes\x18\x03 \x01(\x03R\fmaxSizeBytes\x12K\n" + + "\abacking\x18\x04 \x03(\v21.metal.vmprovisioner.v1.MemoryConfig.BackingEntryR\abacking\x1a:\n" + + "\fBackingEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x87\x02\n" + + "\n" + + "BootConfig\x12\x1f\n" + + "\vkernel_path\x18\x01 \x01(\tR\n" + + "kernelPath\x12\x1f\n" + + "\vinitrd_path\x18\x02 \x01(\tR\n" + + "initrdPath\x12\x1f\n" + + "\vkernel_args\x18\x03 \x01(\tR\n" + + "kernelArgs\x12V\n" + + 
"\fboot_options\x18\x04 \x03(\v23.metal.vmprovisioner.v1.BootConfig.BootOptionsEntryR\vbootOptions\x1a>\n" + + "\x10BootOptionsEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xa7\x02\n" + + "\rStorageDevice\x12\x0e\n" + + "\x02id\x18\x01 \x01(\tR\x02id\x12\x12\n" + + "\x04path\x18\x02 \x01(\tR\x04path\x12\x1b\n" + + "\tread_only\x18\x03 \x01(\bR\breadOnly\x12$\n" + + "\x0eis_root_device\x18\x04 \x01(\bR\fisRootDevice\x12%\n" + + "\x0einterface_type\x18\x05 \x01(\tR\rinterfaceType\x12L\n" + + "\aoptions\x18\x06 \x03(\v22.metal.vmprovisioner.v1.StorageDevice.OptionsEntryR\aoptions\x1a:\n" + + "\fOptionsEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\xe7\x04\n" + + "\x10NetworkInterface\x12\x0e\n" + + "\x02id\x18\x01 \x01(\tR\x02id\x12\x1f\n" + + "\vmac_address\x18\x02 \x01(\tR\n" + + "macAddress\x12\x1d\n" + + "\n" + + "tap_device\x18\x03 \x01(\tR\ttapDevice\x12%\n" + + "\x0einterface_type\x18\x04 \x01(\tR\rinterfaceType\x12O\n" + + "\aoptions\x18\x05 \x03(\v25.metal.vmprovisioner.v1.NetworkInterface.OptionsEntryR\aoptions\x12C\n" + + "\vipv4_config\x18\x06 \x01(\v2\".metal.vmprovisioner.v1.IPv4ConfigR\n" + + "ipv4Config\x12C\n" + + "\vipv6_config\x18\a \x01(\v2\".metal.vmprovisioner.v1.IPv6ConfigR\n" + + "ipv6Config\x127\n" + + "\x04mode\x18\b \x01(\x0e2#.metal.vmprovisioner.v1.NetworkModeR\x04mode\x12E\n" + + "\rrx_rate_limit\x18\n" + + " \x01(\v2!.metal.vmprovisioner.v1.RateLimitR\vrxRateLimit\x12E\n" + + "\rtx_rate_limit\x18\v \x01(\v2!.metal.vmprovisioner.v1.RateLimitR\vtxRateLimit\x1a:\n" + + "\fOptionsEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x8f\x01\n" + + "\n" + + "IPv4Config\x12\x18\n" + + "\aaddress\x18\x01 \x01(\tR\aaddress\x12\x18\n" + + "\anetmask\x18\x02 \x01(\tR\anetmask\x12\x18\n" + + "\agateway\x18\x03 \x01(\tR\agateway\x12\x1f\n" + + 
"\vdns_servers\x18\x04 \x03(\tR\n" + + "dnsServers\x12\x12\n" + + "\x04dhcp\x18\x05 \x01(\bR\x04dhcp\"\xea\x01\n" + + "\n" + + "IPv6Config\x12\x18\n" + + "\aaddress\x18\x01 \x01(\tR\aaddress\x12#\n" + + "\rprefix_length\x18\x02 \x01(\x05R\fprefixLength\x12\x18\n" + + "\agateway\x18\x03 \x01(\tR\agateway\x12\x1f\n" + + "\vdns_servers\x18\x04 \x03(\tR\n" + + "dnsServers\x12\x14\n" + + "\x05slaac\x18\x05 \x01(\bR\x05slaac\x12-\n" + + "\x12privacy_extensions\x18\x06 \x01(\bR\x11privacyExtensions\x12\x1d\n" + + "\n" + + "link_local\x18\a \x01(\tR\tlinkLocal\"`\n" + + "\tRateLimit\x12\x1c\n" + + "\tbandwidth\x18\x01 \x01(\x03R\tbandwidth\x12\x1f\n" + + "\vrefill_time\x18\x02 \x01(\x03R\n" + + "refillTime\x12\x14\n" + + "\x05burst\x18\x03 \x01(\x03R\x05burst\"z\n" + + "\rConsoleConfig\x12\x18\n" + + "\aenabled\x18\x01 \x01(\bR\aenabled\x12\x16\n" + + "\x06output\x18\x02 \x01(\tR\x06output\x12\x14\n" + + "\x05input\x18\x03 \x01(\tR\x05input\x12!\n" + + "\fconsole_type\x18\x04 \x01(\tR\vconsoleType\"\x81\x01\n" + + "\x0fCreateVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x128\n" + + "\x06config\x18\x02 \x01(\v2 .metal.vmprovisioner.v1.VmConfigR\x06config\x12\x1f\n" + + "\vcustomer_id\x18\x03 \x01(\tR\n" + + "customerId\"^\n" + + "\x10CreateVmResponse\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"<\n" + + "\x0fDeleteVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x12\x14\n" + + "\x05force\x18\x02 \x01(\bR\x05force\",\n" + + "\x10DeleteVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\"$\n" + + "\rBootVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\"a\n" + + "\x0eBootVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"g\n" + + "\x11ShutdownVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x12\x14\n" + + "\x05force\x18\x02 
\x01(\bR\x05force\x12'\n" + + "\x0ftimeout_seconds\x18\x03 \x01(\x05R\x0etimeoutSeconds\"e\n" + + "\x12ShutdownVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"%\n" + + "\x0ePauseVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\"b\n" + + "\x0fPauseVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"&\n" + + "\x0fResumeVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\"c\n" + + "\x10ResumeVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"<\n" + + "\x0fRebootVmRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x12\x14\n" + + "\x05force\x18\x02 \x01(\bR\x05force\"c\n" + + "\x10RebootVmResponse\x12\x18\n" + + "\asuccess\x18\x01 \x01(\bR\asuccess\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\"'\n" + + "\x10GetVmInfoRequest\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\"\xbf\x03\n" + + "\x11GetVmInfoResponse\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x128\n" + + "\x06config\x18\x02 \x01(\v2 .metal.vmprovisioner.v1.VmConfigR\x06config\x125\n" + + "\x05state\x18\x03 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\x12;\n" + + "\ametrics\x18\x04 \x01(\v2!.metal.vmprovisioner.v1.VmMetricsR\ametrics\x12]\n" + + "\fbackend_info\x18\x05 \x03(\v2:.metal.vmprovisioner.v1.GetVmInfoResponse.BackendInfoEntryR\vbackendInfo\x12H\n" + + "\fnetwork_info\x18\x06 \x01(\v2%.metal.vmprovisioner.v1.VmNetworkInfoR\vnetworkInfo\x1a>\n" + + "\x10BackendInfoEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"m\n" + + "\vPortMapping\x12%\n" + + "\x0econtainer_port\x18\x01 \x01(\x05R\rcontainerPort\x12\x1b\n" + + "\thost_port\x18\x02 
\x01(\x05R\bhostPort\x12\x1a\n" + + "\bprotocol\x18\x03 \x01(\tR\bprotocol\"\xa0\x02\n" + + "\rVmNetworkInfo\x12\x1d\n" + + "\n" + + "ip_address\x18\x01 \x01(\tR\tipAddress\x12\x1f\n" + + "\vmac_address\x18\x02 \x01(\tR\n" + + "macAddress\x12\x1d\n" + + "\n" + + "tap_device\x18\x03 \x01(\tR\ttapDevice\x12+\n" + + "\x11network_namespace\x18\x04 \x01(\tR\x10networkNamespace\x12\x18\n" + + "\agateway\x18\x05 \x01(\tR\agateway\x12\x1f\n" + + "\vdns_servers\x18\x06 \x03(\tR\n" + + "dnsServers\x12H\n" + + "\rport_mappings\x18\a \x03(\v2#.metal.vmprovisioner.v1.PortMappingR\fportMappings\"\xa2\x02\n" + + "\tVmMetrics\x12*\n" + + "\x11cpu_usage_percent\x18\x01 \x01(\x01R\x0fcpuUsagePercent\x12,\n" + + "\x12memory_usage_bytes\x18\x02 \x01(\x03R\x10memoryUsageBytes\x12I\n" + + "\rnetwork_stats\x18\x03 \x01(\v2$.metal.vmprovisioner.v1.NetworkStatsR\fnetworkStats\x12I\n" + + "\rstorage_stats\x18\x04 \x01(\v2$.metal.vmprovisioner.v1.StorageStatsR\fstorageStats\x12%\n" + + "\x0euptime_seconds\x18\x05 \x01(\x03R\ruptimeSeconds\"\xbe\x01\n" + + "\fNetworkStats\x12%\n" + + "\x0ebytes_received\x18\x01 \x01(\x03R\rbytesReceived\x12+\n" + + "\x11bytes_transmitted\x18\x02 \x01(\x03R\x10bytesTransmitted\x12)\n" + + "\x10packets_received\x18\x03 \x01(\x03R\x0fpacketsReceived\x12/\n" + + "\x13packets_transmitted\x18\x04 \x01(\x03R\x12packetsTransmitted\"\xa6\x01\n" + + "\fStorageStats\x12\x1d\n" + + "\n" + + "bytes_read\x18\x01 \x01(\x03R\tbytesRead\x12#\n" + + "\rbytes_written\x18\x02 \x01(\x03R\fbytesWritten\x12'\n" + + "\x0fread_operations\x18\x03 \x01(\x03R\x0ereadOperations\x12)\n" + + "\x10write_operations\x18\x04 \x01(\x03R\x0fwriteOperations\"\x90\x01\n" + + "\x0eListVmsRequest\x12B\n" + + "\fstate_filter\x18\x01 \x03(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\vstateFilter\x12\x1b\n" + + "\tpage_size\x18\x02 \x01(\x05R\bpageSize\x12\x1d\n" + + "\n" + + "page_token\x18\x03 \x01(\tR\tpageToken\"\x8c\x01\n" + + "\x0fListVmsResponse\x120\n" + + "\x03vms\x18\x01 
\x03(\v2\x1e.metal.vmprovisioner.v1.VmInfoR\x03vms\x12&\n" + + "\x0fnext_page_token\x18\x02 \x01(\tR\rnextPageToken\x12\x1f\n" + + "\vtotal_count\x18\x03 \x01(\x05R\n" + + "totalCount\"\xa3\x03\n" + + "\x06VmInfo\x12\x13\n" + + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x125\n" + + "\x05state\x18\x02 \x01(\x0e2\x1f.metal.vmprovisioner.v1.VmStateR\x05state\x12\x1d\n" + + "\n" + + "vcpu_count\x18\x03 \x01(\x05R\tvcpuCount\x12*\n" + + "\x11memory_size_bytes\x18\x04 \x01(\x03R\x0fmemorySizeBytes\x12+\n" + + "\x11created_timestamp\x18\x05 \x01(\x03R\x10createdTimestamp\x12-\n" + + "\x12modified_timestamp\x18\x06 \x01(\x03R\x11modifiedTimestamp\x12H\n" + + "\bmetadata\x18\a \x03(\v2,.metal.vmprovisioner.v1.VmInfo.MetadataEntryR\bmetadata\x12\x1f\n" + + "\vcustomer_id\x18\b \x01(\tR\n" + + "customerId\x1a;\n" + + "\rMetadataEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01*{\n" + + "\aVmState\x12\x18\n" + + "\x14VM_STATE_UNSPECIFIED\x10\x00\x12\x14\n" + + "\x10VM_STATE_CREATED\x10\x01\x12\x14\n" + + "\x10VM_STATE_RUNNING\x10\x02\x12\x13\n" + + "\x0fVM_STATE_PAUSED\x10\x03\x12\x15\n" + + "\x11VM_STATE_SHUTDOWN\x10\x04*\x80\x01\n" + + "\vNetworkMode\x12\x1c\n" + + "\x18NETWORK_MODE_UNSPECIFIED\x10\x00\x12\x1b\n" + + "\x17NETWORK_MODE_DUAL_STACK\x10\x01\x12\x1a\n" + + "\x16NETWORK_MODE_IPV4_ONLY\x10\x02\x12\x1a\n" + + "\x16NETWORK_MODE_IPV6_ONLY\x10\x032\xdf\x06\n" + + "\tVmService\x12]\n" + + "\bCreateVm\x12'.metal.vmprovisioner.v1.CreateVmRequest\x1a(.metal.vmprovisioner.v1.CreateVmResponse\x12]\n" + + "\bDeleteVm\x12'.metal.vmprovisioner.v1.DeleteVmRequest\x1a(.metal.vmprovisioner.v1.DeleteVmResponse\x12W\n" + + "\x06BootVm\x12%.metal.vmprovisioner.v1.BootVmRequest\x1a&.metal.vmprovisioner.v1.BootVmResponse\x12c\n" + + "\n" + + "ShutdownVm\x12).metal.vmprovisioner.v1.ShutdownVmRequest\x1a*.metal.vmprovisioner.v1.ShutdownVmResponse\x12Z\n" + + 
"\aPauseVm\x12&.metal.vmprovisioner.v1.PauseVmRequest\x1a'.metal.vmprovisioner.v1.PauseVmResponse\x12]\n" + + "\bResumeVm\x12'.metal.vmprovisioner.v1.ResumeVmRequest\x1a(.metal.vmprovisioner.v1.ResumeVmResponse\x12]\n" + + "\bRebootVm\x12'.metal.vmprovisioner.v1.RebootVmRequest\x1a(.metal.vmprovisioner.v1.RebootVmResponse\x12`\n" + + "\tGetVmInfo\x12(.metal.vmprovisioner.v1.GetVmInfoRequest\x1a).metal.vmprovisioner.v1.GetVmInfoResponse\x12Z\n" + + "\aListVms\x12&.metal.vmprovisioner.v1.ListVmsRequest\x1a'.metal.vmprovisioner.v1.ListVmsResponseBNZLgithub.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1;vmprovisionerv1b\x06proto3" + +var ( + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescOnce sync.Once + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescData []byte +) + +func file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescGZIP() []byte { + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescOnce.Do(func() { + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDesc), len(file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDesc))) + }) + return file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDescData +} + +var file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes = make([]protoimpl.MessageInfo, 43) +var file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_goTypes = []any{ + (VmState)(0), // 0: metal.vmprovisioner.v1.VmState + (NetworkMode)(0), // 1: metal.vmprovisioner.v1.NetworkMode + (*VmConfig)(nil), // 2: metal.vmprovisioner.v1.VmConfig + (*CpuConfig)(nil), // 3: metal.vmprovisioner.v1.CpuConfig + (*CpuTopology)(nil), // 4: metal.vmprovisioner.v1.CpuTopology + (*MemoryConfig)(nil), // 5: metal.vmprovisioner.v1.MemoryConfig + (*BootConfig)(nil), // 6: 
metal.vmprovisioner.v1.BootConfig + (*StorageDevice)(nil), // 7: metal.vmprovisioner.v1.StorageDevice + (*NetworkInterface)(nil), // 8: metal.vmprovisioner.v1.NetworkInterface + (*IPv4Config)(nil), // 9: metal.vmprovisioner.v1.IPv4Config + (*IPv6Config)(nil), // 10: metal.vmprovisioner.v1.IPv6Config + (*RateLimit)(nil), // 11: metal.vmprovisioner.v1.RateLimit + (*ConsoleConfig)(nil), // 12: metal.vmprovisioner.v1.ConsoleConfig + (*CreateVmRequest)(nil), // 13: metal.vmprovisioner.v1.CreateVmRequest + (*CreateVmResponse)(nil), // 14: metal.vmprovisioner.v1.CreateVmResponse + (*DeleteVmRequest)(nil), // 15: metal.vmprovisioner.v1.DeleteVmRequest + (*DeleteVmResponse)(nil), // 16: metal.vmprovisioner.v1.DeleteVmResponse + (*BootVmRequest)(nil), // 17: metal.vmprovisioner.v1.BootVmRequest + (*BootVmResponse)(nil), // 18: metal.vmprovisioner.v1.BootVmResponse + (*ShutdownVmRequest)(nil), // 19: metal.vmprovisioner.v1.ShutdownVmRequest + (*ShutdownVmResponse)(nil), // 20: metal.vmprovisioner.v1.ShutdownVmResponse + (*PauseVmRequest)(nil), // 21: metal.vmprovisioner.v1.PauseVmRequest + (*PauseVmResponse)(nil), // 22: metal.vmprovisioner.v1.PauseVmResponse + (*ResumeVmRequest)(nil), // 23: metal.vmprovisioner.v1.ResumeVmRequest + (*ResumeVmResponse)(nil), // 24: metal.vmprovisioner.v1.ResumeVmResponse + (*RebootVmRequest)(nil), // 25: metal.vmprovisioner.v1.RebootVmRequest + (*RebootVmResponse)(nil), // 26: metal.vmprovisioner.v1.RebootVmResponse + (*GetVmInfoRequest)(nil), // 27: metal.vmprovisioner.v1.GetVmInfoRequest + (*GetVmInfoResponse)(nil), // 28: metal.vmprovisioner.v1.GetVmInfoResponse + (*PortMapping)(nil), // 29: metal.vmprovisioner.v1.PortMapping + (*VmNetworkInfo)(nil), // 30: metal.vmprovisioner.v1.VmNetworkInfo + (*VmMetrics)(nil), // 31: metal.vmprovisioner.v1.VmMetrics + (*NetworkStats)(nil), // 32: metal.vmprovisioner.v1.NetworkStats + (*StorageStats)(nil), // 33: metal.vmprovisioner.v1.StorageStats + (*ListVmsRequest)(nil), // 34: 
metal.vmprovisioner.v1.ListVmsRequest + (*ListVmsResponse)(nil), // 35: metal.vmprovisioner.v1.ListVmsResponse + (*VmInfo)(nil), // 36: metal.vmprovisioner.v1.VmInfo + nil, // 37: metal.vmprovisioner.v1.VmConfig.MetadataEntry + nil, // 38: metal.vmprovisioner.v1.CpuConfig.FeaturesEntry + nil, // 39: metal.vmprovisioner.v1.MemoryConfig.BackingEntry + nil, // 40: metal.vmprovisioner.v1.BootConfig.BootOptionsEntry + nil, // 41: metal.vmprovisioner.v1.StorageDevice.OptionsEntry + nil, // 42: metal.vmprovisioner.v1.NetworkInterface.OptionsEntry + nil, // 43: metal.vmprovisioner.v1.GetVmInfoResponse.BackendInfoEntry + nil, // 44: metal.vmprovisioner.v1.VmInfo.MetadataEntry +} +var file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_depIdxs = []int32{ + 3, // 0: metal.vmprovisioner.v1.VmConfig.cpu:type_name -> metal.vmprovisioner.v1.CpuConfig + 5, // 1: metal.vmprovisioner.v1.VmConfig.memory:type_name -> metal.vmprovisioner.v1.MemoryConfig + 6, // 2: metal.vmprovisioner.v1.VmConfig.boot:type_name -> metal.vmprovisioner.v1.BootConfig + 7, // 3: metal.vmprovisioner.v1.VmConfig.storage:type_name -> metal.vmprovisioner.v1.StorageDevice + 8, // 4: metal.vmprovisioner.v1.VmConfig.network:type_name -> metal.vmprovisioner.v1.NetworkInterface + 12, // 5: metal.vmprovisioner.v1.VmConfig.console:type_name -> metal.vmprovisioner.v1.ConsoleConfig + 37, // 6: metal.vmprovisioner.v1.VmConfig.metadata:type_name -> metal.vmprovisioner.v1.VmConfig.MetadataEntry + 4, // 7: metal.vmprovisioner.v1.CpuConfig.topology:type_name -> metal.vmprovisioner.v1.CpuTopology + 38, // 8: metal.vmprovisioner.v1.CpuConfig.features:type_name -> metal.vmprovisioner.v1.CpuConfig.FeaturesEntry + 39, // 9: metal.vmprovisioner.v1.MemoryConfig.backing:type_name -> metal.vmprovisioner.v1.MemoryConfig.BackingEntry + 40, // 10: metal.vmprovisioner.v1.BootConfig.boot_options:type_name -> metal.vmprovisioner.v1.BootConfig.BootOptionsEntry + 41, // 11: metal.vmprovisioner.v1.StorageDevice.options:type_name -> 
metal.vmprovisioner.v1.StorageDevice.OptionsEntry + 42, // 12: metal.vmprovisioner.v1.NetworkInterface.options:type_name -> metal.vmprovisioner.v1.NetworkInterface.OptionsEntry + 9, // 13: metal.vmprovisioner.v1.NetworkInterface.ipv4_config:type_name -> metal.vmprovisioner.v1.IPv4Config + 10, // 14: metal.vmprovisioner.v1.NetworkInterface.ipv6_config:type_name -> metal.vmprovisioner.v1.IPv6Config + 1, // 15: metal.vmprovisioner.v1.NetworkInterface.mode:type_name -> metal.vmprovisioner.v1.NetworkMode + 11, // 16: metal.vmprovisioner.v1.NetworkInterface.rx_rate_limit:type_name -> metal.vmprovisioner.v1.RateLimit + 11, // 17: metal.vmprovisioner.v1.NetworkInterface.tx_rate_limit:type_name -> metal.vmprovisioner.v1.RateLimit + 2, // 18: metal.vmprovisioner.v1.CreateVmRequest.config:type_name -> metal.vmprovisioner.v1.VmConfig + 0, // 19: metal.vmprovisioner.v1.CreateVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 0, // 20: metal.vmprovisioner.v1.BootVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 0, // 21: metal.vmprovisioner.v1.ShutdownVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 0, // 22: metal.vmprovisioner.v1.PauseVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 0, // 23: metal.vmprovisioner.v1.ResumeVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 0, // 24: metal.vmprovisioner.v1.RebootVmResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 2, // 25: metal.vmprovisioner.v1.GetVmInfoResponse.config:type_name -> metal.vmprovisioner.v1.VmConfig + 0, // 26: metal.vmprovisioner.v1.GetVmInfoResponse.state:type_name -> metal.vmprovisioner.v1.VmState + 31, // 27: metal.vmprovisioner.v1.GetVmInfoResponse.metrics:type_name -> metal.vmprovisioner.v1.VmMetrics + 43, // 28: metal.vmprovisioner.v1.GetVmInfoResponse.backend_info:type_name -> metal.vmprovisioner.v1.GetVmInfoResponse.BackendInfoEntry + 30, // 29: metal.vmprovisioner.v1.GetVmInfoResponse.network_info:type_name -> 
metal.vmprovisioner.v1.VmNetworkInfo + 29, // 30: metal.vmprovisioner.v1.VmNetworkInfo.port_mappings:type_name -> metal.vmprovisioner.v1.PortMapping + 32, // 31: metal.vmprovisioner.v1.VmMetrics.network_stats:type_name -> metal.vmprovisioner.v1.NetworkStats + 33, // 32: metal.vmprovisioner.v1.VmMetrics.storage_stats:type_name -> metal.vmprovisioner.v1.StorageStats + 0, // 33: metal.vmprovisioner.v1.ListVmsRequest.state_filter:type_name -> metal.vmprovisioner.v1.VmState + 36, // 34: metal.vmprovisioner.v1.ListVmsResponse.vms:type_name -> metal.vmprovisioner.v1.VmInfo + 0, // 35: metal.vmprovisioner.v1.VmInfo.state:type_name -> metal.vmprovisioner.v1.VmState + 44, // 36: metal.vmprovisioner.v1.VmInfo.metadata:type_name -> metal.vmprovisioner.v1.VmInfo.MetadataEntry + 13, // 37: metal.vmprovisioner.v1.VmService.CreateVm:input_type -> metal.vmprovisioner.v1.CreateVmRequest + 15, // 38: metal.vmprovisioner.v1.VmService.DeleteVm:input_type -> metal.vmprovisioner.v1.DeleteVmRequest + 17, // 39: metal.vmprovisioner.v1.VmService.BootVm:input_type -> metal.vmprovisioner.v1.BootVmRequest + 19, // 40: metal.vmprovisioner.v1.VmService.ShutdownVm:input_type -> metal.vmprovisioner.v1.ShutdownVmRequest + 21, // 41: metal.vmprovisioner.v1.VmService.PauseVm:input_type -> metal.vmprovisioner.v1.PauseVmRequest + 23, // 42: metal.vmprovisioner.v1.VmService.ResumeVm:input_type -> metal.vmprovisioner.v1.ResumeVmRequest + 25, // 43: metal.vmprovisioner.v1.VmService.RebootVm:input_type -> metal.vmprovisioner.v1.RebootVmRequest + 27, // 44: metal.vmprovisioner.v1.VmService.GetVmInfo:input_type -> metal.vmprovisioner.v1.GetVmInfoRequest + 34, // 45: metal.vmprovisioner.v1.VmService.ListVms:input_type -> metal.vmprovisioner.v1.ListVmsRequest + 14, // 46: metal.vmprovisioner.v1.VmService.CreateVm:output_type -> metal.vmprovisioner.v1.CreateVmResponse + 16, // 47: metal.vmprovisioner.v1.VmService.DeleteVm:output_type -> metal.vmprovisioner.v1.DeleteVmResponse + 18, // 48: 
metal.vmprovisioner.v1.VmService.BootVm:output_type -> metal.vmprovisioner.v1.BootVmResponse + 20, // 49: metal.vmprovisioner.v1.VmService.ShutdownVm:output_type -> metal.vmprovisioner.v1.ShutdownVmResponse + 22, // 50: metal.vmprovisioner.v1.VmService.PauseVm:output_type -> metal.vmprovisioner.v1.PauseVmResponse + 24, // 51: metal.vmprovisioner.v1.VmService.ResumeVm:output_type -> metal.vmprovisioner.v1.ResumeVmResponse + 26, // 52: metal.vmprovisioner.v1.VmService.RebootVm:output_type -> metal.vmprovisioner.v1.RebootVmResponse + 28, // 53: metal.vmprovisioner.v1.VmService.GetVmInfo:output_type -> metal.vmprovisioner.v1.GetVmInfoResponse + 35, // 54: metal.vmprovisioner.v1.VmService.ListVms:output_type -> metal.vmprovisioner.v1.ListVmsResponse + 46, // [46:55] is the sub-list for method output_type + 37, // [37:46] is the sub-list for method input_type + 37, // [37:37] is the sub-list for extension type_name + 37, // [37:37] is the sub-list for extension extendee + 0, // [0:37] is the sub-list for field type_name +} + +func init() { file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_init() } +func file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_init() { + if File_proto_metal_vmprovisioner_v1_vmprovisioner_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDesc), len(file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_rawDesc)), + NumEnums: 2, + NumMessages: 43, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_goTypes, + DependencyIndexes: file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_depIdxs, + EnumInfos: file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_enumTypes, + MessageInfos: file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_msgTypes, + }.Build() + 
File_proto_metal_vmprovisioner_v1_vmprovisioner_proto = out.File + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_goTypes = nil + file_proto_metal_vmprovisioner_v1_vmprovisioner_proto_depIdxs = nil +} diff --git a/go/gen/proto/metald/v1/vm.pb.go b/go/gen/proto/metald/v1/vm.pb.go index 1d20485d56..eed8b3d933 100644 --- a/go/gen/proto/metald/v1/vm.pb.go +++ b/go/gen/proto/metald/v1/vm.pb.go @@ -82,19 +82,21 @@ func (VmState) EnumDescriptor() ([]byte, []int) { type VmConfig struct { state protoimpl.MessageState `protogen:"open.v1"` // CPU configuration - Cpu int32 `protobuf:"varint,1,opt,name=cpu,proto3" json:"cpu,omitempty"` + VcpuCount uint32 `protobuf:"varint,1,opt,name=vcpu_count,json=vcpuCount,proto3" json:"vcpu_count,omitempty"` // Memory configuration - Memory int64 `protobuf:"varint,2,opt,name=memory,proto3" json:"memory,omitempty"` + MemorySizeMib uint64 `protobuf:"varint,2,opt,name=memory_size_mib,json=memorySizeMib,proto3" json:"memory_size_mib,omitempty"` // Boot configuration Boot string `protobuf:"bytes,3,opt,name=boot,proto3" json:"boot,omitempty"` // Network configuration NetworkConfig string `protobuf:"bytes,4,opt,name=network_config,json=networkConfig,proto3" json:"network_config,omitempty"` // Console configuration Console *ConsoleConfig `protobuf:"bytes,5,opt,name=console,proto3" json:"console,omitempty"` - // Metadata and labels - Metadata map[string]string `protobuf:"bytes,6,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + // Storage configuration + Storage *StorageDevice `protobuf:"bytes,6,opt,name=storage,proto3" json:"storage,omitempty"` // VM Identifier - Id string `protobuf:"bytes,7,opt,name=id,proto3" json:"id,omitempty"` + Id string `protobuf:"bytes,7,opt,name=id,proto3" json:"id,omitempty"` + // Metadata and labels + Metadata map[string]string `protobuf:"bytes,8,rep,name=metadata,proto3" json:"metadata,omitempty" protobuf_key:"bytes,1,opt,name=key" 
protobuf_val:"bytes,2,opt,name=value"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -129,16 +131,16 @@ func (*VmConfig) Descriptor() ([]byte, []int) { return file_metald_v1_vm_proto_rawDescGZIP(), []int{0} } -func (x *VmConfig) GetCpu() int32 { +func (x *VmConfig) GetVcpuCount() uint32 { if x != nil { - return x.Cpu + return x.VcpuCount } return 0 } -func (x *VmConfig) GetMemory() int64 { +func (x *VmConfig) GetMemorySizeMib() uint64 { if x != nil { - return x.Memory + return x.MemorySizeMib } return 0 } @@ -164,9 +166,9 @@ func (x *VmConfig) GetConsole() *ConsoleConfig { return nil } -func (x *VmConfig) GetMetadata() map[string]string { +func (x *VmConfig) GetStorage() *StorageDevice { if x != nil { - return x.Metadata + return x.Storage } return nil } @@ -178,6 +180,13 @@ func (x *VmConfig) GetId() string { return "" } +func (x *VmConfig) GetMetadata() map[string]string { + if x != nil { + return x.Metadata + } + return nil +} + type ListVmsRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // Optional filter by state @@ -244,7 +253,7 @@ type ListVmsResponse struct { state protoimpl.MessageState `protogen:"open.v1"` Vms []*VmInfo `protobuf:"bytes,1,rep,name=vms,proto3" json:"vms,omitempty"` NextPageToken string `protobuf:"bytes,2,opt,name=next_page_token,json=nextPageToken,proto3" json:"next_page_token,omitempty"` - TotalCount int64 `protobuf:"varint,3,opt,name=total_count,json=totalCount,proto3" json:"total_count,omitempty"` + TotalCount uint64 `protobuf:"varint,3,opt,name=total_count,json=totalCount,proto3" json:"total_count,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -293,7 +302,7 @@ func (x *ListVmsResponse) GetNextPageToken() string { return "" } -func (x *ListVmsResponse) GetTotalCount() int64 { +func (x *ListVmsResponse) GetTotalCount() uint64 { if x != nil { return x.TotalCount } @@ -1665,15 +1674,17 @@ var File_metald_v1_vm_proto protoreflect.FileDescriptor const 
file_metald_v1_vm_proto_rawDesc = "" + "\n" + - "\x12metald/v1/vm.proto\x12\tmetald.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x17metald/v1/network.proto\x1a\x17metald/v1/storage.proto\"\xaf\x02\n" + - "\bVmConfig\x12\x10\n" + - "\x03cpu\x18\x01 \x01(\x05R\x03cpu\x12\x16\n" + - "\x06memory\x18\x02 \x01(\x03R\x06memory\x12\x12\n" + + "\x12metald/v1/vm.proto\x12\tmetald.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x17metald/v1/network.proto\x1a\x17metald/v1/storage.proto\"\x80\x03\n" + + "\bVmConfig\x12\x1d\n" + + "\n" + + "vcpu_count\x18\x01 \x01(\rR\tvcpuCount\x12&\n" + + "\x0fmemory_size_mib\x18\x02 \x01(\x04R\rmemorySizeMib\x12\x12\n" + "\x04boot\x18\x03 \x01(\tR\x04boot\x12%\n" + "\x0enetwork_config\x18\x04 \x01(\tR\rnetworkConfig\x122\n" + - "\aconsole\x18\x05 \x01(\v2\x18.metald.v1.ConsoleConfigR\aconsole\x12=\n" + - "\bmetadata\x18\x06 \x03(\v2!.metald.v1.VmConfig.MetadataEntryR\bmetadata\x12\x0e\n" + - "\x02id\x18\a \x01(\tR\x02id\x1a;\n" + + "\aconsole\x18\x05 \x01(\v2\x18.metald.v1.ConsoleConfigR\aconsole\x122\n" + + "\astorage\x18\x06 \x01(\v2\x18.metald.v1.StorageDeviceR\astorage\x12\x0e\n" + + "\x02id\x18\a \x01(\tR\x02id\x12=\n" + + "\bmetadata\x18\b \x03(\v2!.metald.v1.VmConfig.MetadataEntryR\bmetadata\x1a;\n" + "\rMetadataEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x83\x01\n" + @@ -1685,7 +1696,7 @@ const file_metald_v1_vm_proto_rawDesc = "" + "\x0fListVmsResponse\x12#\n" + "\x03vms\x18\x01 \x03(\v2\x11.metald.v1.VmInfoR\x03vms\x12&\n" + "\x0fnext_page_token\x18\x02 \x01(\tR\rnextPageToken\x12\x1f\n" + - "\vtotal_count\x18\x03 \x01(\x03R\n" + + "\vtotal_count\x18\x03 \x01(\x04R\n" + "totalCount\"S\n" + "\x0fCreateVmRequest\x12\x13\n" + "\x05vm_id\x18\x01 \x01(\tR\x04vmId\x12+\n" + @@ -1846,42 +1857,44 @@ var file_metald_v1_vm_proto_goTypes = []any{ nil, // 31: metald.v1.CpuConfig.FeaturesEntry nil, // 32: metald.v1.MemoryConfig.BackingEntry nil, // 33: 
metald.v1.BootConfig.BootOptionsEntry - (*NetworkStats)(nil), // 34: metald.v1.NetworkStats - (*StorageStats)(nil), // 35: metald.v1.StorageStats - (*timestamppb.Timestamp)(nil), // 36: google.protobuf.Timestamp + (*StorageDevice)(nil), // 34: metald.v1.StorageDevice + (*NetworkStats)(nil), // 35: metald.v1.NetworkStats + (*StorageStats)(nil), // 36: metald.v1.StorageStats + (*timestamppb.Timestamp)(nil), // 37: google.protobuf.Timestamp } var file_metald_v1_vm_proto_depIdxs = []int32{ 27, // 0: metald.v1.VmConfig.console:type_name -> metald.v1.ConsoleConfig - 28, // 1: metald.v1.VmConfig.metadata:type_name -> metald.v1.VmConfig.MetadataEntry - 0, // 2: metald.v1.ListVmsRequest.state_filter:type_name -> metald.v1.VmState - 22, // 3: metald.v1.ListVmsResponse.vms:type_name -> metald.v1.VmInfo - 1, // 4: metald.v1.CreateVmRequest.config:type_name -> metald.v1.VmConfig - 0, // 5: metald.v1.CreateVmResponse.state:type_name -> metald.v1.VmState - 5, // 6: metald.v1.CreateVmResponse.endpoint:type_name -> metald.v1.Endpoint - 0, // 7: metald.v1.BootVmResponse.state:type_name -> metald.v1.VmState - 0, // 8: metald.v1.ShutdownVmResponse.state:type_name -> metald.v1.VmState - 0, // 9: metald.v1.PauseVmResponse.state:type_name -> metald.v1.VmState - 0, // 10: metald.v1.ResumeVmResponse.state:type_name -> metald.v1.VmState - 0, // 11: metald.v1.RebootVmResponse.state:type_name -> metald.v1.VmState - 1, // 12: metald.v1.GetVmInfoResponse.config:type_name -> metald.v1.VmConfig - 0, // 13: metald.v1.GetVmInfoResponse.state:type_name -> metald.v1.VmState - 21, // 14: metald.v1.GetVmInfoResponse.metrics:type_name -> metald.v1.VmMetrics - 29, // 15: metald.v1.GetVmInfoResponse.backend_info:type_name -> metald.v1.GetVmInfoResponse.BackendInfoEntry - 34, // 16: metald.v1.VmMetrics.network_stats:type_name -> metald.v1.NetworkStats - 35, // 17: metald.v1.VmMetrics.storage_stats:type_name -> metald.v1.StorageStats - 0, // 18: metald.v1.VmInfo.state:type_name -> metald.v1.VmState - 36, // 
19: metald.v1.VmInfo.created_timestamp:type_name -> google.protobuf.Timestamp - 36, // 20: metald.v1.VmInfo.modified_timestamp:type_name -> google.protobuf.Timestamp - 30, // 21: metald.v1.VmInfo.metadata:type_name -> metald.v1.VmInfo.MetadataEntry - 24, // 22: metald.v1.CpuConfig.topology:type_name -> metald.v1.CpuTopology - 31, // 23: metald.v1.CpuConfig.features:type_name -> metald.v1.CpuConfig.FeaturesEntry - 32, // 24: metald.v1.MemoryConfig.backing:type_name -> metald.v1.MemoryConfig.BackingEntry - 33, // 25: metald.v1.BootConfig.boot_options:type_name -> metald.v1.BootConfig.BootOptionsEntry - 26, // [26:26] is the sub-list for method output_type - 26, // [26:26] is the sub-list for method input_type - 26, // [26:26] is the sub-list for extension type_name - 26, // [26:26] is the sub-list for extension extendee - 0, // [0:26] is the sub-list for field type_name + 34, // 1: metald.v1.VmConfig.storage:type_name -> metald.v1.StorageDevice + 28, // 2: metald.v1.VmConfig.metadata:type_name -> metald.v1.VmConfig.MetadataEntry + 0, // 3: metald.v1.ListVmsRequest.state_filter:type_name -> metald.v1.VmState + 22, // 4: metald.v1.ListVmsResponse.vms:type_name -> metald.v1.VmInfo + 1, // 5: metald.v1.CreateVmRequest.config:type_name -> metald.v1.VmConfig + 0, // 6: metald.v1.CreateVmResponse.state:type_name -> metald.v1.VmState + 5, // 7: metald.v1.CreateVmResponse.endpoint:type_name -> metald.v1.Endpoint + 0, // 8: metald.v1.BootVmResponse.state:type_name -> metald.v1.VmState + 0, // 9: metald.v1.ShutdownVmResponse.state:type_name -> metald.v1.VmState + 0, // 10: metald.v1.PauseVmResponse.state:type_name -> metald.v1.VmState + 0, // 11: metald.v1.ResumeVmResponse.state:type_name -> metald.v1.VmState + 0, // 12: metald.v1.RebootVmResponse.state:type_name -> metald.v1.VmState + 1, // 13: metald.v1.GetVmInfoResponse.config:type_name -> metald.v1.VmConfig + 0, // 14: metald.v1.GetVmInfoResponse.state:type_name -> metald.v1.VmState + 21, // 15: 
metald.v1.GetVmInfoResponse.metrics:type_name -> metald.v1.VmMetrics + 29, // 16: metald.v1.GetVmInfoResponse.backend_info:type_name -> metald.v1.GetVmInfoResponse.BackendInfoEntry + 35, // 17: metald.v1.VmMetrics.network_stats:type_name -> metald.v1.NetworkStats + 36, // 18: metald.v1.VmMetrics.storage_stats:type_name -> metald.v1.StorageStats + 0, // 19: metald.v1.VmInfo.state:type_name -> metald.v1.VmState + 37, // 20: metald.v1.VmInfo.created_timestamp:type_name -> google.protobuf.Timestamp + 37, // 21: metald.v1.VmInfo.modified_timestamp:type_name -> google.protobuf.Timestamp + 30, // 22: metald.v1.VmInfo.metadata:type_name -> metald.v1.VmInfo.MetadataEntry + 24, // 23: metald.v1.CpuConfig.topology:type_name -> metald.v1.CpuTopology + 31, // 24: metald.v1.CpuConfig.features:type_name -> metald.v1.CpuConfig.FeaturesEntry + 32, // 25: metald.v1.MemoryConfig.backing:type_name -> metald.v1.MemoryConfig.BackingEntry + 33, // 26: metald.v1.BootConfig.boot_options:type_name -> metald.v1.BootConfig.BootOptionsEntry + 27, // [27:27] is the sub-list for method output_type + 27, // [27:27] is the sub-list for method input_type + 27, // [27:27] is the sub-list for extension type_name + 27, // [27:27] is the sub-list for extension extendee + 0, // [0:27] is the sub-list for field type_name } func init() { file_metald_v1_vm_proto_init() } diff --git a/go/proto/metal/vmprovisioner/v1/randomstuff.txt b/go/proto/metal/vmprovisioner/v1/randomstuff.txt new file mode 100644 index 0000000000..dbbc3f2a94 --- /dev/null +++ b/go/proto/metal/vmprovisioner/v1/randomstuff.txt @@ -0,0 +1,13 @@ +Metald receives a CreateVMRequest, in it is the customer_id, workspace_id, and environment_id. The combination of those three hashed with FNV should yield a stable identifier that's prefixed by "br" and remains < 15 characters. That identifier should then become the name of a network bridge. 
+ +The action taken should be to check if the bridge name exists + +# Create bridge for subnet 172.16.0.0/29 +sudo ip link add br-t0-sub0 type bridge +sudo ip addr add 172.16.0.1/29 dev br-t0-sub0 +sudo ip link set dev br-t0-sub0 up + +# Create TAPs for first subnet (5 VMs max in /29) +sudo ip tuntap add dev tap0-s0-vm1 mode tap +sudo ip link set dev tap0-s0-vm1 master br-t0-sub0 +sudo ip link set dev tap0-s0-vm1 up diff --git a/go/proto/metal/vmprovisioner/v1/vmprovisioner.proto b/go/proto/metal/vmprovisioner/v1/vmprovisioner.proto new file mode 100644 index 0000000000..7a6ef47d2f --- /dev/null +++ b/go/proto/metal/vmprovisioner/v1/vmprovisioner.proto @@ -0,0 +1,390 @@ +syntax = "proto3"; + +package metal.vmprovisioner.v1; + +option go_package = "github.com/unkeyed/unkey/go/gen/proto/metal/vmprovisioner/v1;vmprovisionerv1"; + +// VmService provides unified VM management across different hypervisor backends +service VmService { + // CreateVm creates a new virtual machine instance + rpc CreateVm(CreateVmRequest) returns (CreateVmResponse); + + // DeleteVm removes a virtual machine instance + rpc DeleteVm(DeleteVmRequest) returns (DeleteVmResponse); + + // BootVm starts a created virtual machine + rpc BootVm(BootVmRequest) returns (BootVmResponse); + + // ShutdownVm gracefully stops a running virtual machine + rpc ShutdownVm(ShutdownVmRequest) returns (ShutdownVmResponse); + + // PauseVm pauses a running virtual machine + rpc PauseVm(PauseVmRequest) returns (PauseVmResponse); + + // ResumeVm resumes a paused virtual machine + rpc ResumeVm(ResumeVmRequest) returns (ResumeVmResponse); + + // RebootVm restarts a running virtual machine + rpc RebootVm(RebootVmRequest) returns (RebootVmResponse); + + // GetVmInfo retrieves virtual machine status and configuration + rpc GetVmInfo(GetVmInfoRequest) returns (GetVmInfoResponse); + + // ListVms lists all virtual machines managed by this service + rpc ListVms(ListVmsRequest) returns (ListVmsResponse); +} + +// VM lifecycle 
states +enum VmState { + VM_STATE_UNSPECIFIED = 0; + VM_STATE_CREATED = 1; + VM_STATE_RUNNING = 2; + VM_STATE_PAUSED = 3; + VM_STATE_SHUTDOWN = 4; +} + +// Unified VM configuration that works across different hypervisors +message VmConfig { + // CPU configuration + CpuConfig cpu = 1; + + // Memory configuration + MemoryConfig memory = 2; + + // Boot configuration + BootConfig boot = 3; + + // Storage devices + repeated StorageDevice storage = 4; + + // Network interfaces + repeated NetworkInterface network = 5; + + // Console configuration + ConsoleConfig console = 6; + + // Metadata and labels + map<string, string> metadata = 7; +} + +message CpuConfig { + // Number of virtual CPUs to allocate at boot + int32 vcpu_count = 1; + + // Maximum number of virtual CPUs (for hotplug) + int32 max_vcpu_count = 2; + + // CPU topology (optional) + CpuTopology topology = 3; + + // CPU features and model (backend-specific) + map<string, string> features = 4; +} + +message CpuTopology { + int32 sockets = 1; + int32 cores_per_socket = 2; + int32 threads_per_core = 3; +} + +message MemoryConfig { + // Memory size in bytes + int64 size_bytes = 1; + + // Whether memory hotplug is enabled + bool hotplug_enabled = 2; + + // Maximum memory size for hotplug (bytes) + int64 max_size_bytes = 3; + + // Memory backing options (hugepages, etc.) + map<string, string> backing = 4; +} + +message BootConfig { + // Path to kernel image + string kernel_path = 1; + + // Path to initial ramdisk (optional) + string initrd_path = 2; + + // Kernel command line arguments + string kernel_args = 3; + + // Boot order and options + map<string, string> boot_options = 4; +} + +message StorageDevice { + // Unique identifier for this storage device + string id = 1; + + // Path to the backing file or block device + string path = 2; + + // Whether this device is read-only + bool read_only = 3; + + // Whether this is the root/boot device + bool is_root_device = 4; + + // Storage interface type (virtio-blk, nvme, etc.) 
+ string interface_type = 5; + + // Additional storage options + map<string, string> options = 6; +} + +message NetworkInterface { + // Unique identifier for this network interface + string id = 1; + + // MAC address (optional, will be generated if not provided) + string mac_address = 2; + + // Host-side TAP device name + string tap_device = 3; + + // Network interface type (virtio-net, e1000, etc.) + string interface_type = 4; + + // Additional network options + map<string, string> options = 5; + + // IPv4 configuration (optional) + IPv4Config ipv4_config = 6; + + // IPv6 configuration (optional) + IPv6Config ipv6_config = 7; + + // Network mode + NetworkMode mode = 8; + + // Rate limiting + RateLimit rx_rate_limit = 10; // Receive rate limit + RateLimit tx_rate_limit = 11; // Transmit rate limit +} + +// IPv4 network configuration +message IPv4Config { + string address = 1; // IPv4 address (e.g., "10.100.1.2") + string netmask = 2; // Network mask (e.g., "255.255.255.0") + string gateway = 3; // Default gateway + repeated string dns_servers = 4; // DNS servers + bool dhcp = 5; // Use DHCP instead of static config +} + +// IPv6 network configuration +message IPv6Config { + string address = 1; // IPv6 address (e.g., "fd00::1:2") + int32 prefix_length = 2; // Prefix length (e.g., 64) + string gateway = 3; // Default gateway + repeated string dns_servers = 4; // DNS servers (IPv6 addresses) + bool slaac = 5; // Use SLAAC (Stateless Address Autoconfiguration) + bool privacy_extensions = 6; // Enable privacy extensions + string link_local = 7; // Link-local address (auto-generated if empty) +} + +// Network mode for the interface +enum NetworkMode { + NETWORK_MODE_UNSPECIFIED = 0; + NETWORK_MODE_DUAL_STACK = 1; // Both IPv4 and IPv6 + NETWORK_MODE_IPV4_ONLY = 2; // IPv4 only + NETWORK_MODE_IPV6_ONLY = 3; // IPv6 only +} + +// Rate limiting configuration +message RateLimit { + int64 bandwidth = 1; // Bandwidth in bytes/second + int64 refill_time = 2; // Token bucket refill time in milliseconds + int64 
burst = 3; // Burst size in bytes +} + +message ConsoleConfig { + // Whether console is enabled + bool enabled = 1; + + // Console output destination (file path, pty, etc.) + string output = 2; + + // Console input source (optional) + string input = 3; + + // Console type (serial, virtio-console, etc.) + string console_type = 4; +} + +// Request/Response messages +message CreateVmRequest { + // Unique identifier for the VM (optional, will be generated if not provided) + string vm_id = 1; + + // VM configuration + VmConfig config = 2; + + // Customer identifier for billing and isolation + string customer_id = 3; +} + +message CreateVmResponse { + // Assigned VM identifier + string vm_id = 1; + + // Current VM state after creation + VmState state = 2; +} + +message DeleteVmRequest { + string vm_id = 1; + + // Whether to force deletion even if VM is running + bool force = 2; +} + +message DeleteVmResponse { bool success = 1; } + +message BootVmRequest { string vm_id = 1; } + +message BootVmResponse { + bool success = 1; + VmState state = 2; +} + +message ShutdownVmRequest { + string vm_id = 1; + + // Whether to force shutdown (vs graceful) + bool force = 2; + + // Timeout for graceful shutdown (seconds) + int32 timeout_seconds = 3; +} + +message ShutdownVmResponse { + bool success = 1; + VmState state = 2; +} + +message PauseVmRequest { string vm_id = 1; } + +message PauseVmResponse { + bool success = 1; + VmState state = 2; +} + +message ResumeVmRequest { string vm_id = 1; } + +message ResumeVmResponse { + bool success = 1; + VmState state = 2; +} + +message RebootVmRequest { + string vm_id = 1; + + // Whether to force reboot (vs graceful) + bool force = 2; +} + +message RebootVmResponse { + bool success = 1; + VmState state = 2; +} + +message GetVmInfoRequest { string vm_id = 1; } + +message GetVmInfoResponse { + string vm_id = 1; + VmConfig config = 2; + VmState state = 3; + VmMetrics metrics = 4; + + // Backend-specific information + map<string, string> backend_info = 5; + + // 
Network information if available + VmNetworkInfo network_info = 6; +} + +// Port mapping for VM network forwarding +message PortMapping { + int32 container_port = 1; // Port inside the VM + int32 host_port = 2; // Port on the host system + string protocol = 3; // Protocol (tcp, udp) +} + +// Network information for a VM +message VmNetworkInfo { + string ip_address = 1; + string mac_address = 2; + string tap_device = 3; + string network_namespace = 4; + string gateway = 5; + repeated string dns_servers = 6; + repeated PortMapping port_mappings = 7; // Port forwards from host to VM +} + +message VmMetrics { + // CPU usage percentage (0-100) + double cpu_usage_percent = 1; + + // Memory usage in bytes + int64 memory_usage_bytes = 2; + + // Network I/O statistics + NetworkStats network_stats = 3; + + // Storage I/O statistics + StorageStats storage_stats = 4; + + // VM uptime in seconds + int64 uptime_seconds = 5; +} + +message NetworkStats { + int64 bytes_received = 1; + int64 bytes_transmitted = 2; + int64 packets_received = 3; + int64 packets_transmitted = 4; +} + +message StorageStats { + int64 bytes_read = 1; + int64 bytes_written = 2; + int64 read_operations = 3; + int64 write_operations = 4; +} + +message ListVmsRequest { + // Optional filter by state + repeated VmState state_filter = 1; + + // Pagination + int32 page_size = 2; + string page_token = 3; +} + +message ListVmsResponse { + repeated VmInfo vms = 1; + string next_page_token = 2; + int32 total_count = 3; +} + +message VmInfo { + string vm_id = 1; + VmState state = 2; + + // Basic config info (subset of full config) + int32 vcpu_count = 3; + int64 memory_size_bytes = 4; + + // Creation and modification timestamps + int64 created_timestamp = 5; + int64 modified_timestamp = 6; + + // Metadata + map<string, string> metadata = 7; + + // Customer identifier + string customer_id = 8; +} diff --git a/go/proto/metald/v1/vm.proto b/go/proto/metald/v1/vm.proto index 8f2d2dc00f..6f3aba89a4 100644 --- a/go/proto/metald/v1/vm.proto 
+++ b/go/proto/metald/v1/vm.proto @@ -20,10 +20,10 @@ enum VmState { // Unified VM configuration that works across different hypervisors message VmConfig { // CPU configuration - int32 cpu = 1; + uint32 vcpu_count = 1; // Memory configuration - int64 memory = 2; + uint64 memory_size_mib = 2; // Boot configuration string boot = 3; @@ -34,11 +34,14 @@ message VmConfig { // Console configuration ConsoleConfig console = 5; - // Metadata and labels - map<string, string> metadata = 6; + // Storage configuration + StorageDevice storage = 6; // VM Identifier string id = 7; + + // Metadata and labels + map<string, string> metadata = 8; } message ListVmsRequest { @@ -53,7 +56,7 @@ message ListVmsRequest { message ListVmsResponse { repeated VmInfo vms = 1; string next_page_token = 2; - int64 total_count = 3; + uint64 total_count = 3; } // Request/Response messages @@ -65,7 +68,11 @@ message CreateVmRequest { VmConfig config = 2; } -message Endpoint { string host = 1; uint32 port = 2;} +message Endpoint { + string host = 1; + uint32 port = 2; +} + +message CreateVmResponse { // Current VM state after creation VmState state = 1;