diff --git a/Makefile b/Makefile index 2af9daf271..ccfeffa4bd 100644 --- a/Makefile +++ b/Makefile @@ -152,7 +152,11 @@ test-e2e-server: ## Run model-serving-api e2e tests. Requires the server deploye echo "METRICS_URL=$$MODEL_SERVING_API_METRICS_URL" && \ MODEL_SERVING_API_URL=$$MODEL_SERVING_API_URL \ MODEL_SERVING_API_METRICS_URL=$$MODEL_SERVING_API_METRICS_URL \ - go test -v -tags=e2e -parallel=12 ./server/test/e2e/ -v -count=1 + go test -v -tags=e2e -timeout=10m -parallel=12 ./server/test/e2e/ -v -count=1 + +.PHONY: test-e2e-controller +test-e2e-controller: ## Run controller e2e tests. Requires controller + gateway + Authorino deployed. + go test -v -tags=e2e -timeout=10m -parallel=12 ./internal/controller/test/e2e/ -count=1 .PHONY: e2e-kserve-overlay e2e-kserve-overlay: kustomize ## Create a kustomize overlay injecting the controller image for e2e tests. @@ -187,6 +191,7 @@ test-e2e-kserve-ocp: e2e-kserve-overlay ## Run KServe e2e tests on OpenShift. echo "=== Running KServe E2E Tests ====== '$(KSERVE_E2E_TEST_ARGS)'" && \ ./test/scripts/openshift-ci/run-e2e-tests.sh $(KSERVE_E2E_TEST_ARGS) $(MAKE) test-e2e-server + $(MAKE) test-e2e-controller .PHONY: lint lint: golangci-lint ## Run golangci-lint linter diff --git a/go.mod b/go.mod index dc809fcfc9..3bdbad61c3 100644 --- a/go.mod +++ b/go.mod @@ -101,6 +101,7 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.15.0 // indirect github.com/googleapis/google-cloud-go-testing v0.0.0-20210719221736-1c9a4c676720 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 // indirect @@ -115,9 +116,11 @@ require ( github.com/llm-d/llm-d-workload-variant-autoscaler v0.5.1 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/martinlindhe/base36 v1.1.1 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect diff --git a/go.sum b/go.sum index 6ad17185b0..8205630e56 100644 --- a/go.sum +++ b/go.sum @@ -61,6 +61,8 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -186,6 +188,8 @@ github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81 github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/googleapis/google-cloud-go-testing v0.0.0-20210719221736-1c9a4c676720 h1:zC34cGQu69FG7qzJ3WiKW244WfhDC3xxYMeNOX2gtUQ= github.com/googleapis/google-cloud-go-testing v0.0.0-20210719221736-1c9a4c676720/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= @@ -252,6 +256,8 @@ github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3v github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -260,6 +266,8 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= diff --git a/internal/controller/resources/template/BATCH.md b/internal/controller/resources/template/BATCH.md new file mode 100644 index 0000000000..1b2719487d --- /dev/null +++ b/internal/controller/resources/template/BATCH.md @@ -0,0 +1,920 @@ +# Batch APIs Authn/z + +Extend `authpolicy_llm_isvc_userdefined.yaml` to handle OpenAI-compatible batch API paths +with authentication and header injection. + +## Batch API paths + +``` +/v1/files GET, POST +/v1/files/{file_id} GET, DELETE +/v1/files/{file_id}/content GET +/v1/batches GET, POST +/v1/batches/{batch_id} GET +/v1/batches/{batch_id}/cancel POST +``` + +## Authorization model + +All batch API paths use **authentication only** at the gateway level (Kubernetes TokenReview). +No gateway-level authorization rule is needed — the batch service enforces access control +internally via tenant isolation (`TenantID` from `X-MaaS-User`): + +- Every DB query filters by `TenantID` +- Accessing a file/batch belonging to another user returns 404 +- Storage is isolated per tenant via `SHA256(tenantID)` folder names + +## Header injection + +For **all** batch API paths, inject these response headers: + +| Header | Source | Notes | +|-----------------|-------------------------------|------------------------------------------| +| `X-MaaS-User` | `auth.identity.user.username` | CEL expression in `plain` | +| `X-MaaS-Groups` | `auth.identity.user.groups` | Needs comma-separated output (see below) | + +### X-MaaS-Groups: comma-separated list + +`auth.identity.user.groups` is an array. To produce a comma-separated string, use a CEL +`join()` call: + +```yaml +x-maas-groups: + plain: + expression: "auth.identity.user.groups.join(',')" +``` + +### Scoping headers to batch paths only + +These headers must only be injected for batch API paths. Use `when` predicates on each +response header definition: + +```yaml +x-maas-user: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: auth.identity.user.username +``` + +## Impact on existing inference-access rule + +The current `inference-access` rule applies to all paths and interprets +`request.path.split("/")[1]` as namespace and `[2]` as ISVC name. Batch paths would produce +nonsensical lookups (namespace=`v1`, name=`files`). + +Add a `when` predicate to exclude batch paths: + +```yaml +authorization: + inference-access: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + kubernetesSubjectAccessReview: + # ... existing spec unchanged ... +``` + +When the `inference-access` rule is skipped (batch paths), no authorization rules remain, +so Authorino implicitly allows the request — authentication-only behavior. + +## Full template sketch + +Below is the authorization + response section of the updated +`authpolicy_llm_isvc_userdefined.yaml` (authentication section unchanged): + +```yaml + authorization: + inference-access: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + kubernetesSubjectAccessReview: + user: + expression: auth.identity.user.username + authorizationGroups: + expression: auth.identity.user.groups + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: get + priority: 1 + response: + success: + headers: + # Existing inference headers + x-gateway-inference-fairness-id: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + metrics: false + plain: + expression: auth.identity.fairness + priority: 0 + x-gateway-inference-objective: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + metrics: false + plain: + expression: auth.identity.objective + priority: 0 + # Batch headers + x-maas-user: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: auth.identity.user.username + priority: 0 + x-maas-groups: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: "auth.identity.user.groups.join(',')" + priority: 0 +``` + +## Upstream fix: Authorino response headers must overwrite, not append + +Authorino's response header injection currently **appends** to existing request headers instead +of overwriting them. If a client sends `x-maas-user: attacker` and Authorino injects +`x-maas-user: real-user`, the backend receives `attacker, real-user`. + +This affects **all** response headers — both batch headers (`x-maas-user`, `x-maas-groups`) and +flow control headers (`x-gateway-inference-fairness-id`, `x-gateway-inference-objective`). + +### Root cause + +In `authorino@v0.20.0/pkg/service/auth.go:420-435`, `buildResponseHeaders()` constructs +`HeaderValueOption` entries without setting the `AppendAction` field: + +```go +package main + +responseHeaders = append(responseHeaders, &envoy_core.HeaderValueOption{ + Header: &envoy_core.HeaderValue{ + Key: key, + Value: value, + }, +}) +``` + +The Envoy `HeaderValueOption` has two fields controlling this behavior: + +| Field | Default (ext_authz context) | Status | +|----------------|------------------------------------|------------| +| `Append` | `false` (overwrite) | Deprecated | +| `AppendAction` | `APPEND_IF_EXISTS_OR_ADD` (append) | Current | + +Since Authorino sets neither, Envoy uses the `AppendAction` default: **append**. + +### Required fix + +Authorino must explicitly set `AppendAction: OVERWRITE_IF_EXISTS_OR_ADD` on every +`HeaderValueOption`: + +```go +package main + +responseHeaders = append(responseHeaders, &envoy_core.HeaderValueOption{ + Header: &envoy_core.HeaderValue{ + Key: key, + Value: value, + }, + AppendAction: envoy_core.HeaderValueOption_OVERWRITE_IF_EXISTS_OR_ADD, +}) +``` + +This ensures Authorino-injected headers replace any client-supplied values with the same name, +preventing header injection attacks and multi-value concatenation. + +### Workaround: take the last header value + +Until the upstream fix lands, consumers of Authorino-injected headers must always use the +**last** value in multi-value headers. + +This is secure because Envoy's ext_authz pipeline guarantees ordering: client-supplied headers +come first, Authorino-injected values are appended last. A client cannot inject headers after +ext_authz processing, so the last value is always the trusted Authorino-injected one. + +Example: a client sends `x-maas-user: attacker`. After Authorino appends, the backend receives +`attacker, real-user`. Taking the last comma-separated value yields `real-user`. + +**Affected consumers**: + +| Header | Consumer | Status | +|-----------------------------------|--------------------------|--------------------------| +| `x-maas-user` | Batch service middleware | Needs fix | +| `x-maas-groups` | Batch service middleware | Needs fix | +| `x-gateway-inference-fairness-id` | EPP | Already safe (see below) | +| `x-gateway-inference-objective` | EPP | Already safe (see below) | + +**EPP is already safe**: the EPP reads headers via Envoy ext_proc, which delivers duplicate +headers as separate entries in the `Headers` slice. The EPP iterates and overwrites +(`reqCtx.Request.Headers[header.Key] = value`), so the last entry (Authorino's) wins +(`epp/handlers/request.go:57-67`). + +**Batch service** needs to take the last value when reading `X-MaaS-User` and `X-MaaS-Groups` +from request headers (split by `, ` and take the last element, or equivalent). + +Batch Gateway fixes + +- https://github.com/llm-d-incubation/batch-gateway/pull/87 +- https://github.com/llm-d-incubation/batch-gateway/pull/92 + +## Batch-gateway service compatibility + +The batch-gateway service (`batch-gateway` repo) implements the OpenAI `/v1/files` and +`/v1/batches` endpoints. The following changes are needed to make it compatible with the +Authorino authn/z flow described above. + +### 1. What already works — no changes needed + +- **ID format**: file IDs (`file_`) and batch IDs (`batch_`) remain unchanged. +- **Storage isolation**: `GetFolderNameByTenantID()` (`com.go:40-55`) uses `SHA256(tenantID)` + to derive folder names. +- **DB-level tenant isolation**: every query includes `TenantID` in the filter. A user + accessing a file/batch belonging to another user gets 404. +- **Listing** (`GET /v1/files`, `GET /v1/batches`): `ListFiles()` and `ListBatches()` already + filter by `TenantID` (from `X-MaaS-User`). Results are scoped to the calling user. + Pagination uses integer cursors, not IDs. +- **Batch input file reference**: `POST /v1/batches` references an `input_file_id` in the + request body. The batch service looks up the file by full ID with tenant isolation. + +## Batch processor → gateway → inference backend authn/z + +The batch processor (`cmd/batch-processor/main.go`) dequeues jobs and sends individual +inference requests to model serving backends. These requests must go **through the gateway** +so that Authorino handles per-model authorization using the original user's identity. + +``` +User ──► Gateway (Authorino) ──► Batch API server (batch creation) + │ + ▼ + Queue + │ + ▼ + Batch Processor + │ + ▼ + Gateway (Authorino) (per-model authz) ──► Inference Backend +``` + +### Current state + +- **Routing**: processor sends requests directly to the backend, bypassing the gateway + (`inference/client.go` — `gateway_url` config points to the backend, not the gateway). +- **Authentication**: shared API key (`inference/client.go:94`), not a Kubernetes token. +- **Authorization**: none — the processor does not check whether the original user has + permission to access the models referenced in the batch. +- **User context**: the API server stores `TenantID` (username) with the batch record, but + the processor does not forward `X-MaaS-User` or `X-MaaS-Groups` to the backend. + +### Required changes + +#### 1. Store user context at batch creation time + +The API server must persist the user's identity with the batch job so the processor can +forward it later. Currently only `TenantID` (username) is stored. + +**Needed fields** (in `BatchItem` / `BatchSpec` or a new `UserContext` struct): + +| Field | Source header | Purpose | +|------------|-----------------|----------------------------------------| +| `Username` | `X-MaaS-User` | Forwarded to gateway for delegated SAR | +| `Groups` | `X-MaaS-Groups` | Forwarded to gateway for delegated SAR | + +These headers are injected by Authorino at batch creation time and should be extracted by +the API server middleware and stored alongside the batch record. + +**Files**: `batch_handler.go` (CreateBatch), `batch_item.go` (storage schema) + +#### 2. Route inference requests through the gateway + +Change the processor's target URL from the inference backend to the gateway. + +```yaml +# cmd/batch-processor/config.yaml +inference_config: + gateway_url: "https://" # was: http://localhost:8000 (direct to backend) +``` + +The processor authenticates to the gateway with its **own ServiceAccount token** (not the +original user's token, which may have expired). It forwards the original user's identity +via `X-MaaS-User` and `X-MaaS-Groups` headers. + +```go +package inference + +// inference/client.go – per-request setup +saToken := getServiceAccountToken() // mounted at /var/run/secrets/kubernetes.io/serviceaccount/token +req.Header.Set("Authorization", "Bearer " + saToken) +req.Header.Set("X-MaaS-User", job.UserContext.Username) +req.Header.Set("X-MaaS-Groups", job.UserContext.Groups) +``` + +**File**: `inference/client.go` (replace API key auth with SA token + forwarded headers) + +#### 3. Two authorization rules: base access + delegation + +The gateway uses **two** authorization rules evaluated with AND semantics (both must pass). +This ensures proper separation of concerns: + +- **Rule 1 (`inference-access`)**: checks the **forwarded user** (or authenticated user if + no header) for `get llminferenceservices` — can they access this model? +- **Rule 2 (`inference-access-delegate`)**: only fires when `x-maas-user` is present, checks + the **authenticated caller** for `post-delegate llminferenceservices/delegate` — are they + a trusted delegator (e.g., the batch processor)? + +| Scenario | Rule 1 (`inference-access`) | Rule 2 (`inference-access-delegate`) | Result | +|-------------------------------|----------------------------------------------------|-----------------------------------------------------------------------|-----------------------------------| +| Standard (no `x-maas-user`) | `get llminferenceservices` for authenticated user | **skipped** (when predicate false) | Pass if user has `get` | +| Delegated (`x-maas-user` set) | `get llminferenceservices` for forwarded user | `post-delegate llminferenceservices/delegate` for authenticated caller | Pass if both checks pass | +| Batch path | **skipped** (batch path excluded) | **skipped** (batch path excluded) | Authn-only | + +**Groups handling**: `x-maas-user` and `x-maas-groups` are treated as an all-or-nothing pair +in rule 1. When `x-maas-user` is present but `x-maas-groups` is absent, groups default to +`[]` (empty) rather than leaking the authenticated caller's groups. This prevents creating +a hybrid principal (forwarded user's identity with the caller's group memberships). + +```yaml +authorization: + # Rule 1: base inference access — always fires on inference paths. + # Checks that the effective user (forwarded or authenticated) can + # get the target llminferenceservice. + inference-access: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + kubernetesSubjectAccessReview: + user: + expression: "'x-maas-user' in request.headers ? request.headers['x-maas-user'] : auth.identity.user.username" + authorizationGroups: + expression: "'x-maas-user' in request.headers ? ('x-maas-groups' in request.headers ? request.headers['x-maas-groups'].split(',') : []) : auth.identity.user.groups" + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: get + priority: 1 + + # Rule 2: delegate access — only fires when x-maas-user header is present. + # Checks that the authenticated caller (batch processor) has the delegate permission. + inference-access-delegate: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + - predicate: "'x-maas-user' in request.headers" + kubernetesSubjectAccessReview: + user: + expression: "auth.identity.user.username" + authorizationGroups: + expression: "auth.identity.user.groups" + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices/delegate + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: post-delegate + priority: 1 +``` + +**Security model**: the two-rule design enforces proper delegation: + +- Rule 1 ensures the **forwarded user** can `get` the target model — without this, + a batch processor could send requests on behalf of a user who has no access. +- Rule 2 ensures the **authenticated caller** has `post-delegate llminferenceservices/delegate`, + which should only be granted to trusted delegators like the batch processor SA. +- **Header spoofing is blocked**: a regular user spoofing `x-maas-user` would trigger rule 2, + which checks the **caller's** (not the forwarded user's) `post-delegate` permission. + Since regular users don't have `post-delegate`, the request is rejected with 403. + +#### 4. Flow control: Fairness and objective + +Because the processor now routes through the gateway, Authorino injects +`x-gateway-inference-fairness-id` and `x-gateway-inference-objective` headers automatically +via the existing response rules. However, the current fairness/objective values are derived +from the processor's ServiceAccount token — not the original user. + +The `fairness` override in the authentication section is set to `{{.Issuer}}` and the +`objective` override uses a CEL expression that extracts the namespace from ServiceAccount +usernames. For processor requests, this would produce the processor's namespace rather than +the original user's context. + +**Options**: + +- **Batch-class fairness**: accept this behavior — all batch requests get grouped under the + processor's identity, forming a single fairness bucket separate from interactive traffic. + This is the simplest approach and gives operators a clear knob to control batch vs + interactive priority via `InferenceObjective`. +- **Per-user fairness**: override the fairness/objective values for delegated requests using + the forwarded headers. This requires additional response header rules gated on the + processor's SA identity, similar to the delegated authorization rule. +- ... a few others + +## Full rendered AuthPolicy example + +Below is the complete AuthPolicy as it would appear in the cluster, based on the existing +`openshift-ai-inference-authn` policy with all batch changes applied. Template variables are +rendered with example values. + +```yaml +apiVersion: kuadrant.io/v1 +kind: AuthPolicy +metadata: + name: openshift-ai-inference-authn + namespace: openshift-ingress +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: openshift-ai-inference + rules: + authentication: + kubernetes-user: + kubernetesTokenReview: + audiences: + - https://kubernetes.default.svc + overrides: + fairness: + value: "https://kubernetes.default.svc" + objective: + expression: "auth.identity.user.username.startsWith('system:serviceaccount:') ? auth.identity.user.username.split(':')[2] : 'authenticated'" + + authorization: + # Rule 1: base inference access — forwarded user (or caller) must have get + inference-access: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + kubernetesSubjectAccessReview: + user: + expression: "'x-maas-user' in request.headers ? request.headers['x-maas-user'] : auth.identity.user.username" + authorizationGroups: + expression: "'x-maas-user' in request.headers ? ('x-maas-groups' in request.headers ? request.headers['x-maas-groups'].split(',') : []) : auth.identity.user.groups" + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: get + priority: 1 + + # Rule 2: delegate access — caller must have post-delegate (only when x-maas-user present) + inference-access-delegate: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + - predicate: "'x-maas-user' in request.headers" + kubernetesSubjectAccessReview: + user: + expression: "auth.identity.user.username" + authorizationGroups: + expression: "auth.identity.user.groups" + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices/delegate + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: post-delegate + priority: 1 + + response: + success: + headers: + # Flow control headers + x-gateway-inference-fairness-id: + metrics: false + plain: + expression: auth.identity.fairness + priority: 0 + x-gateway-inference-objective: + metrics: false + plain: + expression: auth.identity.objective + priority: 0 + + # Batch headers — only for batch paths + x-maas-user: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: auth.identity.user.username + priority: 0 + x-maas-groups: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: "auth.identity.user.groups.join(',')" + priority: 0 +``` + +**Example values used above**: + +| Template variable | Rendered value | +|-------------------------------------|----------------------------------------------------------------------------------------------------------------------------------| +| `{{.Name}}` | `openshift-ai-inference-authn` | +| `{{.GatewayNamespace}}` | `openshift-ingress` | +| `{{.GatewayName}}` | `openshift-ai-inference` | +| `{{.AudiencesJSON}}` | `["https://kubernetes.default.svc"]` | +| `{{.Issuer}}` | `https://kubernetes.default.svc` | +| `{{.ObjectiveExpression}}` | `auth.identity.user.username.startsWith('system:serviceaccount:') ? auth.identity.user.username.split(':')[2] : 'authenticated'` | + +## Testing + +### Test infrastructure: echo service + +Deploy an echo server behind the gateway to verify Authorino behavior. The echo server +reflects request headers and body, making it easy to inspect what Authorino injected. + +All test identities are ServiceAccounts with specific RBAC — no dependency on logged-in +user tokens or `oc whoami -t`. + +#### ServiceAccounts + +| SA | Namespace | Purpose | RBAC | +|----------------------|----------------|-----------------------------------------------------------|----------------------------------------------------------------------------------------------| +| `test-user` | `echo-service` | Regular user with standard inference access only | `get llminferenceservices` in `echo-service` | +| `test-user-delegate` | `echo-service` | Batch processor SA (trusted delegator + inference access) | `get llminferenceservices` + `post-delegate llminferenceservices/delegate` in `echo-service` | + +#### Test scenarios + +| # | Scenario | SA token | Path | Extra headers | Expected SARs | Expected | +|---|-----------------------------------------------------------------|----------------------|---------------------------------|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------| +| 1 | SA → batch path | `test-user` | `/v1/batches` | — | Skipped (batch path) | 200, `x-maas-user` and `x-maas-groups` injected | +| 2 | SA → inference path (standard) | `test-user` | `/echo-service/echo-server/...` | — | Rule 1: `get llminferenceservices` for `test-user` | 200 | +| 3 | Delegated: caller has `post-delegate`, forwarded user has `get` | `test-user-delegate` | `/echo-service/echo-server/...` | `x-maas-user: ...test-user` | Rule 1: `get llminferenceservices` for `test-user` ✓ / Rule 2: `post-delegate llminferenceservices/delegate` for `test-user-delegate` ✓ | 200 | +| 4 | SA → batch path with spoofed header | `test-user` | `/v1/batches` | `x-maas-user: spoofed` | Skipped (batch path) | 200, Authorino appends real identity after spoofed value | +| 5 | Delegated: forwarded user has no RBAC | `test-user-delegate` | `/echo-service/echo-server/...` | `x-maas-user: nonexistent-user` | Rule 1: `get llminferenceservices` for `nonexistent-user` ✗ | 403 | +| 6 | Delegated: caller lacks `post-delegate` | `test-user` | `/echo-service/echo-server/...` | `x-maas-user: ...test-user-delegate` | Rule 1: `get llminferenceservices` for `test-user-delegate` ✓ / Rule 2: `post-delegate llminferenceservices/delegate` for `test-user` ✗ | 403 | + +#### gateway.yaml + +The GatewayClass and Gateway that the tests assume are already deployed. On OpenShift, the +`openshift-default` GatewayClass is provided by the platform. The Gateway must have Authorino +configured (AuthPolicy applied by odh-model-controller). + +```yaml +# Gateway infrastructure for Authorino-based authn/z testing. +# +# On OpenShift the GatewayClass already exists — only apply the Gateway. +# +# Apply: +# oc apply -f gateway.yaml +# +# --- GatewayClass (already exists on OCP — shown for reference) --- +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: openshift-default +spec: + controllerName: openshift.io/gateway-controller/v1 +--- +# --- Gateway --- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: openshift-ai-inference + namespace: openshift-ingress +spec: + gatewayClassName: openshift-default + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All +``` + +The listener should set `allowedRoutes.namespaces.from: All` to accept HTTPRoutes from any +namespace. Without this, the default is `Same` (only the gateway's own namespace), and +cross-namespace HTTPRoutes will be rejected with `NotAllowedByListeners`. + +The internal address of the gateway (for in-cluster testing) follows the OCP naming convention: + +``` +http://openshift-ai-inference-openshift-default.openshift-ingress.svc.cluster.local +``` + +The e2e tests discover this address automatically from the Gateway resource's +`.status.addresses[0].value`. + +#### echo-service.yaml + +```yaml +# Echo service + test ServiceAccounts for verifying Authorino batch authn/z. +# +# Apply: +# oc apply -f echo-service.yaml +# +# --- Namespace --- +apiVersion: v1 +kind: Namespace +metadata: + name: echo-service +--- +# --- ServiceAccounts --- +# Regular user — has standard inference access only +apiVersion: v1 +kind: ServiceAccount +metadata: + name: test-user + namespace: echo-service +--- +# User with delegate permission — eligible for batch inference +apiVersion: v1 +kind: ServiceAccount +metadata: + name: test-user-delegate + namespace: echo-service +--- +# --- Echo server --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: echo-server + namespace: echo-service + labels: + app.kubernetes.io/name: echo-server +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: echo-server + template: + metadata: + labels: + app.kubernetes.io/name: echo-server + spec: + containers: + - name: echo-server + image: ealen/echo-server:0.9.2 + ports: + - containerPort: 8080 + protocol: TCP + env: + - name: PORT + value: "8080" + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: echo-server + namespace: echo-service +spec: + selector: + app.kubernetes.io/name: echo-server + ports: + - port: 80 + targetPort: 8080 + protocol: TCP +--- +# --- HTTPRoutes --- +# Route batch API paths to the echo server +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: echo-server-batch + namespace: echo-service +spec: + parentRefs: + - name: openshift-ai-inference + namespace: openshift-ingress + rules: + - matches: + - path: + type: PathPrefix + value: /v1/batches + backendRefs: + - name: echo-server + port: 80 + - matches: + - path: + type: PathPrefix + value: /v1/files + backendRefs: + - name: echo-server + port: 80 +--- +# Route a fake inference path to the echo server for testing the +# conditional authorization rule (standard vs delegated SAR). +# Path format: /echo-service/echo-server/... so the SAR extracts +# namespace=echo-service, name=echo-server from the path. +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: echo-server-inference + namespace: echo-service +spec: + parentRefs: + - name: openshift-ai-inference + namespace: openshift-ingress + rules: + - matches: + - path: + type: PathPrefix + value: /echo-service/echo-server + backendRefs: + - name: echo-server + port: 80 +``` + +#### echo-service-rbac.yaml + +RBAC resources for the test ServiceAccounts. Two ClusterRoles define the two +permission levels; RoleBindings in `echo-service` namespace grant them to specific SAs. + +```yaml +# RBAC for testing the conditional authorization rule. +# +# Apply: +# oc apply -f echo-service-rbac.yaml +# +# --- ClusterRoles --- +# Standard inference access: get llminferenceservices +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llminferenceservices-reader +rules: + - apiGroups: ["serving.kserve.io"] + resources: ["llminferenceservices"] + verbs: ["get"] +--- +# Delegated inference access: post-delegate llminferenceservices/delegate +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llminferenceservices-delegate +rules: + - apiGroups: ["serving.kserve.io"] + resources: ["llminferenceservices/delegate"] + verbs: ["post-delegate"] +--- +# --- RoleBindings for test-user (standard access only) --- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: test-user-inference-access + namespace: echo-service +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llminferenceservices-reader +subjects: + - kind: ServiceAccount + name: test-user + namespace: echo-service +--- +# --- RoleBindings for test-user-delegate (standard + delegate access) --- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: test-user-delegate-inference-access + namespace: echo-service +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llminferenceservices-reader +subjects: + - kind: ServiceAccount + name: test-user-delegate + namespace: echo-service +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: test-user-delegate-delegate-access + namespace: echo-service +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llminferenceservices-delegate +subjects: + - kind: ServiceAccount + name: test-user-delegate + namespace: echo-service +``` + +#### Test commands + +```bash +# Gateway internal address (from dnsutils pod in default namespace) +GW=http://openshift-ai-inference-openshift-default.openshift-ingress.svc.cluster.local + +# --- Create tokens --- +TEST_USER_TOKEN=$(oc create token test-user -n echo-service) +TEST_USER_DELEGATE_TOKEN=$(oc create token test-user-delegate -n echo-service) + +# Scenario 1: SA → batch path (authn only, headers injected) +# Expected: 200, echo shows x-maas-user = system:serviceaccount:echo-service:test-user +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"hello":"world"}' \ + $GW/v1/batches | jq + +# Scenario 2: SA → inference path (standard SAR: get llminferenceservices) +# Expected: 200 (test-user has get llminferenceservices in echo-service) +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"hello":"world"}' \ + $GW/echo-service/echo-server/v1/chat/completions | jq + +# Scenario 3: Delegated — caller (test-user-delegate) has post-delegate, forwarded user (test-user) has get +# Expected: 200 (rule 1: test-user has get, rule 2: test-user-delegate has post-delegate) +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_DELEGATE_TOKEN" \ + -H "Content-Type: application/json" \ + -H "x-maas-user: system:serviceaccount:echo-service:test-user" \ + -H "x-maas-groups: system:serviceaccounts,system:serviceaccounts:echo-service,system:authenticated" \ + -d '{"hello":"world"}' \ + $GW/echo-service/echo-server/v1/chat/completions | jq + +# Scenario 4: SA → batch path with spoofed x-maas-user (batch path — authz skipped) +# Expected: 200, Authorino appends real identity after spoofed value +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_TOKEN" \ + -H "Content-Type: application/json" \ + -H "x-maas-user: spoofed-user" \ + -d '{"hello":"world"}' \ + $GW/v1/batches | jq + +# Scenario 5: Delegated — forwarded user has no RBAC (should fail) +# Expected: 403 (rule 1: nonexistent-user has no get access) +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_DELEGATE_TOKEN" \ + -H "Content-Type: application/json" \ + -H "x-maas-user: nonexistent-user" \ + -d '{"hello":"world"}' \ + $GW/echo-service/echo-server/v1/chat/completions | jq + +# Scenario 6: Delegated — caller lacks post-delegate (should fail) +# Expected: 403 (rule 2: test-user has no post-delegate llminferenceservices/delegate) +oc exec -n default dnsutils -- curl -vk -s \ + -H "Authorization: Bearer $TEST_USER_TOKEN" \ + -H "Content-Type: application/json" \ + -H "x-maas-user: system:serviceaccount:echo-service:test-user-delegate" \ + -H "x-maas-groups: system:serviceaccounts,system:serviceaccounts:echo-service,system:authenticated" \ + -d '{"hello":"world"}' \ + $GW/echo-service/echo-server/v1/chat/completions | jq +``` + + +#### What to verify + +**Scenario 1** (SA → batch path): Response should include injected headers. +Look for `x-maas-user: system:serviceaccount:echo-service:test-user` and `x-maas-groups` +with the SA's groups. No authorization check happens (batch path excluded). + +**Scenario 2** (SA → inference path, standard): Standard SAR fires — checks +`system:serviceaccount:echo-service:test-user` for `get llminferenceservices` resource +`echo-server` in namespace `echo-service`. Should succeed (RBAC granted). Response should +include flow control headers but NOT `x-maas-user`/`x-maas-groups`. + +**Scenario 3** (delegated, both checks pass): Two SARs fire. Rule 1 checks forwarded user +`test-user` for `get llminferenceservices` — passes. Rule 2 checks caller `test-user-delegate` +for `post-delegate llminferenceservices/delegate` — passes. Should succeed (200). + +**Scenario 4** (SA → batch path with spoofed header): Batch path is excluded from +authorization, so the spoofed header has no effect on authz. Authorino appends the real +identity after the spoofed value (upstream append bug). Echo shows both values. + +**Scenario 5** (delegated, forwarded user has no RBAC): Rule 1 checks forwarded user +`nonexistent-user` for `get llminferenceservices` — fails with 403. The forwarded user +must have base inference access. + +**Scenario 6** (delegated, caller lacks `post-delegate`): Rule 1 checks forwarded user +`test-user-delegate` for `get llminferenceservices` — passes. Rule 2 checks caller +`test-user` for `post-delegate llminferenceservices/delegate` — fails with 403 because +`test-user` is not a trusted delegator. This validates that header spoofing by a regular +user is blocked. diff --git a/internal/controller/resources/template/authpolicy_llm_isvc_userdefined.yaml b/internal/controller/resources/template/authpolicy_llm_isvc_userdefined.yaml index 2e509b0500..5d8b0ed7d6 100644 --- a/internal/controller/resources/template/authpolicy_llm_isvc_userdefined.yaml +++ b/internal/controller/resources/template/authpolicy_llm_isvc_userdefined.yaml @@ -25,11 +25,13 @@ spec: expression: "{{.ObjectiveExpression}}" authorization: inference-access: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" kubernetesSubjectAccessReview: user: - expression: auth.identity.user.username + expression: "'x-maas-user' in request.headers ? request.headers['x-maas-user'] : auth.identity.user.username" authorizationGroups: - expression: auth.identity.user.groups + expression: "'x-maas-user' in request.headers ? ('x-maas-groups' in request.headers ? request.headers['x-maas-groups'].split(',') : []) : auth.identity.user.groups" resourceAttributes: group: value: serving.kserve.io @@ -42,6 +44,27 @@ spec: verb: value: get priority: 1 + inference-access-delegate: + when: + - predicate: "!(request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/'))" + - predicate: "'x-maas-user' in request.headers" + kubernetesSubjectAccessReview: + user: + expression: "auth.identity.user.username" + authorizationGroups: + expression: "auth.identity.user.groups" + resourceAttributes: + group: + value: serving.kserve.io + resource: + value: llminferenceservices/delegate + namespace: + expression: request.path.split("/")[1] + name: + expression: request.path.split("/")[2] + verb: + value: post-delegate + priority: 1 response: success: headers: @@ -55,3 +78,15 @@ spec: plain: expression: auth.identity.objective priority: 0 + x-maas-user: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: auth.identity.user.username + priority: 0 + x-maas-groups: + when: + - predicate: "request.path == '/v1/files' || request.path.startsWith('/v1/files/') || request.path == '/v1/batches' || request.path.startsWith('/v1/batches/')" + plain: + expression: "auth.identity.user.groups.join(',')" + priority: 0 diff --git a/internal/controller/test/e2e/batch_authpolicy_test.go b/internal/controller/test/e2e/batch_authpolicy_test.go new file mode 100644 index 0000000000..aa2a3e4c94 --- /dev/null +++ b/internal/controller/test/e2e/batch_authpolicy_test.go @@ -0,0 +1,405 @@ +//go:build e2e + +package e2e + +import ( + "fmt" + "net/http" + "strings" + "testing" + + rbacv1 "k8s.io/api/rbac/v1" +) + +const ( + saTestUser = "test-user" + saTestDelegate = "test-user-delegate" +) + +// testFixture holds the shared per-test resources. +type testFixture struct { + ns string + testUserToken string + testDelegateToken string +} + +// setupFixture creates a namespace with an echo server, inference HTTPRoute, +// ServiceAccounts, and RBAC bindings needed by the batch AuthPolicy tests. +// +// Shared batch HTTPRoutes (/v1/batches, /v1/files) are created once in TestMain +// via batchEnv.setupSharedBatchRoutes(). +// +// The inference HTTPRoute (/{ns}/echo-server/...) is unique per namespace. +func setupFixture(t *testing.T) *testFixture { + t.Helper() + + ns := batchEnv.createNamespace(t, "e2e-batch", nil) + batchEnv.deployEchoServer(t, ns) + batchEnv.createHTTPRoute(t, ns, "echo-inference", fmt.Sprintf("/%s/echo-server", ns)) + + batchEnv.createServiceAccount(t, ns, saTestUser) + batchEnv.createServiceAccount(t, ns, saTestDelegate) + batchEnv.grantInferenceAccess(t, ns, ns, saTestUser) + batchEnv.grantInferenceAccess(t, ns, ns, saTestDelegate) + batchEnv.grantDelegateAccess(t, ns, ns, saTestDelegate) + + testUserToken := batchEnv.requestToken(t, ns, saTestUser) + testDelegateToken := batchEnv.requestToken(t, ns, saTestDelegate) + + batchEnv.waitForGatewayRoute(t, fmt.Sprintf("/%s/echo-server/test", ns), testUserToken) + t.Log("fixture setup complete") + + return &testFixture{ + ns: ns, + testUserToken: testUserToken, + testDelegateToken: testDelegateToken, + } +} + +// saIdentity returns the full ServiceAccount identity string. +func saIdentity(ns, name string) string { + return fmt.Sprintf("system:serviceaccount:%s:%s", ns, name) +} + +// TestBatchPathAuthnOnly verifies that a request to a batch path (/v1/batches) +// with a valid token succeeds (200) and receives the x-maas-user header injected +// by Authorino with the caller's ServiceAccount identity. +// +// Scenario 1 from BATCH.md: batch paths skip authorization, authn-only. +func TestBatchPathAuthnOnly(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + resp, body := batchEnv.gatewayGet(t, "/v1/batches", f.testUserToken, nil) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + // The echo server reflects request headers in the response body. + // Verify the x-maas-user header was injected by Authorino. + expectedUser := saIdentity(f.ns, saTestUser) + if !strings.Contains(string(body), expectedUser) { + t.Errorf("response body does not contain expected x-maas-user %q", expectedUser) + } +} + +// TestFilesPathAuthnOnly verifies that a request to the /v1/files batch path +// with a valid token succeeds (200) and receives the x-maas-user header injected +// by Authorino. This mirrors TestBatchPathAuthnOnly but for the /v1/files prefix. +func TestFilesPathAuthnOnly(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + resp, body := batchEnv.gatewayGet(t, "/v1/files", f.testUserToken, nil) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + expectedUser := saIdentity(f.ns, saTestUser) + if !strings.Contains(string(body), expectedUser) { + t.Errorf("response body does not contain expected x-maas-user %q", expectedUser) + } +} + +// TestFilesPathSpoofedHeader verifies that a request to /v1/files with a +// spoofed x-maas-user header still succeeds (200) because batch paths skip +// authorization. Mirrors TestBatchPathSpoofedHeader for the /v1/files prefix. +func TestFilesPathSpoofedHeader(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + headers := map[string]string{ + "x-maas-user": "spoofed-user", + } + resp, _ := batchEnv.gatewayGet(t, "/v1/files", f.testUserToken, headers) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +// TestNoTokenReturns401 verifies that a request without an Authorization header +// is rejected with 401 by the authentication layer. +func TestNoTokenReturns401(t *testing.T) { + t.Parallel() + + resp, _ := batchEnv.gatewayGet(t, "/v1/batches", "", nil) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401, got %d", resp.StatusCode) + } +} + +// TestInferencePathNoRBAC verifies that a request to an inference path +// with a valid token but no inference RBAC (get llminferenceservices) is +// rejected with 403. +func TestInferencePathNoRBAC(t *testing.T) { + t.Parallel() + + ns := batchEnv.createNamespace(t, "e2e-batch", nil) + batchEnv.deployEchoServer(t, ns) + batchEnv.createHTTPRoute(t, ns, "echo-inference", fmt.Sprintf("/%s/echo-server", ns)) + + // Create SA with no RBAC at all. + batchEnv.createServiceAccount(t, ns, "no-rbac-user") + token := batchEnv.requestToken(t, ns, "no-rbac-user") + + batchEnv.waitForGatewayRoute(t, fmt.Sprintf("/%s/echo-server/test", ns), token) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", ns) + resp, _ := batchEnv.gatewayGet(t, path, token, nil) + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403, got %d", resp.StatusCode) + } +} + +// TestInferencePathStandardSAR verifies that a request to an inference path +// with a token for a SA that has `get llminferenceservices` succeeds (200). +// +// Scenario 2 from BATCH.md: standard SAR on inference path. +func TestInferencePathStandardSAR(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + resp, _ := batchEnv.gatewayGet(t, path, f.testUserToken, nil) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +// TestInferencePathDelegatedSAR verifies that a delegated request succeeds when +// the caller has `post-delegate llminferenceservices/delegate` (rule 2) and the +// forwarded user has `get llminferenceservices` (rule 1). +// +// Scenario 3 from BATCH.md: caller has post-delegate, forwarded user has get. +func TestInferencePathDelegatedSAR(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + headers := map[string]string{ + "x-maas-user": saIdentity(f.ns, saTestUser), + } + // Caller is test-user-delegate (has post-delegate), forwarded user is test-user (has get). + resp, _ := batchEnv.gatewayGet(t, path, f.testDelegateToken, headers) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +// TestBatchPathSpoofedHeader verifies that a request to a batch path with a +// spoofed x-maas-user header still succeeds (200) because batch paths skip +// authorization entirely. +// +// Scenario 4 from BATCH.md: batch path skips authz, spoofed header ignored. +func TestBatchPathSpoofedHeader(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + headers := map[string]string{ + "x-maas-user": "spoofed-user", + } + resp, _ := batchEnv.gatewayGet(t, "/v1/batches", f.testUserToken, headers) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +// TestInferencePathSpoofedNoRBAC verifies that a delegated request fails with +// 403 when the forwarded user has no RBAC (rule 1 fails: no get access). +// The caller has post-delegate (rule 2 would pass), isolating the rule 1 failure. +// +// Scenario 5 from BATCH.md: forwarded user has no RBAC. +func TestInferencePathSpoofedNoRBAC(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + headers := map[string]string{ + "x-maas-user": "nonexistent-user", + } + // Caller is test-user-delegate (has post-delegate), forwarded user has no RBAC. + resp, _ := batchEnv.gatewayGet(t, path, f.testDelegateToken, headers) + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403, got %d", resp.StatusCode) + } +} + +// TestInferencePathDelegatedNoDelegate verifies that a delegated request fails +// with 403 when the caller lacks `post-delegate llminferenceservices/delegate` +// (rule 2 fails). The forwarded user has get access (rule 1 would pass), +// isolating the rule 2 failure. This validates that header spoofing by a +// regular user is blocked. +// +// Scenario 6 from BATCH.md: caller lacks post-delegate. +func TestInferencePathDelegatedNoDelegate(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + headers := map[string]string{ + "x-maas-user": saIdentity(f.ns, saTestDelegate), + } + // Caller is test-user (no post-delegate), forwarded user is test-user-delegate (has get). + resp, _ := batchEnv.gatewayGet(t, path, f.testUserToken, headers) + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403, got %d", resp.StatusCode) + } +} + +// TestInferencePathDelegateVerbWrongResource verifies that a caller with +// post-delegate on llminferenceservices (without the /delegate subresource) +// does NOT pass rule 2. The SAR requires post-delegate on +// llminferenceservices/delegate specifically. +func TestInferencePathDelegateVerbWrongResource(t *testing.T) { + t.Parallel() + + ns := batchEnv.createNamespace(t, "e2e-batch", nil) + batchEnv.deployEchoServer(t, ns) + batchEnv.createHTTPRoute(t, ns, "echo-inference", fmt.Sprintf("/%s/echo-server", ns)) + + // Create caller SA with post-delegate on llminferenceservices (wrong resource — + // should be llminferenceservices/delegate). + const saCaller = "wrong-resource-caller" + batchEnv.createServiceAccount(t, ns, saCaller) + batchEnv.grantInferenceAccess(t, ns, ns, saCaller) + batchEnv.grantAccess(t, ns, ns, saCaller, saCaller+"-wrong-resource", []rbacv1.PolicyRule{{ + APIGroups: []string{"serving.kserve.io"}, + Resources: []string{"llminferenceservices"}, + Verbs: []string{"post-delegate"}, + }}) + callerToken := batchEnv.requestToken(t, ns, saCaller) + + // Create forwarded user SA with standard inference access (rule 1 passes). + const saForwarded = "forwarded-user" + batchEnv.createServiceAccount(t, ns, saForwarded) + batchEnv.grantInferenceAccess(t, ns, ns, saForwarded) + + batchEnv.waitForGatewayRoute(t, fmt.Sprintf("/%s/echo-server/test", ns), callerToken) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", ns) + headers := map[string]string{ + "x-maas-user": saIdentity(ns, saForwarded), + } + // Rule 1: forwarded-user has get → pass. Rule 2: caller has post-delegate on + // wrong resource (llminferenceservices, not llminferenceservices/delegate) → fail. + resp, _ := batchEnv.gatewayGet(t, path, callerToken, headers) + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403, got %d", resp.StatusCode) + } +} + +// TestInferencePathNoToken verifies that a request to an inference path +// without an Authorization header is rejected with 401. +func TestInferencePathNoToken(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + resp, _ := batchEnv.gatewayGet(t, path, "", nil) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401, got %d", resp.StatusCode) + } +} + +// TestDelegatedForwardedUserDelegateOnlyNoGet verifies that a delegated request +// fails when the forwarded user has post-delegate on llminferenceservices/delegate +// but lacks get on llminferenceservices. Rule 1 checks the forwarded user for get +// and should reject. +func TestDelegatedForwardedUserDelegateOnlyNoGet(t *testing.T) { + t.Parallel() + + ns := batchEnv.createNamespace(t, "e2e-batch", nil) + batchEnv.deployEchoServer(t, ns) + batchEnv.createHTTPRoute(t, ns, "echo-inference", fmt.Sprintf("/%s/echo-server", ns)) + + // Caller SA with both get and post-delegate (rule 2 passes). + const saCaller = "delegate-caller" + batchEnv.createServiceAccount(t, ns, saCaller) + batchEnv.grantInferenceAccess(t, ns, ns, saCaller) + batchEnv.grantDelegateAccess(t, ns, ns, saCaller) + callerToken := batchEnv.requestToken(t, ns, saCaller) + + // Forwarded user SA with post-delegate but NO get (rule 1 fails). + const saForwarded = "delegate-only-user" + batchEnv.createServiceAccount(t, ns, saForwarded) + batchEnv.grantDelegateAccess(t, ns, ns, saForwarded) + + batchEnv.waitForGatewayRoute(t, fmt.Sprintf("/%s/echo-server/test", ns), callerToken) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", ns) + headers := map[string]string{ + "x-maas-user": saIdentity(ns, saForwarded), + } + resp, _ := batchEnv.gatewayGet(t, path, callerToken, headers) + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("expected 403, got %d", resp.StatusCode) + } +} + +// TestBatchSubpathAuthnOnly verifies that deeper batch subpaths like +// /v1/batches/batch_123 and /v1/files/file_abc/content also skip authorization +// (the when predicate uses startsWith). +func TestBatchSubpathAuthnOnly(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + subpaths := []string{ + "/v1/batches/batch_123", + "/v1/batches/batch_456/cancel", + "/v1/files/file_abc", + "/v1/files/file_abc/content", + } + for _, sp := range subpaths { + t.Run(sp, func(t *testing.T) { + resp, _ := batchEnv.gatewayGet(t, sp, f.testUserToken, nil) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + }) + } +} + +// TestDelegatedSelfDelegation verifies that a caller setting x-maas-user to +// their own identity requires both get (rule 1) and post-delegate (rule 2). +// test-user-delegate has both, so this should succeed. +func TestDelegatedSelfDelegation(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + headers := map[string]string{ + "x-maas-user": saIdentity(f.ns, saTestDelegate), + } + // Caller is test-user-delegate, forwarded user is also test-user-delegate. + // Rule 1: forwarded user (test-user-delegate) has get → pass. + // Rule 2: caller (test-user-delegate) has post-delegate → pass. + resp, _ := batchEnv.gatewayGet(t, path, f.testDelegateToken, headers) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } +} + +// TestInferencePathNoBatchHeaders verifies that inference path responses do NOT +// include x-maas-user or x-maas-groups headers. These headers are only injected +// for batch paths (scoped by when predicates in the response section). +func TestInferencePathNoBatchHeaders(t *testing.T) { + t.Parallel() + f := setupFixture(t) + + path := fmt.Sprintf("/%s/echo-server/v1/chat/completions", f.ns) + resp, body := batchEnv.gatewayGet(t, path, f.testUserToken, nil) + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.StatusCode) + } + + // The echo server reflects all request headers in the response body. + // x-maas-user and x-maas-groups should NOT appear because the response + // header injection is scoped to batch paths only. + bodyStr := string(body) + if strings.Contains(bodyStr, "x-maas-user") { + t.Errorf("inference path response should not contain x-maas-user header") + } + if strings.Contains(bodyStr, "x-maas-groups") { + t.Errorf("inference path response should not contain x-maas-groups header") + } +} diff --git a/internal/controller/test/e2e/batch_authpolicy_test_env.go b/internal/controller/test/e2e/batch_authpolicy_test_env.go new file mode 100644 index 0000000000..7b9e810bec --- /dev/null +++ b/internal/controller/test/e2e/batch_authpolicy_test_env.go @@ -0,0 +1,941 @@ +//go:build e2e + +package e2e + +import ( + "context" + "crypto/tls" + "fmt" + "io" + "log" + "net" + "net/http" + "os" + "strings" + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8slabels "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/portforward" + "k8s.io/client-go/transport/spdy" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" + + "github.com/opendatahub-io/odh-model-controller/internal/controller/constants" +) + +const ( + pollInterval = time.Second + pollTimeout = 60 * time.Second + requestRetries = 3 + retryDelay = 2 * time.Second + + maxResponseBodyBytes = 1 << 20 // 1 MiB + + defaultEchoServerImage = "docker.io/ealen/echo-server:0.9.2@sha256:006b92e1d6682442e29d1d0d73c34a61166d42f8a5bf1ea10c318f07ad4790e2" +) + +// batchTestEnv holds shared clients and configuration for e2e tests. +type batchTestEnv struct { + gatewayURL string + gatewayName string + gatewayNamespace string + echoServerImage string + clientset *kubernetes.Clientset + k8sClient client.Client + httpClient *http.Client + portForwardStop chan struct{} + // sharedBatchNS is the namespace holding shared batch HTTPRoutes (/v1/batches, /v1/files). + // Created during init, cleaned up in close(). + sharedBatchNS string +} + +// close cleans up resources held by batchTestEnv, such as port-forward tunnels +// and shared namespaces. +func (e *batchTestEnv) close() { + if e.sharedBatchNS != "" { + log.Printf("deleting shared batch namespace %s", e.sharedBatchNS) + _ = e.clientset.CoreV1().Namespaces().Delete(context.Background(), e.sharedBatchNS, metav1.DeleteOptions{}) + } + if e.portForwardStop != nil { + log.Println("stopping gateway port-forward tunnel") + close(e.portForwardStop) + } +} + +// newTestEnv reads environment variables, creates Kubernetes clients, discovers +// the gateway URL from the Gateway resource status, and returns a ready-to-use batchTestEnv. +// If the gateway address is cluster-internal, a port-forward tunnel is set up +// automatically. Call close() when done to release resources. +func newTestEnv() (*batchTestEnv, error) { + log.Println("initializing e2e test environment") + + gwName := os.Getenv("GATEWAY_NAME") + if gwName == "" { + gwName = constants.DefaultGatewayName + } + + gwNamespace := os.Getenv("GATEWAY_NAMESPACE") + if gwNamespace == "" { + gwNamespace = constants.DefaultGatewayNamespace + } + + echoImage := os.Getenv("ECHO_SERVER_IMAGE") + if echoImage == "" { + echoImage = defaultEchoServerImage + } + + log.Printf("gateway: %s/%s, echo image: %s", gwNamespace, gwName, echoImage) + + log.Println("creating kubernetes clients...") + cfg := ctrl.GetConfigOrDie() + + clientset, err := kubernetes.NewForConfig(cfg) + if err != nil { + return nil, fmt.Errorf("create kubernetes clientset: %w", err) + } + + scheme := runtime.NewScheme() + utilruntime.Must(gatewayapiv1.Install(scheme)) + utilruntime.Must(corev1.AddToScheme(scheme)) + utilruntime.Must(appsv1.AddToScheme(scheme)) + + k8sClient, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("create controller-runtime client: %w", err) + } + log.Println("kubernetes clients ready") + + httpClient := &http.Client{ + Timeout: 10 * time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + MinVersion: tls.VersionTLS12, + InsecureSkipVerify: true, //nolint:gosec // e2e tests use self-signed certs + }, + }, + } + + // Discover gateway address from the Gateway resource status. + log.Printf("discovering gateway address from %s/%s status...", gwNamespace, gwName) + gw := &gatewayapiv1.Gateway{} + if err := k8sClient.Get(context.Background(), client.ObjectKey{ + Name: gwName, + Namespace: gwNamespace, + }, gw); err != nil { + return nil, fmt.Errorf("get gateway %s/%s: %w", gwNamespace, gwName, err) + } + + if len(gw.Status.Addresses) == 0 { + return nil, fmt.Errorf("gateway %s/%s has no addresses in status", gwNamespace, gwName) + } + + gwAddr := gw.Status.Addresses[0].Value + gwPort := gatewayListenerPort(gw) + log.Printf("gateway address: %s, listener port: %d", gwAddr, gwPort) + + var gatewayURL string + var portForwardStop chan struct{} + + if isInternalAddress(gwAddr) { + log.Printf("gateway address %s is cluster-internal, setting up port-forward...", gwAddr) + localPort, stopCh, pfErr := portForwardToGateway(cfg, clientset, gwNamespace, gwAddr, gwPort) + if pfErr != nil { + return nil, fmt.Errorf("gateway address %s is cluster-internal, port-forward failed: %w", gwAddr, pfErr) + } + portForwardStop = stopCh + gatewayURL = fmt.Sprintf("http://localhost:%d", localPort) + log.Printf("port-forward established: %s -> localhost:%d", gwAddr, localPort) + } else { + log.Printf("gateway address %s is externally reachable, using directly", gwAddr) + if gwPort == 80 { + gatewayURL = "http://" + gwAddr + } else { + gatewayURL = fmt.Sprintf("http://%s", net.JoinHostPort(gwAddr, fmt.Sprintf("%d", gwPort))) + } + } + + log.Printf("e2e environment ready, gateway URL: %s", gatewayURL) + + env := &batchTestEnv{ + gatewayURL: gatewayURL, + gatewayName: gwName, + gatewayNamespace: gwNamespace, + echoServerImage: echoImage, + clientset: clientset, + k8sClient: k8sClient, + httpClient: httpClient, + portForwardStop: portForwardStop, + } + + if err := env.setupSharedBatchRoutes(); err != nil { + env.close() + return nil, fmt.Errorf("setup shared batch routes: %w", err) + } + + return env, nil +} + +// setupSharedBatchRoutes creates the shared namespace, echo server, and +// HTTPRoutes for /v1/batches and /v1/files. Called once during env init +// (not bound to any test's lifecycle). +func (e *batchTestEnv) setupSharedBatchRoutes() error { + log.Println("setting up shared batch routes...") + + ctx := context.Background() + + // Create namespace. + ns, err := e.clientset.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "e2e-batch-shared-", + Labels: map[string]string{"batchTestEnv": "odh-test"}, + }, + }, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("create shared batch namespace: %w", err) + } + e.sharedBatchNS = ns.Name + log.Printf("created shared batch namespace %s", ns.Name) + + // Deploy echo server (inline — can't use t-based helpers here). + labels := map[string]string{"app.kubernetes.io/name": "echo-server"} + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "echo-server", Namespace: ns.Name, Labels: labels}, + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.To[int32](1), + Selector: &metav1.LabelSelector{MatchLabels: labels}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "echo-server", + Image: e.echoServerImage, + Ports: []corev1.ContainerPort{{ContainerPort: 8080, Protocol: corev1.ProtocolTCP}}, + Env: []corev1.EnvVar{{Name: "PORT", Value: "8080"}}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("200m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }}, + }, + }, + }, + } + if _, err := e.clientset.AppsV1().Deployments(ns.Name).Create(ctx, dep, metav1.CreateOptions{}); err != nil { + return fmt.Errorf("create echo-server deployment: %w", err) + } + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "echo-server", Namespace: ns.Name}, + Spec: corev1.ServiceSpec{ + Selector: labels, + Ports: []corev1.ServicePort{{Port: 80, TargetPort: intstr.FromInt32(8080), Protocol: corev1.ProtocolTCP}}, + }, + } + if _, err := e.clientset.CoreV1().Services(ns.Name).Create(ctx, svc, metav1.CreateOptions{}); err != nil { + return fmt.Errorf("create echo-server service: %w", err) + } + + log.Printf("waiting for echo server in %s to become ready...", ns.Name) + if err := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + d, getErr := e.clientset.AppsV1().Deployments(ns.Name).Get(ctx, "echo-server", metav1.GetOptions{}) + if getErr != nil { + return false, nil + } + return d.Status.ReadyReplicas >= 1, nil + }); err != nil { + return fmt.Errorf("echo-server not ready in %s: %w", ns.Name, err) + } + + // Create HTTPRoutes for batch paths. + for _, r := range []struct{ name, path string }{ + {"batch-routes-batches", "/v1/batches"}, + {"batch-routes-files", "/v1/files"}, + } { + if err := e.createHTTPRouteRaw(ctx, ns.Name, r.name, r.path); err != nil { + return err + } + } + + // Verify route reachability with a temporary token. + sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: "route-checker", Namespace: ns.Name}} + if _, err := e.clientset.CoreV1().ServiceAccounts(ns.Name).Create(ctx, sa, metav1.CreateOptions{}); err != nil { + return fmt.Errorf("create route-checker SA: %w", err) + } + expSeconds := int64(3600) + tokenResult, err := e.clientset.CoreV1().ServiceAccounts(ns.Name).CreateToken(ctx, "route-checker", + &authenticationv1.TokenRequest{Spec: authenticationv1.TokenRequestSpec{ExpirationSeconds: &expSeconds}}, + metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("request route-checker token: %w", err) + } + + log.Println("waiting for batch routes to become reachable...") + if err := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + resp, _, reqErr := e.gatewayDo("/v1/batches", tokenResult.Status.Token, nil) + if reqErr != nil { + return false, nil + } + return resp.StatusCode != http.StatusBadGateway && resp.StatusCode != http.StatusServiceUnavailable, nil + }); err != nil { + return fmt.Errorf("batch routes not reachable: %w", err) + } + + log.Println("shared batch routes ready") + return nil +} + +// createHTTPRouteRaw creates an HTTPRoute without test helpers (for use in init). +func (e *batchTestEnv) createHTTPRouteRaw(ctx context.Context, ns, name, pathPrefix string) error { + log.Printf("creating HTTPRoute %s/%s for path prefix %s", ns, name, pathPrefix) + gwNS := gatewayapiv1.Namespace(e.gatewayNamespace) + route := &gatewayapiv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: gatewayapiv1.HTTPRouteSpec{ + CommonRouteSpec: gatewayapiv1.CommonRouteSpec{ + ParentRefs: []gatewayapiv1.ParentReference{{ + Name: gatewayapiv1.ObjectName(e.gatewayName), + Namespace: &gwNS, + }}, + }, + Rules: []gatewayapiv1.HTTPRouteRule{{ + Matches: []gatewayapiv1.HTTPRouteMatch{{ + Path: &gatewayapiv1.HTTPPathMatch{ + Type: ptr.To(gatewayapiv1.PathMatchPathPrefix), + Value: ptr.To(pathPrefix), + }, + }}, + BackendRefs: []gatewayapiv1.HTTPBackendRef{{ + BackendRef: gatewayapiv1.BackendRef{ + BackendObjectReference: gatewayapiv1.BackendObjectReference{ + Name: "echo-server", + Port: ptr.To(gatewayapiv1.PortNumber(80)), + }, + }, + }}, + }}, + }, + } + if err := e.k8sClient.Create(ctx, route); err != nil { + return fmt.Errorf("create HTTPRoute %s/%s: %w", ns, name, err) + } + + // Wait for AuthPolicy enforcement. + log.Printf("waiting for AuthPolicy to be enforced on HTTPRoute %s/%s...", ns, name) + if err := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + r := &gatewayapiv1.HTTPRoute{} + if getErr := e.k8sClient.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, r); getErr != nil { + return false, nil + } + for _, parent := range r.Status.RouteStatus.Parents { + if parent.ControllerName != "kuadrant.io/policy-controller" { + continue + } + for _, cond := range parent.Conditions { + if cond.Type == "kuadrant.io/AuthPolicyAffected" && cond.Status == metav1.ConditionTrue { + return true, nil + } + } + } + return false, nil + }); err != nil { + return fmt.Errorf("AuthPolicy not enforced on HTTPRoute %s/%s: %w", ns, name, err) + } + log.Printf("AuthPolicy enforced on HTTPRoute %s/%s", ns, name) + return nil +} + +// isInternalAddress returns true if the address is cluster-internal +// (a .svc.cluster.local hostname or a private IP address). +func isInternalAddress(addr string) bool { + if strings.HasSuffix(addr, ".svc.cluster.local") || strings.HasSuffix(addr, ".svc") { + return true + } + if ip := net.ParseIP(addr); ip != nil { + return ip.IsPrivate() || ip.IsLinkLocalUnicast() + } + return false +} + +// gatewayListenerPort returns the port of the first HTTP/HTTPS/TCP listener, +// defaulting to 80 if none is found. +func gatewayListenerPort(gw *gatewayapiv1.Gateway) int32 { + for _, l := range gw.Spec.Listeners { + switch l.Protocol { + case gatewayapiv1.HTTPProtocolType, gatewayapiv1.HTTPSProtocolType, gatewayapiv1.TCPProtocolType: + return int32(l.Port) + } + } + return 80 +} + +// portForwardToGateway sets up a port-forward tunnel to a pod backing the +// gateway. It returns the local port, a stop channel (close to tear down), +// and any error. +func portForwardToGateway(cfg *rest.Config, clientset *kubernetes.Clientset, gwNS, gwAddr string, gwPort int32) (uint16, chan struct{}, error) { + ctx := context.Background() + + // Find the Service backing the gateway. + log.Printf("looking up service for gateway address %s in namespace %s...", gwAddr, gwNS) + svc, err := findGatewayService(clientset, gwNS, gwAddr) + if err != nil { + return 0, nil, err + } + log.Printf("found gateway service %s/%s", svc.Namespace, svc.Name) + + // Resolve the target (container) port for port-forwarding. + remotePort := gwPort + for _, sp := range svc.Spec.Ports { + if sp.Port == gwPort && sp.TargetPort.IntValue() > 0 { + remotePort = int32(sp.TargetPort.IntValue()) + break + } + } + log.Printf("resolved target port: service port %d -> container port %d", gwPort, remotePort) + + // Find a running pod matching the service selector. + selector := k8slabels.SelectorFromSet(svc.Spec.Selector) + log.Printf("finding running pod with selector %s...", selector.String()) + podList, err := clientset.CoreV1().Pods(svc.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + if err != nil { + return 0, nil, fmt.Errorf("list pods for service %s/%s: %w", svc.Namespace, svc.Name, err) + } + + var podName string + for _, pod := range podList.Items { + if pod.Status.Phase == corev1.PodRunning { + podName = pod.Name + break + } + } + if podName == "" { + return 0, nil, fmt.Errorf("no running pod found for service %s/%s", svc.Namespace, svc.Name) + } + log.Printf("selected pod %s/%s for port-forward", svc.Namespace, podName) + + // Set up SPDY transport for port-forward. + log.Printf("starting port-forward to %s/%s:%d...", svc.Namespace, podName, remotePort) + transport, upgrader, err := spdy.RoundTripperFor(cfg) + if err != nil { + return 0, nil, fmt.Errorf("create SPDY round tripper: %w", err) + } + + req := clientset.CoreV1().RESTClient().Post(). + Resource("pods"). + Namespace(svc.Namespace). + Name(podName). + SubResource("portforward") + + dialer := spdy.NewDialer(upgrader, &http.Client{Transport: transport}, http.MethodPost, req.URL()) + + stopCh := make(chan struct{}, 1) + readyCh := make(chan struct{}) + + // Use port 0 to let the OS pick an available local port. + fw, err := portforward.New(dialer, []string{fmt.Sprintf("0:%d", remotePort)}, stopCh, readyCh, io.Discard, io.Discard) + if err != nil { + return 0, nil, fmt.Errorf("create port forwarder: %w", err) + } + + errCh := make(chan error, 1) + go func() { + errCh <- fw.ForwardPorts() + }() + + pfCtx, pfCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer pfCancel() + + select { + case <-readyCh: + case err := <-errCh: + close(stopCh) + return 0, nil, fmt.Errorf("port-forward failed: %w", err) + case <-pfCtx.Done(): + close(stopCh) + return 0, nil, fmt.Errorf("port-forward setup timed out") + } + + ports, err := fw.GetPorts() + if err != nil { + close(stopCh) + return 0, nil, fmt.Errorf("get forwarded ports: %w", err) + } + + log.Printf("port-forward ready: localhost:%d -> %s/%s:%d", ports[0].Local, svc.Namespace, podName, remotePort) + + return ports[0].Local, stopCh, nil +} + +// findGatewayService locates the Kubernetes Service backing the gateway. +// It first tries to parse a cluster-internal hostname, then falls back to +// matching by ClusterIP. +func findGatewayService(clientset *kubernetes.Clientset, gwNS, gwAddr string) (*corev1.Service, error) { + ctx := context.Background() + + // Strategy 1: Parse cluster hostname (e.g., "svc-name.namespace.svc.cluster.local"). + if idx := strings.Index(gwAddr, ".svc"); idx > 0 { + parts := strings.SplitN(gwAddr[:idx], ".", 2) + if len(parts) == 2 { + log.Printf("trying hostname lookup: service %s in namespace %s", parts[0], parts[1]) + svc, err := clientset.CoreV1().Services(parts[1]).Get(ctx, parts[0], metav1.GetOptions{}) + if err == nil { + log.Printf("found service via hostname: %s/%s", svc.Namespace, svc.Name) + return svc, nil + } + log.Printf("hostname lookup failed: %v, trying ClusterIP match...", err) + } + } + + // Strategy 2: Find Service by ClusterIP in the gateway namespace. + log.Printf("searching for service with ClusterIP %s in namespace %s...", gwAddr, gwNS) + svcList, err := clientset.CoreV1().Services(gwNS).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("list services in %s: %w", gwNS, err) + } + for i := range svcList.Items { + if svcList.Items[i].Spec.ClusterIP == gwAddr { + log.Printf("found service via ClusterIP in %s: %s", gwNS, svcList.Items[i].Name) + return &svcList.Items[i], nil + } + } + + return nil, fmt.Errorf("no service found for gateway address %s", gwAddr) +} + +// createNamespace creates a namespace with a generated name based on the given +// prefix. The namespace is deleted on test cleanup unless the test failed. +func (e *batchTestEnv) createNamespace(t *testing.T, prefix string, labels map[string]string) string { + t.Helper() + + merged := map[string]string{"batchTestEnv": "odh-test"} + for k, v := range labels { + merged[k] = v + } + + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: prefix + "-", + Labels: merged, + }, + } + created, err := e.clientset.CoreV1().Namespaces().Create(context.Background(), ns, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create namespace %s: %v", prefix, err) + } + name := created.Name + t.Logf("created namespace %s", name) + t.Cleanup(func() { + if t.Failed() { + t.Logf("keeping namespace %s for debugging (test failed)", name) + return + } + t.Logf("deleting namespace %s", name) + if delErr := e.clientset.CoreV1().Namespaces().Delete(context.Background(), name, metav1.DeleteOptions{}); delErr != nil { + t.Logf("warning: failed to delete namespace %s: %v", name, delErr) + } + }) + return name +} + +// createServiceAccount creates a ServiceAccount in the given namespace. +func (e *batchTestEnv) createServiceAccount(t *testing.T, ns, name string) { + t.Helper() + t.Logf("creating service account %s/%s", ns, name) + sa := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + } + _, err := e.clientset.CoreV1().ServiceAccounts(ns).Create(context.Background(), sa, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create service account %s/%s: %v", ns, name, err) + } + t.Cleanup(func() { + _ = e.clientset.CoreV1().ServiceAccounts(ns).Delete(context.Background(), name, metav1.DeleteOptions{}) + }) +} + +// requestToken creates a short-lived token for the given ServiceAccount. +func (e *batchTestEnv) requestToken(t *testing.T, ns, saName string) string { + t.Helper() + t.Logf("requesting token for %s/%s", ns, saName) + expSeconds := int64(3600) + result, err := e.clientset.CoreV1().ServiceAccounts(ns).CreateToken( + context.Background(), + saName, + &authenticationv1.TokenRequest{ + Spec: authenticationv1.TokenRequestSpec{ + ExpirationSeconds: &expSeconds, + }, + }, + metav1.CreateOptions{}, + ) + if err != nil { + t.Fatalf("request token for %s/%s: %v", ns, saName, err) + } + return result.Status.Token +} + +// grantInferenceAccess creates a Role and RoleBinding in targetNS that grants +// the ServiceAccount permission to get llminferenceservices. +func (e *batchTestEnv) grantInferenceAccess(t *testing.T, targetNS, saNamespace, saName string) { + t.Helper() + e.grantAccess(t, targetNS, saNamespace, saName, + saName+"-inference-access", + []rbacv1.PolicyRule{{ + APIGroups: []string{"serving.kserve.io"}, + Resources: []string{"llminferenceservices"}, + Verbs: []string{"get"}, + }}, + ) +} + +// grantDelegateAccess creates a Role and RoleBinding in targetNS that grants +// the ServiceAccount permission to post-delegate llminferenceservices/delegate. +func (e *batchTestEnv) grantDelegateAccess(t *testing.T, targetNS, saNamespace, saName string) { + t.Helper() + e.grantAccess(t, targetNS, saNamespace, saName, + saName+"-delegate-access", + []rbacv1.PolicyRule{{ + APIGroups: []string{"serving.kserve.io"}, + Resources: []string{"llminferenceservices/delegate"}, + Verbs: []string{"post-delegate"}, + }}, + ) +} + +func (e *batchTestEnv) grantAccess(t *testing.T, targetNS, saNamespace, saName, roleName string, rules []rbacv1.PolicyRule) { + t.Helper() + t.Logf("granting RBAC %s to %s/%s in namespace %s", roleName, saNamespace, saName, targetNS) + + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: roleName, + Namespace: targetNS, + }, + Rules: rules, + } + _, err := e.clientset.RbacV1().Roles(targetNS).Create(context.Background(), role, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create role %s/%s: %v", targetNS, roleName, err) + } + t.Cleanup(func() { + _ = e.clientset.RbacV1().Roles(targetNS).Delete(context.Background(), roleName, metav1.DeleteOptions{}) + }) + + bindingName := roleName + "-binding" + binding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: bindingName, + Namespace: targetNS, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: roleName, + }, + Subjects: []rbacv1.Subject{{ + Kind: "ServiceAccount", + Name: saName, + Namespace: saNamespace, + }}, + } + _, err = e.clientset.RbacV1().RoleBindings(targetNS).Create(context.Background(), binding, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create role binding %s/%s: %v", targetNS, bindingName, err) + } + t.Cleanup(func() { + _ = e.clientset.RbacV1().RoleBindings(targetNS).Delete(context.Background(), bindingName, metav1.DeleteOptions{}) + }) +} + +// deployEchoServer creates an echo server Deployment and Service in the given +// namespace and waits for it to become ready. +func (e *batchTestEnv) deployEchoServer(t *testing.T, ns string) { + t.Helper() + t.Logf("deploying echo server in namespace %s (image: %s)", ns, e.echoServerImage) + + labels := map[string]string{"app.kubernetes.io/name": "echo-server"} + + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "echo-server", + Namespace: ns, + Labels: labels, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.To[int32](1), + Selector: &metav1.LabelSelector{MatchLabels: labels}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "echo-server", + Image: e.echoServerImage, + Ports: []corev1.ContainerPort{{ + ContainerPort: 8080, + Protocol: corev1.ProtocolTCP, + }}, + Env: []corev1.EnvVar{{ + Name: "PORT", + Value: "8080", + }}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("200m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }}, + }, + }, + }, + } + _, err := e.clientset.AppsV1().Deployments(ns).Create(context.Background(), dep, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create echo-server deployment in %s: %v", ns, err) + } + t.Cleanup(func() { + _ = e.clientset.AppsV1().Deployments(ns).Delete(context.Background(), "echo-server", metav1.DeleteOptions{}) + }) + + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "echo-server", + Namespace: ns, + }, + Spec: corev1.ServiceSpec{ + Selector: labels, + Ports: []corev1.ServicePort{{ + Port: 80, + TargetPort: intstr.FromInt32(8080), + Protocol: corev1.ProtocolTCP, + }}, + }, + } + _, err = e.clientset.CoreV1().Services(ns).Create(context.Background(), svc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create echo-server service in %s: %v", ns, err) + } + t.Cleanup(func() { + _ = e.clientset.CoreV1().Services(ns).Delete(context.Background(), "echo-server", metav1.DeleteOptions{}) + }) + + // Wait for deployment to be ready. + t.Logf("waiting for echo server deployment in %s to become ready...", ns) + err = wait.PollUntilContextTimeout(context.Background(), pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + d, getErr := e.clientset.AppsV1().Deployments(ns).Get(ctx, "echo-server", metav1.GetOptions{}) + if getErr != nil { + return false, nil + } + return d.Status.ReadyReplicas >= 1, nil + }) + if err != nil { + t.Fatalf("echo-server deployment in %s not ready: %v", ns, err) + } + t.Logf("echo server in %s is ready", ns) +} + +// createHTTPRoute creates an HTTPRoute pointing to the echo-server Service. +func (e *batchTestEnv) createHTTPRoute(t *testing.T, ns, name, pathPrefix string) { + t.Helper() + t.Logf("creating HTTPRoute %s/%s for path prefix %s", ns, name, pathPrefix) + + gwNS := gatewayapiv1.Namespace(e.gatewayNamespace) + route := &gatewayapiv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: gatewayapiv1.HTTPRouteSpec{ + CommonRouteSpec: gatewayapiv1.CommonRouteSpec{ + ParentRefs: []gatewayapiv1.ParentReference{{ + Name: gatewayapiv1.ObjectName(e.gatewayName), + Namespace: &gwNS, + }}, + }, + Rules: []gatewayapiv1.HTTPRouteRule{{ + Matches: []gatewayapiv1.HTTPRouteMatch{{ + Path: &gatewayapiv1.HTTPPathMatch{ + Type: ptr.To(gatewayapiv1.PathMatchPathPrefix), + Value: ptr.To(pathPrefix), + }, + }}, + BackendRefs: []gatewayapiv1.HTTPBackendRef{{ + BackendRef: gatewayapiv1.BackendRef{ + BackendObjectReference: gatewayapiv1.BackendObjectReference{ + Name: "echo-server", + Port: ptr.To(gatewayapiv1.PortNumber(80)), + }, + }, + }}, + }}, + }, + } + if err := e.k8sClient.Create(context.Background(), route); err != nil { + t.Fatalf("create HTTPRoute %s/%s: %v", ns, name, err) + } + t.Cleanup(func() { + _ = e.k8sClient.Delete(context.Background(), route) + }) + + e.waitForAuthPolicyAffected(t, ns, name) +} + +// waitForAuthPolicyAffected polls the HTTPRoute status until the Kuadrant policy +// controller reports the route is affected by an AuthPolicy. This ensures the +// AuthPolicy ext-authz filter is active before tests send requests. +func (e *batchTestEnv) waitForAuthPolicyAffected(t *testing.T, ns, name string) { + t.Helper() + t.Logf("waiting for AuthPolicy to be enforced on HTTPRoute %s/%s...", ns, name) + + var lastRoute *gatewayapiv1.HTTPRoute + err := wait.PollUntilContextTimeout(context.Background(), pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + lastRoute = &gatewayapiv1.HTTPRoute{} + if getErr := e.k8sClient.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, lastRoute); getErr != nil { + return false, nil + } + for _, parent := range lastRoute.Status.RouteStatus.Parents { + if parent.ControllerName != "kuadrant.io/policy-controller" { + continue + } + for _, cond := range parent.Conditions { + if cond.Type == "kuadrant.io/AuthPolicyAffected" && cond.Status == metav1.ConditionTrue { + return true, nil + } + } + } + return false, nil + }) + if err != nil { + var statusSummary string + if lastRoute != nil && len(lastRoute.Status.RouteStatus.Parents) > 0 { + for _, parent := range lastRoute.Status.RouteStatus.Parents { + statusSummary += fmt.Sprintf("\n controller=%s:", parent.ControllerName) + for _, cond := range parent.Conditions { + statusSummary += fmt.Sprintf("\n %s=%s (reason=%s, message=%q)", cond.Type, cond.Status, cond.Reason, cond.Message) + } + } + } else { + statusSummary = "\n no parent status reported" + } + t.Fatalf("timed out waiting for AuthPolicy to be enforced on HTTPRoute %s/%s; current parent status:%s", ns, name, statusSummary) + } + t.Logf("AuthPolicy enforced on HTTPRoute %s/%s", ns, name) +} + +// gatewayDo performs a single HTTP GET through the gateway. Returns the response, +// body, and any error. Callers are responsible for handling errors. +func (e *batchTestEnv) gatewayDo(path, token string, extraHeaders map[string]string) (*http.Response, []byte, error) { + url := e.gatewayURL + path + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, url, nil) + if err != nil { + return nil, nil, fmt.Errorf("create request: %w", err) + } + if token != "" { + req.Header.Set("Authorization", "Bearer "+token) + } + for k, v := range extraHeaders { + req.Header.Set(k, v) + } + + resp, err := e.httpClient.Do(req) + if err != nil { + return nil, nil, fmt.Errorf("GET %s: %w", url, err) + } + + defer func() { _ = resp.Body.Close() }() + body, err := io.ReadAll(io.LimitReader(resp.Body, maxResponseBodyBytes+1)) + if err != nil { + return nil, nil, fmt.Errorf("read response body: %w", err) + } + if len(body) > maxResponseBodyBytes { + return nil, nil, fmt.Errorf("response body exceeds %d bytes", maxResponseBodyBytes) + } + return resp, body, nil +} + +// gatewayGet performs an HTTP GET through the gateway with bearer token and +// optional extra headers. Retries on transient errors (502, 503, connection +// failures) up to requestRetries times. Returns the response and body bytes. +func (e *batchTestEnv) gatewayGet(t *testing.T, path, token string, extraHeaders map[string]string) (*http.Response, []byte) { + t.Helper() + + var lastResp *http.Response + var lastBody []byte + var lastErr error + + for attempt := range requestRetries { + lastResp, lastBody, lastErr = e.gatewayDo(path, token, extraHeaders) + if lastErr != nil { + t.Logf("GET %s attempt %d/%d failed: %v", e.gatewayURL+path, attempt+1, requestRetries, lastErr) + time.Sleep(retryDelay) + continue + } + + // Retry on transient gateway errors. + if lastResp.StatusCode == http.StatusBadGateway || lastResp.StatusCode == http.StatusServiceUnavailable { + t.Logf("GET %s attempt %d/%d returned %d, retrying", e.gatewayURL+path, attempt+1, requestRetries, lastResp.StatusCode) + time.Sleep(retryDelay) + continue + } + + return lastResp, lastBody + } + + if lastErr != nil { + t.Fatalf("GET %s failed after %d attempts: %v", e.gatewayURL+path, requestRetries, lastErr) + } + // All retries returned transient status codes — return the last response. + return lastResp, lastBody +} + +// waitForGatewayRoute polls the gateway until the given path returns a non-502/503 +// response, indicating the HTTPRoute has been accepted and the backend is reachable. +func (e *batchTestEnv) waitForGatewayRoute(t *testing.T, path, token string) { + t.Helper() + t.Logf("waiting for gateway route %s to become reachable...", path) + err := wait.PollUntilContextTimeout(context.Background(), pollInterval, pollTimeout, true, + func(ctx context.Context) (bool, error) { + resp, _, reqErr := e.gatewayDo(path, token, nil) + if reqErr != nil { + return false, nil // transient — keep polling + } + return resp.StatusCode != http.StatusBadGateway && resp.StatusCode != http.StatusServiceUnavailable, nil + }) + if err != nil { + t.Fatalf("timed out waiting for gateway route %s to become ready", path) + } + t.Logf("gateway route %s is reachable", path) +} diff --git a/internal/controller/test/e2e/suite_test.go b/internal/controller/test/e2e/suite_test.go new file mode 100644 index 0000000000..acc6cfc3bc --- /dev/null +++ b/internal/controller/test/e2e/suite_test.go @@ -0,0 +1,24 @@ +//go:build e2e + +package e2e + +import ( + "fmt" + "os" + "testing" +) + +var batchEnv *batchTestEnv + +func TestMain(m *testing.M) { + var err error + batchEnv, err = newTestEnv() + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to initialize e2e environment: %v\n", err) + os.Exit(1) + } + + code := m.Run() + batchEnv.close() + os.Exit(code) +}