diff --git a/.gitignore b/.gitignore index 18a475b5e7..7e9674135f 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,9 @@ site # MacOS generated files **/.DS_Store + +# Ignore all Chart.lock files anywhere under config/charts +config/charts/**/Chart.lock + +# Ignore all .tgz files anywhere under config/charts +config/charts/**/*.tgz diff --git a/config/charts/epp-standalone/.helmignore b/config/charts/epp-standalone/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/config/charts/epp-standalone/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/config/charts/epp-standalone/Chart.yaml b/config/charts/epp-standalone/Chart.yaml new file mode 100644 index 0000000000..07f84dc1f5 --- /dev/null +++ b/config/charts/epp-standalone/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: epp-standalone +description: A Helm chart for Endpoint Picker + +type: application + +version: 0.0.0 + +appVersion: "0.0.0" + +dependencies: + - name: inference-extension + version: 0.0.0 + repository: "file://../inference-extension" diff --git a/config/charts/epp-standalone/templates/epp-config.yaml b/config/charts/epp-standalone/templates/epp-config.yaml new file mode 100644 index 0000000000..c7295a5d54 --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-config.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.config" . 
-}} diff --git a/config/charts/epp-standalone/templates/epp-deployment.yaml b/config/charts/epp-standalone/templates/epp-deployment.yaml new file mode 100644 index 0000000000..4eaba71d28 --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-deployment.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.deployment" . -}} diff --git a/config/charts/epp-standalone/templates/epp-gke.yaml b/config/charts/epp-standalone/templates/epp-gke.yaml new file mode 100644 index 0000000000..ee54a8469c --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-gke.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.gke" . -}} diff --git a/config/charts/epp-standalone/templates/epp-leader-election-rbac.yaml b/config/charts/epp-standalone/templates/epp-leader-election-rbac.yaml new file mode 100644 index 0000000000..6820306788 --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-leader-election-rbac.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.lead-election-rbac" . -}} diff --git a/config/charts/epp-standalone/templates/epp-rbac.yaml b/config/charts/epp-standalone/templates/epp-rbac.yaml new file mode 100644 index 0000000000..6c99c9e56d --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-rbac.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.rbac" . -}} diff --git a/config/charts/epp-standalone/templates/epp-sa-token-secret.yaml b/config/charts/epp-standalone/templates/epp-sa-token-secret.yaml new file mode 100644 index 0000000000..ec13d9dce8 --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-sa-token-secret.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.sa-token-secret" . -}} diff --git a/config/charts/epp-standalone/templates/epp-service-monitor.yaml b/config/charts/epp-standalone/templates/epp-service-monitor.yaml new file mode 100644 index 0000000000..2e5f7a0d3e --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-service-monitor.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.service-monitor" . 
-}} diff --git a/config/charts/epp-standalone/templates/epp-service.yaml b/config/charts/epp-standalone/templates/epp-service.yaml new file mode 100644 index 0000000000..fb6ab40573 --- /dev/null +++ b/config/charts/epp-standalone/templates/epp-service.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.service" . -}} diff --git a/config/charts/epp-standalone/values.yaml b/config/charts/epp-standalone/values.yaml new file mode 100644 index 0000000000..3cb10dd5ec --- /dev/null +++ b/config/charts/epp-standalone/values.yaml @@ -0,0 +1,302 @@ +# Read by the shared gke, sa-token-secret and service-monitor templates; only "gke" is special-cased. +provider: + name: none + +inferenceExtension: + replicas: 1 + image: + name: epp + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + tag: main + pullPolicy: Always + extProcPort: 9002 + extraServicePorts: + - name: http + port: 8081 + protocol: TCP + targetPort: 8081 + env: [] + pluginsConfigFile: "default-plugins.yaml" + + endpointsServer: + standalone: true + # Required when standalone is true + # endpointSelector: app=vllm-llama3-8b-instruct + targetPorts: 8000 + modelServerType: vllm # vllm, triton-tensorrt-llm + + + sidecar: + enabled: true + configMap: + name: envoy + # Because the template just dumps this section, the keys become filenames. 
+ # The values MUST be strings (note the literal block scalar '|') + data: + envoy.yaml: | + admin: + address: + socket_address: + address: 127.0.0.1 + port_value: 19000 + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/null + static_resources: + listeners: + - name: envoy-proxy-ready-0.0.0.0-19001 + address: + socket_address: + address: 0.0.0.0 + port_value: 19001 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: envoy-ready-http + route_config: + name: local_route + virtual_hosts: + - name: prometheus_stats + domains: ["*"] + routes: + - match: + prefix: "/stats/prometheus" + route: + cluster: "prometheus_stats" + http_filters: + - name: envoy.filters.http.health_check + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.health_check.v3.HealthCheck + pass_through_mode: false + headers: + - name: ":path" + string_match: + exact: "/ready" + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + - name: vllm + address: + socket_address: + address: 0.0.0.0 + port_value: 8081 + per_connection_buffer_limit_bytes: 32768 + access_log: + - name: envoy.access_loggers.file + filter: + response_flag_filter: + flags: ["NR"] + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + filter_chains: + - name: vllm + filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + 
stat_prefix: http-8081 + route_config: + name: vllm + virtual_hosts: + - name: vllm-default + domains: ["*"] + routes: + - match: + prefix: "/" + route: + cluster: original_destination_cluster + timeout: 86400s + idle_timeout: 86400s + upgrade_configs: + - upgrade_type: websocket + typed_per_filter_config: + envoy.filters.http.ext_proc: + "@type": type.googleapis.com/envoy.config.route.v3.FilterConfig + config: {} + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: + cluster_name: ext_proc + authority: localhost:9002 + timeout: 10s + processing_mode: + request_header_mode: SEND + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND + message_timeout: 1000s + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppress_envoy_headers: true + http2_protocol_options: + max_concurrent_streams: 100 + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + use_remote_address: true + normalize_path: true + merge_slashes: true + server_header_transformation: PASS_THROUGH + common_http_protocol_options: + headers_with_underscores_action: REJECT_REQUEST + path_with_escaped_slashes_action: UNESCAPE_AND_REDIRECT + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + clusters: + - name: prometheus_stats + type: STATIC + connect_timeout: 0.250s + load_assignment: + cluster_name: prometheus_stats + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + 
port_value: 19000 + - name: original_destination_cluster + type: ORIGINAL_DST + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + original_dst_lb_config: + use_http_header: true + http_header_name: x-gateway-destination-endpoint + - name: ext_proc + type: STATIC + connect_timeout: 86400s + lb_policy: LEAST_REQUEST + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + max_retries: 1024 + health_checks: + - timeout: 2s + interval: 10s + unhealthy_threshold: 3 + healthy_threshold: 2 + reuse_connection: true + grpc_health_check: + service_name: "envoy.service.ext_proc.v3.ExternalProcessor" + tls_options: + alpn_protocols: ["h2"] + transport_socket: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + common_tls_context: + validation_context: + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + load_assignment: + cluster_name: ext_proc + endpoints: + - locality: + region: ext_proc/e2e/0 + lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9002 + load_balancing_weight: 1 + name: envoy-sidecar + image: docker.io/envoyproxy/envoy:distroless-v1.33.2 + command: "envoy" + args: + - "--service-node" + - "envoy-sidecar" + - "--log-level" + - "trace" + - "--cpuset-threads" + - "--drain-strategy" + - "immediate" + - "--drain-time-s" + - "60" + - "-c" + - "/etc/envoy/envoy.yaml" + env: + - name: NS_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: 
metadata.name + ports: + - containerPort: 8081 + name: http-8081 + - containerPort: 19001 + name: metrics-19001 + resources: + requests: + cpu: 100m + memory: 512Mi + + readinessProbe: + failureThreshold: 1 + httpGet: + path: /ready + port: 19001 + scheme: HTTP + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + + volumeMounts: + - name: config + mountPath: /etc/envoy + readOnly: true + volumes: + - name: config + configMap: + name: envoy + items: + - key: envoy.yaml + path: envoy.yaml + monitoring: + interval: "10s" + # Prometheus ServiceMonitor will be created when enabled for EPP metrics collection + prometheus: + enabled: false + auth: + # To allow unauthenticated /metrics access (e.g., for debugging with curl), set to false + enabled: true + + tracing: + enabled: false + + latencyPredictor: + enabled: false diff --git a/config/charts/inference-extension/.helmignore b/config/charts/inference-extension/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/config/charts/inference-extension/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/config/charts/inference-extension/Chart.yaml b/config/charts/inference-extension/Chart.yaml new file mode 100644 index 0000000000..f4a7359ba7 --- /dev/null +++ b/config/charts/inference-extension/Chart.yaml @@ -0,0 +1,10 @@ +apiVersion: v2 +name: inference-extension +description: A library Helm chart for Endpoint Picker + +type: library + +version: 0.0.0 + +appVersion: "0.0.0" + diff --git a/config/charts/inference-extension/templates/_config.yaml b/config/charts/inference-extension/templates/_config.yaml new file mode 100644 index 0000000000..797d810014 --- /dev/null +++ b/config/charts/inference-extension/templates/_config.yaml @@ -0,0 +1,91 @@ +{{- define "inference-extension.config" -}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . 
}} + namespace: {{ .Release.Namespace }} +data: + default-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - type: predicted-latency-scorer + parameters: + {{- with .Values.inferenceExtension.latencyPredictor.sloAwareRouting | default dict }} + samplingMean: {{ .samplingMean | default 1000.0 }} + maxSampledTokens: {{ .maxSampledTokens | default 20 }} + sloBufferFactor: {{ .sloBufferFactor | default 1.0 }} + negHeadroomTTFTWeight: {{ .negHeadroomTTFTWeight | default 0.8 }} + negHeadroomTPOTWeight: {{ .negHeadroomTPOTWeight | default 0.2 }} + headroomTTFTWeight: {{ .headroomTTFTWeight | default 0.8 }} + headroomTPOTWeight: {{ .headroomTPOTWeight | default 0.2 }} + headroomSelectionStrategy: {{ .headroomSelectionStrategy | default "least" | quote }} + compositeKVWeight: {{ .compositeKVWeight | default 1.0 }} + compositeQueueWeight: {{ .compositeQueueWeight | default 1.0 }} + compositePrefixWeight: {{ .compositePrefixWeight | default 1.0 }} + epsilonExploreSticky: {{ .epsilonExploreSticky | default 0.01 }} + epsilonExploreNeg: {{ .epsilonExploreNeg | default 0.01 }} + affinityGateTau: {{ .affinityGateTau | default 0.80 }} + affinityGateTauGlobal: {{ .affinityGateTauGlobal | default 0.99 }} + selectionMode: {{ .selectionMode | default "linear" | quote }} + {{- end }} + {{- end }} + schedulingProfiles: + {{- if .Values.inferenceExtension.latencyPredictor.enabled }} + - name: default + plugins: + - pluginRef: predicted-latency-scorer + featureGates: + - prepareDataPlugins + {{- else }} + - name: default + plugins: + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 + {{- end }} + {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} + {{- 
.Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} + {{- end }} + +--- +{{- if and .Values.inferenceExtension.sidecar.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.inferenceExtension.sidecar.configMap.name }} + namespace: {{ .Release.Namespace }} +data: + {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} +{{- end }} +--- +{{- if .Values.inferenceExtension.latencyPredictor.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training + namespace: {{ .Release.Namespace }} +data: + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} + {{ $key }}: {{ $value | quote }} +{{- end }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-prediction + namespace: {{ .Release.Namespace }} +data: + {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }} + {{ $key }}: {{ $value | quote }} + {{- end }} +{{- end }} +--- +{{- end -}} diff --git a/config/charts/inference-extension/templates/_deployment.yaml b/config/charts/inference-extension/templates/_deployment.yaml new file mode 100644 index 0000000000..522c6ddf37 --- /dev/null +++ b/config/charts/inference-extension/templates/_deployment.yaml @@ -0,0 +1,223 @@ +{{- define "inference-extension.deployment" -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.inferenceExtension.replicas | default 1 }} + strategy: + # The current recommended EPP deployment pattern is to have a single active replica. This ensures + # optimal performance of stateful operations such as the prefix-cache-aware scorer. 
+ # With the Recreate strategy the old replica is killed immediately, allowing the new replica(s) to + # quickly take over. This is particularly important in the high-availability setup with leader + # election, as the rolling update strategy would prevent the old leader from being killed because + # otherwise the maxUnavailable would be 100%. + type: Recreate + selector: + matchLabels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} + spec: + serviceAccountName: {{ include "gateway-api-inference-extension.name" . }} + # Conservatively, this timeout should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + {{- if .Values.inferenceExtension.sidecar.enabled }} + - name: {{ .Values.inferenceExtension.sidecar.name }} + image: {{ .Values.inferenceExtension.sidecar.image }} + imagePullPolicy: {{ .Values.inferenceExtension.sidecar.imagePullPolicy | default "IfNotPresent" }} + {{- with .Values.inferenceExtension.sidecar.command }} + command: + - {{ . | quote }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.args }} + args: + {{- range . }} + - {{ . | quote }} + {{- end }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.env }} + env: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.ports }} + ports: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.inferenceExtension.sidecar.volumeMounts }} + volumeMounts: + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- end }} + - name: epp + image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "IfNotPresent" }} + args: + {{- /* 1. Determine Model Server Type Logic */ -}} + {{- $modelServerType := "vllm" }} + {{- if and .Values.inferenceExtension.endpointsServer .Values.inferenceExtension.endpointsServer.standalone -}} + {{- $modelServerType = .Values.inferenceExtension.endpointsServer.modelServerType | default "vllm" }} + {{- else }} + {{- $modelServerType = .Values.inferencePool.modelServerType | default "vllm" }} + {{- end }} + {{- /* 2. Mode Specific Flags */ -}} + {{- if and .Values.inferenceExtension.endpointsServer .Values.inferenceExtension.endpointsServer.standalone }} + - --endpoint-selector + - {{ .Values.inferenceExtension.endpointsServer.endpointSelector | quote }} + - --endpoint-target-ports + - {{ .Values.inferenceExtension.endpointsServer.targetPorts | quote }} + {{- else }} + - --pool-name + - {{ .Release.Name }} + # The pool namespace is optional because EPP can default to the NAMESPACE env var. + - --pool-namespace + - {{ .Release.Namespace }} + {{- if ne .Values.inferencePool.apiVersion "inference.networking.k8s.io" }} + - --pool-group + - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" + {{- end }} + {{- end }} + {{- if eq $modelServerType "triton-tensorrt-llm" }} + - --total-queued-requests-metric + - "nv_trt_llm_request_metrics{request_type=waiting}" + - --kv-cache-usage-percentage-metric + - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" + - --lora-info-metric + - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. 
+ {{- end }} + - --zap-encoder + - "json" + - --config-file + - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}" + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + - --ha-enable-leader-election + {{- end }} + # Pass additional flags via the inferenceExtension.flags field in values.yaml. + {{- range $key, $value := .Values.inferenceExtension.flags }} + - --{{ $key }} + - "{{ $value }}" + {{- end }} + {{- if .Values.inferenceExtension.tracing.enabled }} + - --tracing=true + {{- else }} + - --tracing=false + {{- end }} + {{- if not .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} + - --metrics-endpoint-auth=false + {{- end }} + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + {{- if .Values.inferenceExtension.extraContainerPorts }} + {{- toYaml .Values.inferenceExtension.extraContainerPorts | nindent 8 }} + {{- end }} + livenessProbe: + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + grpc: + port: 9003 + service: liveness + {{- else }} + grpc: + port: 9003 + service: inference-extension + {{- end }} + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + grpc: + port: 9003 + service: readiness + {{- else }} + grpc: + port: 9003 + service: inference-extension + {{- end }} + periodSeconds: 2 + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + {{- include "gateway-api-inference-extension.latencyPredictor.env" . 
| nindent 8 }} + {{- if .Values.inferenceExtension.tracing.enabled }} + - name: OTEL_SERVICE_NAME + value: "gateway-api-inference-extension" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .Values.inferenceExtension.tracing.otelExporterEndpoint | quote }} + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_RESOURCE_ATTRIBUTES_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: OTEL_RESOURCE_ATTRIBUTES + value: 'k8s.namespace.name=$(NAMESPACE),k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME)' + - name: OTEL_TRACES_SAMPLER + value: {{ .Values.inferenceExtension.tracing.sampling.sampler | quote }} + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.inferenceExtension.tracing.sampling.samplerArg | quote }} + {{- end }} + {{- if .Values.inferenceExtension.env }} + {{- toYaml .Values.inferenceExtension.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: plugins-config-volume + mountPath: "/config" + {{- if .Values.inferenceExtension.volumeMounts }} + {{- toYaml .Values.inferenceExtension.volumeMounts | nindent 8 }} + {{- end }} + {{- include "gateway-api-inference-extension.latencyPredictor.containers" . | nindent 6 }} + volumes: + {{- if .Values.inferenceExtension.volumes }} + {{- toYaml .Values.inferenceExtension.volumes | nindent 6 }} + {{- end }} + {{- if .Values.inferenceExtension.sidecar.volumes }} + {{- tpl (toYaml .Values.inferenceExtension.sidecar.volumes) $ | nindent 6 }} + {{- end }} + - name: plugins-config-volume + configMap: + name: {{ include "gateway-api-inference-extension.name" . }} + {{- include "gateway-api-inference-extension.latencyPredictor.volumes" . 
| nindent 6 }} + {{- if .Values.inferenceExtension.affinity }} + affinity: + {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} + {{- end }} + {{- if .Values.inferenceExtension.tolerations }} + tolerations: + {{- toYaml .Values.inferenceExtension.tolerations | nindent 8 }} + {{- end }} +--- +{{- end }} diff --git a/config/charts/inference-extension/templates/_gke.yaml b/config/charts/inference-extension/templates/_gke.yaml new file mode 100644 index 0000000000..9a19e7597a --- /dev/null +++ b/config/charts/inference-extension/templates/_gke.yaml @@ -0,0 +1,108 @@ +{{- define "inference-extension.gke" -}} +{{- if eq (lower .Values.provider.name) "gke" }} +{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} +{{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}} +{{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}} +{{- $metricsReadRoleName := printf "%s-%s-metrics-reader" .Release.Namespace .Release.Name -}} +{{- $metricsReadRoleBindingName := printf "%s-%s-metrics-reader-role-binding" .Release.Namespace .Release.Name -}} +{{- $secretReadRoleName := printf "%s-metrics-reader-secret-read" .Release.Name -}} +{{- $gmpNamespace := "gmp-system" -}} +{{- $isAutopilot := false -}} +{{- with .Values.provider.gke }} + {{- $isAutopilot = .autopilot | default false -}} +{{- end }} +{{- if $isAutopilot -}} +{{- $gmpNamespace = "gke-gmp-system" -}} +{{- end -}} +{{- $gmpCollectorRoleBindingName := printf "%s:collector:%s-%s-metrics-reader-secret-read" $gmpNamespace .Release.Namespace .Release.Name -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $metricsReadSA }} + namespace: {{ .Release.Namespace }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ $metricsReadSecretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} + annotations: + kubernetes.io/service-account.name: {{ $metricsReadSA }} +type: kubernetes.io/service-account-token +--- +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + name: {{ .Release.Name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +spec: + endpoints: + - port: metrics + scheme: http + interval: {{ .Values.inferenceExtension.monitoring.interval }} + path: /metrics + authorization: + type: Bearer + credentials: + secret: + name: {{ $metricsReadSecretName }} + key: token + selector: + matchLabels: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ $metricsReadRoleName }} +rules: + - nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ $metricsReadRoleBindingName }} +subjects: + - kind: ServiceAccount + name: {{ $metricsReadSA }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: {{ $metricsReadRoleName }} + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ $secretReadRoleName }} +rules: + - resources: + - secrets + apiGroups: [""] + verbs: ["get", "list", "watch"] + resourceNames: [{{ $metricsReadSecretName | quote }}] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ $gmpCollectorRoleBindingName }} + namespace: {{ .Release.Namespace }} +roleRef: + name: {{ $secretReadRoleName }} + kind: Role + apiGroup: rbac.authorization.k8s.io +subjects: + - name: collector + namespace: {{ $gmpNamespace }} + kind: ServiceAccount +--- +{{- end }} +{{- end }} +{{- end }} diff --git a/config/charts/inferencepool/templates/_helpers.tpl b/config/charts/inference-extension/templates/_helpers.tpl similarity index 72% rename from 
config/charts/inferencepool/templates/_helpers.tpl rename to config/charts/inference-extension/templates/_helpers.tpl index fdc9b1a2b7..c8e903fcc7 100644 --- a/config/charts/inferencepool/templates/_helpers.tpl +++ b/config/charts/inference-extension/templates/_helpers.tpl @@ -29,5 +29,12 @@ Cluster RBAC unique name Selector labels */}} {{- define "gateway-api-inference-extension.selectorLabels" -}} +{{- /* Check if endpointsServer exists AND if standalone is true */ -}} +{{- if and .Values.inferenceExtension.endpointsServer .Values.inferenceExtension.endpointsServer.standalone -}} +{{- /* LOGIC FOR STANDALONE EPP MODE */ -}} +epp: {{ include "gateway-api-inference-extension.name" . }} +{{- else -}} +{{- /* LOGIC FOR PARENT (INFERENCEPOOL) MODE */ -}} inferencepool: {{ include "gateway-api-inference-extension.name" . }} {{- end -}} +{{- end -}} diff --git a/config/charts/inferencepool/templates/_latency-predictor.tpl b/config/charts/inference-extension/templates/_latency-predictor.tpl similarity index 100% rename from config/charts/inferencepool/templates/_latency-predictor.tpl rename to config/charts/inference-extension/templates/_latency-predictor.tpl diff --git a/config/charts/inferencepool/templates/leader-election-rbac.yaml b/config/charts/inference-extension/templates/_leader-election-rbac.yaml similarity index 93% rename from config/charts/inferencepool/templates/leader-election-rbac.yaml rename to config/charts/inference-extension/templates/_leader-election-rbac.yaml index 11b3dd5168..4d44f2f5a2 100644 --- a/config/charts/inferencepool/templates/leader-election-rbac.yaml +++ b/config/charts/inference-extension/templates/_leader-election-rbac.yaml @@ -1,3 +1,4 @@ +{{- define "inference-extension.lead-election-rbac" -}} {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} --- kind: Role @@ -27,4 +28,6 @@ roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: {{ include "gateway-api-inference-extension.name" . 
}}-leader-election +--- +{{- end }} {{- end }} diff --git a/config/charts/inference-extension/templates/_rbac.yaml b/config/charts/inference-extension/templates/_rbac.yaml new file mode 100644 index 0000000000..0b77026048 --- /dev/null +++ b/config/charts/inference-extension/templates/_rbac.yaml @@ -0,0 +1,75 @@ +{{- define "inference-extension.rbac" -}} +{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }} +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} +subjects: +- kind: ServiceAccount + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} +{{- end }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ printf "%s-sa" (include "gateway-api-inference-extension.name" .) }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ printf "%s-sa" (include "gateway-api-inference-extension.name" .) }} + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: {{ include "gateway-api-inference-extension.name" . 
}} + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ printf "%s-sa" (include "gateway-api-inference-extension.name" .) }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +--- +{{- end }} diff --git a/config/charts/inference-extension/templates/_sa-token-secret.yaml b/config/charts/inference-extension/templates/_sa-token-secret.yaml new file mode 100644 index 0000000000..6c3a1ed74d --- /dev/null +++ b/config/charts/inference-extension/templates/_sa-token-secret.yaml @@ -0,0 +1,15 @@ +{{- define "inference-extension.sa-token-secret" -}} +{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled (ne (lower .Values.provider.name) "gke") }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} + annotations: + kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . }} +type: kubernetes.io/service-account-token +--- +{{- end }} +{{- end }} diff --git a/config/charts/inference-extension/templates/_service.yaml b/config/charts/inference-extension/templates/_service.yaml new file mode 100644 index 0000000000..9bc5c47714 --- /dev/null +++ b/config/charts/inference-extension/templates/_service.yaml @@ -0,0 +1,24 @@ +{{- define "inference-extension.service" -}} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "gateway-api-inference-extension.name" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} +spec: + selector: + {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} + ports: + - name: grpc-ext-proc + protocol: TCP + port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + - name: http-metrics + protocol: TCP + port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + {{- with .Values.inferenceExtension.extraServicePorts }} + {{- . | toYaml | nindent 4 }} + {{- end }} + type: ClusterIP +--- +{{- end }} diff --git a/config/charts/inferencepool/templates/epp-servicemonitor.yaml b/config/charts/inference-extension/templates/_servicemonitor.yaml similarity index 94% rename from config/charts/inferencepool/templates/epp-servicemonitor.yaml rename to config/charts/inference-extension/templates/_servicemonitor.yaml index d58662cbef..17174a2938 100644 --- a/config/charts/inferencepool/templates/epp-servicemonitor.yaml +++ b/config/charts/inference-extension/templates/_servicemonitor.yaml @@ -1,3 +1,4 @@ +{{- define "inference-extension.service-monitor" -}} {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled (ne (lower .Values.provider.name) "gke") }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor @@ -27,4 +28,6 @@ spec: selector: matchLabels: {{- include "gateway-api-inference-extension.labels" . 
| nindent 6 }} +--- +{{- end }} {{- end }} diff --git a/config/charts/inferencepool/Chart.yaml b/config/charts/inferencepool/Chart.yaml index f98153c500..f6eadc3e21 100644 --- a/config/charts/inferencepool/Chart.yaml +++ b/config/charts/inferencepool/Chart.yaml @@ -7,3 +7,7 @@ type: application version: 0.0.0 appVersion: "0.0.0" +dependencies: + - name: inference-extension + version: 0.0.0 + repository: "file://../inference-extension" diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml index 6f947a9295..c7295a5d54 100644 --- a/config/charts/inferencepool/templates/epp-config.yaml +++ b/config/charts/inferencepool/templates/epp-config.yaml @@ -1,88 +1 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} -data: - default-plugins.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: queue-scorer - - type: kv-cache-utilization-scorer - - type: prefix-cache-scorer - {{- if .Values.inferenceExtension.latencyPredictor.enabled }} - - type: predicted-latency-scorer - parameters: - {{- with .Values.inferenceExtension.latencyPredictor.sloAwareRouting | default dict }} - samplingMean: {{ .samplingMean | default 1000.0 }} - maxSampledTokens: {{ .maxSampledTokens | default 20 }} - sloBufferFactor: {{ .sloBufferFactor | default 1.0 }} - negHeadroomTTFTWeight: {{ .negHeadroomTTFTWeight | default 0.8 }} - negHeadroomTPOTWeight: {{ .negHeadroomTPOTWeight | default 0.2 }} - headroomTTFTWeight: {{ .headroomTTFTWeight | default 0.8 }} - headroomTPOTWeight: {{ .headroomTPOTWeight | default 0.2 }} - headroomSelectionStrategy: {{ .headroomSelectionStrategy | default "least" | quote }} - compositeKVWeight: {{ .compositeKVWeight | default 1.0 }} - compositeQueueWeight: {{ .compositeQueueWeight | default 1.0 }} - compositePrefixWeight: {{ .compositePrefixWeight | default 
1.0 }} - epsilonExploreSticky: {{ .epsilonExploreSticky | default 0.01 }} - epsilonExploreNeg: {{ .epsilonExploreNeg | default 0.01 }} - affinityGateTau: {{ .affinityGateTau | default 0.80 }} - affinityGateTauGlobal: {{ .affinityGateTauGlobal | default 0.99 }} - selectionMode: {{ .selectionMode | default "linear" | quote }} - {{- end }} - {{- end }} - schedulingProfiles: - {{- if .Values.inferenceExtension.latencyPredictor.enabled }} - - name: default - plugins: - - pluginRef: predicted-latency-scorer - featureGates: - - prepareDataPlugins - {{- else }} - - name: default - plugins: - - pluginRef: queue-scorer - weight: 2 - - pluginRef: kv-cache-utilization-scorer - weight: 2 - - pluginRef: prefix-cache-scorer - weight: 3 - {{- end }} - {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} - {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} - {{- end }} - ---- -{{- if and .Values.inferenceExtension.sidecar.enabled .Values.inferenceExtension.sidecar.configMapData }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" . }}-sidecar - namespace: {{ .Release.Namespace }} -data: - {{- .Values.inferenceExtension.sidecar.configMapData | toYaml | nindent 2 }} -{{- end }} ---- -{{- if .Values.inferenceExtension.latencyPredictor.enabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training - namespace: {{ .Release.Namespace }} -data: - {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }} - {{ $key }}: {{ $value | quote }} -{{- end }} ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "gateway-api-inference-extension.name" . 
}}-latency-predictor-prediction - namespace: {{ .Release.Namespace }} -data: - {{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }} - {{ $key }}: {{ $value | quote }} - {{- end }} -{{- end }} +{{- include "inference-extension.config" . -}} diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index bd526ec4e2..4eaba71d28 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -1,207 +1 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -spec: - replicas: {{ .Values.inferenceExtension.replicas | default 1 }} - strategy: - # The current recommended EPP deployment pattern is to have a single active replica. This ensures - # optimal performance of the stateful operations such prefix cache aware scorer. - # The Recreate strategy the old replica is killed immediately, and allow the new replica(s) to - # quickly take over. This is particularly important in the high availability set up with leader - # election, as the rolling update strategy would prevent the old leader being killed because - # otherwise the maxUnavailable would be 100%. - type: Recreate - selector: - matchLabels: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} - spec: - serviceAccountName: {{ include "gateway-api-inference-extension.name" . 
}} - # Conservatively, this timeout should mirror the longest grace period of the pods within the pool - terminationGracePeriodSeconds: 130 - containers: - {{- if .Values.inferenceExtension.sidecar.enabled }} - - name: {{ .Values.inferenceExtension.sidecar.name }} - image: {{ .Values.inferenceExtension.sidecar.image }} - imagePullPolicy: {{ .Values.inferenceExtension.sidecar.imagePullPolicy | default "IfNotPresent" }} - {{- with .Values.inferenceExtension.sidecar.command }} - command: - - {{ . | quote }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.args }} - args: - {{- range . }} - - {{ . | quote }} - {{- end }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.env }} - env: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.ports }} - ports: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.livenessProbe }} - livenessProbe: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.readinessProbe }} - readinessProbe: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.resources }} - resources: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- with .Values.inferenceExtension.sidecar.volumeMounts }} - volumeMounts: - {{- toYaml . | nindent 10 }} - {{- end }} - {{- end }} - - name: epp - image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} - imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "IfNotPresent" }} - args: - - --pool-name - - {{ .Release.Name }} - # The pool namespace is optional because EPP can default to the NAMESPACE env var. - # We still keep this here so that the template works with older versions of EPP, or other - # distros of EPP which may not have implemented the NAMESPACE env var defaulting behavior. 
- - --pool-namespace - - {{ .Release.Namespace }} - {{- if ne .Values.inferencePool.apiVersion "inference.networking.k8s.io" }} - - --pool-group - - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" - {{- end }} - - --zap-encoder - - "json" - - --config-file - - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}" - {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }} - - --total-queued-requests-metric - - "nv_trt_llm_request_metrics{request_type=waiting}" - - --kv-cache-usage-percentage-metric - - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" - - --lora-info-metric - - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. - {{- end }} - {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} - - --ha-enable-leader-election - {{- end }} - # Pass additional flags via the inferenceExtension.flags field in values.yaml. - {{- range $key, $value := .Values.inferenceExtension.flags }} - - --{{ $key }} - - "{{ $value }}" - {{- end }} - {{- if .Values.inferenceExtension.tracing.enabled }} - - --tracing=true - {{- else }} - - --tracing=false - {{- end }} - {{- if not .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} - - --metrics-endpoint-auth=false - {{- end }} - ports: - - name: grpc - containerPort: 9002 - - name: grpc-health - containerPort: 9003 - - name: metrics - containerPort: 9090 - {{- if .Values.inferenceExtension.extraContainerPorts }} - {{- toYaml .Values.inferenceExtension.extraContainerPorts | nindent 8 }} - {{- end }} - livenessProbe: - {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} - grpc: - port: 9003 - service: liveness - {{- else }} - grpc: - port: 9003 - service: inference-extension - {{- end }} - initialDelaySeconds: 5 - periodSeconds: 10 - readinessProbe: - {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} - grpc: - port: 9003 - service: readiness - {{- else }} - grpc: - port: 9003 - service: 
inference-extension - {{- end }} - periodSeconds: 2 - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - {{- include "gateway-api-inference-extension.latencyPredictor.env" . | nindent 8 }} - {{- if .Values.inferenceExtension.tracing.enabled }} - - name: OTEL_SERVICE_NAME - value: "gateway-api-inference-extension" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: {{ .Values.inferenceExtension.tracing.otelExporterEndpoint | quote }} - - name: OTEL_TRACES_EXPORTER - value: "otlp" - - name: OTEL_RESOURCE_ATTRIBUTES_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: metadata.name - - name: OTEL_RESOURCE_ATTRIBUTES - value: 'k8s.namespace.name=$(NAMESPACE),k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME)' - - name: OTEL_TRACES_SAMPLER - value: {{ .Values.inferenceExtension.tracing.sampling.sampler | quote }} - - name: OTEL_TRACES_SAMPLER_ARG - value: {{ .Values.inferenceExtension.tracing.sampling.samplerArg | quote }} - {{- end }} - {{- if .Values.inferenceExtension.env }} - {{- toYaml .Values.inferenceExtension.env | nindent 8 }} - {{- end }} - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - {{- if .Values.inferenceExtension.volumeMounts }} - {{- toYaml .Values.inferenceExtension.volumeMounts | nindent 8 }} - {{- end }} - {{- include "gateway-api-inference-extension.latencyPredictor.containers" . 
| nindent 6 }} - volumes: - {{- if .Values.inferenceExtension.volumes }} - {{- toYaml .Values.inferenceExtension.volumes | nindent 6 }} - {{- end }} - {{- if .Values.inferenceExtension.sidecar.volumes }} - {{- tpl (toYaml .Values.inferenceExtension.sidecar.volumes) $ | nindent 6 }} - {{- end }} - - name: plugins-config-volume - configMap: - name: {{ include "gateway-api-inference-extension.name" . }} - {{- include "gateway-api-inference-extension.latencyPredictor.volumes" . | nindent 6 }} - {{- if .Values.inferenceExtension.affinity }} - affinity: - {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} - {{- end }} - {{- if .Values.inferenceExtension.tolerations }} - tolerations: - {{- toYaml .Values.inferenceExtension.tolerations | nindent 8 }} - {{- end }} +{{- include "inference-extension.deployment" . -}} diff --git a/config/charts/inferencepool/templates/epp-leader-election-rbac.yaml b/config/charts/inferencepool/templates/epp-leader-election-rbac.yaml new file mode 100644 index 0000000000..6820306788 --- /dev/null +++ b/config/charts/inferencepool/templates/epp-leader-election-rbac.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.lead-election-rbac" . -}} diff --git a/config/charts/inferencepool/templates/epp-sa-token-secret.yaml b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml index 16d935f965..ec13d9dce8 100644 --- a/config/charts/inferencepool/templates/epp-sa-token-secret.yaml +++ b/config/charts/inferencepool/templates/epp-sa-token-secret.yaml @@ -1,12 +1 @@ -{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled (ne (lower .Values.provider.name) "gke") }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} - annotations: - kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . }} -type: kubernetes.io/service-account-token -{{- end }} \ No newline at end of file +{{- include "inference-extension.sa-token-secret" . -}} diff --git a/config/charts/inferencepool/templates/epp-service-monitor.yaml b/config/charts/inferencepool/templates/epp-service-monitor.yaml new file mode 100644 index 0000000000..2e5f7a0d3e --- /dev/null +++ b/config/charts/inferencepool/templates/epp-service-monitor.yaml @@ -0,0 +1 @@ +{{- include "inference-extension.service-monitor" . -}} diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml index 2d476e1826..fb6ab40573 100644 --- a/config/charts/inferencepool/templates/epp-service.yaml +++ b/config/charts/inferencepool/templates/epp-service.yaml @@ -1,21 +1 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -spec: - selector: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} - ports: - - name: grpc-ext-proc - protocol: TCP - port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} - - name: http-metrics - protocol: TCP - port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} - {{- with .Values.inferenceExtension.extraServicePorts }} - {{- . | toYaml | nindent 4 }} - {{- end }} - type: ClusterIP +{{- include "inference-extension.service" . 
-}} diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml index 2ee2e13fc9..e8b4ab8d91 100644 --- a/config/charts/inferencepool/templates/gke.yaml +++ b/config/charts/inferencepool/templates/gke.yaml @@ -20,8 +20,8 @@ spec: config: type: HTTP httpHealthCheck: - requestPath: /health - port: {{ .Values.inferencePool.targetPortNumber }} + requestPath: /health + port: {{ .Values.inferencePool.targetPortNumber }} --- apiVersion: networking.gke.io/v1 kind: GCPBackendPolicy @@ -40,107 +40,5 @@ spec: logging: enabled: true # log all requests by default --- -{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} -{{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}} -{{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}} -{{- $metricsReadRoleName := printf "%s-%s-metrics-reader" .Release.Namespace .Release.Name -}} -{{- $metricsReadRoleBindingName := printf "%s-%s-metrics-reader-role-binding" .Release.Namespace .Release.Name -}} -{{- $secretReadRoleName := printf "%s-metrics-reader-secret-read" .Release.Name -}} -{{- $gmpNamespace := "gmp-system" -}} -{{- $isAutopilot := false -}} -{{- with .Values.provider.gke }} - {{- $isAutopilot = .autopilot | default false -}} -{{- end }} -{{- if $isAutopilot -}} -{{- $gmpNamespace = "gke-gmp-system" -}} -{{- end -}} -{{- $gmpCollectorRoleBindingName := printf "%s:collector:%s-%s-metrics-reader-secret-read" $gmpNamespace .Release.Namespace .Release.Name -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ $metricsReadSA }} - namespace: {{ .Release.Namespace }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ $metricsReadSecretName }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} - annotations: - kubernetes.io/service-account.name: {{ $metricsReadSA }} -type: kubernetes.io/service-account-token ---- -apiVersion: monitoring.googleapis.com/v1 -kind: PodMonitoring -metadata: - name: {{ .Release.Name }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -spec: - endpoints: - - port: metrics - scheme: http - interval: {{ .Values.inferenceExtension.monitoring.interval }} - path: /metrics - authorization: - type: Bearer - credentials: - secret: - name: {{ $metricsReadSecretName }} - key: token - selector: - matchLabels: - {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 8 }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ $metricsReadRoleName }} -rules: -- nonResourceURLs: - - /metrics - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ $metricsReadRoleBindingName }} -subjects: -- kind: ServiceAccount - name: {{ $metricsReadSA }} - namespace: {{ .Release.Namespace }} -roleRef: - kind: ClusterRole - name: {{ $metricsReadRoleName }} - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ $secretReadRoleName }} -rules: -- resources: - - secrets - apiGroups: [""] - verbs: ["get", "list", "watch"] - resourceNames: [{{ $metricsReadSecretName | quote }}] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ $gmpCollectorRoleBindingName }} - namespace: {{ .Release.Namespace }} -roleRef: - name: {{ $secretReadRoleName }} - kind: Role - apiGroup: rbac.authorization.k8s.io -subjects: -- name: collector - namespace: {{ $gmpNamespace }} - kind: ServiceAccount -{{- end }} {{- end }} +{{- include "inference-extension.gke" . 
-}} diff --git a/config/charts/inferencepool/templates/httproute.yaml b/config/charts/inferencepool/templates/httproute.yaml index a280d1581b..c448c7a430 100644 --- a/config/charts/inferencepool/templates/httproute.yaml +++ b/config/charts/inferencepool/templates/httproute.yaml @@ -24,6 +24,8 @@ spec: name: X-Gateway-Base-Model-Name value: {{ .Values.experimentalHttpRoute.baseModel }} {{- end }} + {{- if ne (lower .Values.provider.name) "gke" }} timeouts: request: 300s + {{- end }} {{- end }} diff --git a/config/charts/inferencepool/templates/inferencepool.yaml b/config/charts/inferencepool/templates/inferencepool.yaml index 5c973b9983..796c832211 100644 --- a/config/charts/inferencepool/templates/inferencepool.yaml +++ b/config/charts/inferencepool/templates/inferencepool.yaml @@ -44,5 +44,3 @@ spec: port: number: {{ .Values.inferenceExtension.extProcPort | default 9002 }} {{- end }} - - diff --git a/config/charts/inferencepool/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml index dc6b3e0c4a..c2fed9ab23 100644 --- a/config/charts/inferencepool/templates/rbac.yaml +++ b/config/charts/inferencepool/templates/rbac.yaml @@ -1,46 +1,7 @@ -{{- if .Values.inferenceExtension.monitoring.prometheus.enabled }} -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create -- nonResourceURLs: - - "/metrics" - verbs: - - get ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} -subjects: -- kind: ServiceAccount - name: {{ include "gateway-api-inference-extension.name" . 
}} - namespace: {{ .Release.Namespace }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "gateway-api-inference-extension.cluster-rbac-name" . }} -{{- end }} ---- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ printf "%s-non-sa" (include "gateway-api-inference-extension.name" .) }} namespace: {{ .Release.Namespace }} labels: {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} @@ -51,14 +12,11 @@ rules: - apiGroups: ["{{ (split "/" .Values.inferencePool.apiVersion)._0 }}"] resources: ["inferencepools"] verbs: ["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ printf "%s-non-sa" (include "gateway-api-inference-extension.name" .) }} namespace: {{ .Release.Namespace }} subjects: - kind: ServiceAccount @@ -67,12 +25,6 @@ subjects: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: {{ include "gateway-api-inference-extension.name" . }} + name: {{ printf "%s-non-sa" (include "gateway-api-inference-extension.name" .) }} --- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "gateway-api-inference-extension.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +{{- include "inference-extension.rbac" . 
-}} diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index c6cc51e8f9..1803e2cbaa 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -43,7 +43,7 @@ inferenceExtension: affinity: {} tolerations: [] - + # Sidecar configuration for EPP sidecar: enabled: false @@ -70,7 +70,7 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: enabled: false - + # Training Server Configuration trainingServer: image: @@ -154,12 +154,12 @@ inferencePool: targetPorts: - number: 8000 modelServerType: vllm # vllm, triton-tensorrt-llm - apiVersion: inference.networking.k8s.io/v1 + apiVersion: inference.networking.k8s.io/v1 # modelServers: # REQUIRED # matchLabels: # app: vllm-llama3-8b-instruct - # Should only used if apiVersion is inference.networking.x-k8s.io/v1alpha2, + # Should only be used if apiVersion is inference.networking.x-k8s.io/v1alpha2, # This will soon be deprecated when upstream GW providers support v1, just doing something simple for now. targetPortNumber: 8000 diff --git a/hack/verify-helm.sh b/hack/verify-helm.sh index 0388b6e24d..e9bb8dd09c 100755 --- a/hack/verify-helm.sh +++ b/hack/verify-helm.sh @@ -27,6 +27,13 @@ test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 - # source (such as in the verify-all script) make helm-install +echo "Building dependencies for inferencePool chart..." +${SCRIPT_ROOT}/bin/helm dependency build ${SCRIPT_ROOT}/config/charts/inferencepool +if [ $? -ne 0 ]; then + echo "Helm dependency build failed." + exit 1 +fi + # Running tests cases echo "Running helm template command for inferencePool chart..." 
# Loop through the keys of the associative array diff --git a/site-src/_includes/epp-latest.md b/site-src/_includes/epp-latest.md index 0867537549..8f10292c5c 100644 --- a/site-src/_includes/epp-latest.md +++ b/site-src/_includes/epp-latest.md @@ -3,6 +3,7 @@ ```bash export GATEWAY_PROVIDER=gke helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -14,6 +15,7 @@ ```bash export GATEWAY_PROVIDER=istio helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -25,6 +27,7 @@ ```bash export GATEWAY_PROVIDER=none helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -36,6 +39,7 @@ ```bash export GATEWAY_PROVIDER=none helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ diff --git a/site-src/_includes/epp.md b/site-src/_includes/epp.md index df0a7b6a89..61ffb9be8d 100644 --- a/site-src/_includes/epp.md +++ b/site-src/_includes/epp.md @@ -3,6 +3,7 @@ ```bash export GATEWAY_PROVIDER=gke helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --set experimentalHttpRoute.enabled=true \ @@ -15,6 +16,7 @@ ```bash export GATEWAY_PROVIDER=istio helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --set 
experimentalHttpRoute.enabled=true \ @@ -27,6 +29,7 @@ ```bash export GATEWAY_PROVIDER=none helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --set experimentalHttpRoute.enabled=true \ @@ -39,6 +42,7 @@ ```bash export GATEWAY_PROVIDER=none helm install vllm-llama3-8b-instruct \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ --set provider.name=$GATEWAY_PROVIDER \ --set experimentalHttpRoute.enabled=true \ diff --git a/site-src/guides/getting-started-latest.md b/site-src/guides/getting-started-latest.md index 10b0aeb5b8..94b6625e2e 100644 --- a/site-src/guides/getting-started-latest.md +++ b/site-src/guides/getting-started-latest.md @@ -106,7 +106,7 @@ kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extens 1. Install NGINX Gateway Fabric with the Inference Extension enabled by setting the `nginxGateway.gwAPIInferenceExtension.enable=true` Helm value ```bash - helm install ngf oci://ghcr.io/nginx/charts/nginx-gateway-fabric --create-namespace -n nginx-gateway --set nginxGateway.gwAPIInferenceExtension.enable=true + helm install ngf oci://ghcr.io/nginx/charts/nginx-gateway-fabric --create-namespace -n nginx-gateway --dependency-update --set nginxGateway.gwAPIInferenceExtension.enable=true ``` This enables NGINX Gateway Fabric to watch and manage Inference Extension resources such as InferencePool and InferenceObjective. diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md index f1b8185d8e..78b5026f8d 100644 --- a/site-src/guides/serve-multiple-genai-models.md +++ b/site-src/guides/serve-multiple-genai-models.md @@ -160,6 +160,7 @@ Select a tab to follow the provider-specific instructions. 
```bash export GATEWAY_PROVIDER=gke helm install vllm-deepseek-r1 \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -171,6 +172,7 @@ Select a tab to follow the provider-specific instructions. ```bash export GATEWAY_PROVIDER=istio helm install vllm-deepseek-r1 \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -181,6 +183,7 @@ Select a tab to follow the provider-specific instructions. ```bash export GATEWAY_PROVIDER=none helm install vllm-deepseek-r1 \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \ @@ -192,6 +195,7 @@ Select a tab to follow the provider-specific instructions. ```bash export GATEWAY_PROVIDER=none helm install vllm-deepseek-r1 \ + --dependency-update \ --set inferencePool.modelServers.matchLabels.app=vllm-deepseek-r1 \ --set provider.name=$GATEWAY_PROVIDER \ --version $IGW_CHART_VERSION \