# Note: If you change this file, please also change the file used for e2e tests!
#
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml

---
# --- ConfigMaps ---
# Training-server configuration: retraining cadence and model artifact paths.
# All values are strings (ConfigMap data must be strings), hence the quoting.
apiVersion: v1
kind: ConfigMap
metadata:
  name: latency-predictor-config
  namespace: default
data:
  LATENCY_RETRAINING_INTERVAL_SEC: "1"
  LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
  LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
  LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
  LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
  LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
  LATENCY_MODEL_TYPE: "xgboost"
  LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
---
# Prediction-server configuration shared by all prediction sidecars;
# each sidecar overrides PREDICT_PORT via its own env entry.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prediction-server-config
  namespace: default
data:
  LATENCY_MODEL_TYPE: "xgboost"
  PREDICT_HOST: "0.0.0.0"
  LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"  # Use individual storage
  LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
  LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
  LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
---
# --- InferencePool ---
# Groups the vLLM model-server pods (selected by label) and points the
# gateway at the endpoint-picker (EPP) extension service defined below.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
# --- EPP Service ---
# Exposes the EPP gRPC endpoint (9002), the training server (8000),
# the three prediction sidecars (8001-8003), and Prometheus metrics (9090).
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
    - name: epp-grpc
      protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
    - name: latency-predictor-training
      protocol: TCP
      port: 8000
      targetPort: 8000
    - name: latency-predictor-1
      protocol: TCP
      port: 8001
      targetPort: 8001
    - name: latency-predictor-2
      protocol: TCP
      port: 8002
      targetPort: 8002
    - name: latency-predictor-3
      protocol: TCP
      port: 8003
      targetPort: 8003
    - name: prometheus
      protocol: TCP
      port: 9090
      targetPort: 9090
  type: LoadBalancer
---
# --- EPP Deployment with Individual Container Volumes ---
# One pod running the EPP container plus four sidecars: a single training
# server (8000) and three prediction servers (8001-8003), each with its own
# emptyDir volume for model artifacts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1  # Multiple EPP pods for scaling
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
        # EPP Container
        - name: epp
          image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/epp-ig-latencypredictor:latest
          imagePullPolicy: Always
          args:
            - "-poolName"
            - "vllm-llama3-8b-instruct"
            - "-poolNamespace"
            - "default"
            - "-v"
            - "4"
            - "--zap-encoder"
            - "json"
            - "-grpcPort"
            - "9002"
            - "-grpcHealthPort"
            - "9003"
            - "-enable-latency-predictor"
          env:
            - name: PREDICTION_SERVER_URL
              value: "http://localhost:8001,http://localhost:8002,http://localhost:8003"  # Multiple prediction servers
            - name: TRAINING_SERVER_URL
              value: "http://localhost:8000"  # Single training server for sending training data
            - name: LATENCY_MAX_SAMPLE_SIZE
              value: "10000"  # Maximum sample size for latency prediction
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          livenessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
        # Training Server Sidecar Container
        - name: training-server
          image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-training-server:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
              name: training-port
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 20
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8000
            initialDelaySeconds: 45
            periodSeconds: 10
          resources:
            requests:
              cpu: "2000m"
              memory: "4Gi"
            limits:
              cpu: "4000m"
              memory: "8Gi"
          envFrom:
            - configMapRef:
                name: latency-predictor-config
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "training"
          volumeMounts:
            - name: training-server-storage
              mountPath: /models
        # Prediction Server Sidecar Container 1
        - name: prediction-server-1
          image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
          imagePullPolicy: Always
          command: ["uvicorn"]
          args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8001"]
          ports:
            - containerPort: 8001
              name: predict-port-1
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8001
            initialDelaySeconds: 15
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8001
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 10
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "1000m"
              memory: "2Gi"
          envFrom:
            - configMapRef:
                name: prediction-server-config
          env:
            - name: PREDICT_PORT
              value: "8001"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "prediction-1"
            - name: TRAINING_SERVER_URL
              value: "http://localhost:8000"
          volumeMounts:
            - name: prediction-server-1-storage
              mountPath: /server_models
        # Prediction Server Sidecar Container 2
        - name: prediction-server-2
          image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
          imagePullPolicy: Always
          command: ["uvicorn"]
          args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8002"]
          ports:
            - containerPort: 8002
              name: predict-port-2
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8002
            initialDelaySeconds: 15
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8002
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 10
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "1000m"
              memory: "2Gi"
          envFrom:
            - configMapRef:
                name: prediction-server-config
          env:
            - name: PREDICT_PORT
              value: "8002"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "prediction-2"
            - name: TRAINING_SERVER_URL
              value: "http://localhost:8000"
          volumeMounts:
            - name: prediction-server-2-storage
              mountPath: /server_models
        # Prediction Server Sidecar Container 3
        - name: prediction-server-3
          image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
          imagePullPolicy: Always
          command: ["uvicorn"]
          args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8003"]
          ports:
            - containerPort: 8003
              name: predict-port-3
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8003
            initialDelaySeconds: 15
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8003
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 10
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "1000m"
              memory: "2Gi"
          envFrom:
            - configMapRef:
                name: prediction-server-config
          env:
            - name: PREDICT_PORT
              value: "8003"
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVER_TYPE
              value: "prediction-3"
            - name: TRAINING_SERVER_URL
              value: "http://localhost:8000"
          volumeMounts:
            - name: prediction-server-3-storage
              mountPath: /server_models
      volumes:
        - name: training-server-storage
          emptyDir:
            sizeLimit: "20Gi"  # Dedicated volume for training server
        - name: prediction-server-1-storage
          emptyDir:
            sizeLimit: "10Gi"  # Dedicated volume for prediction server 1
        - name: prediction-server-2-storage
          emptyDir:
            sizeLimit: "10Gi"  # Dedicated volume for prediction server 2
        - name: prediction-server-3-storage
          emptyDir:
            sizeLimit: "10Gi"  # Dedicated volume for prediction server 3
---
# --- RBAC ---
# Read access to InferencePools/InferenceModels/Pods, plus token and
# subject-access review creation for the EPP's auth checks.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
  - apiGroups: ["inference.networking.x-k8s.io"]
    resources: ["inferencepools"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["inference.networking.x-k8s.io"]
    resources: ["inferencemodels"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "watch", "list"]
  - apiGroups:
      - authentication.k8s.io
    resources:
      - tokenreviews
    verbs:
      - create
  - apiGroups:
      - authorization.k8s.io
    resources:
      - subjectaccessreviews
    verbs:
      - create
---
# Binds the pod-read ClusterRole to the default ServiceAccount that the
# EPP Deployment runs under (no serviceAccountName is set on the pod spec).
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read-binding
subjects:
  - kind: ServiceAccount
    name: default
    namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: pod-read