
Commit d889ab4

kaushikmitr authored and BenjaminBraunDev committed

separate servers for training and prediction

1 parent 075d458 · commit d889ab4

16 files changed: +5007 −134 lines
Lines changed: 382 additions & 0 deletions
@@ -0,0 +1,382 @@
# Note: If you change this file, please also change the file used for e2e tests!
#
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml

# --- ConfigMaps ---
apiVersion: v1
kind: ConfigMap
metadata:
  name: latency-predictor-config
  namespace: default
data:
  LATENCY_RETRAINING_INTERVAL_SEC: "1"
  LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
  LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
  LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
  LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
  LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
  LATENCY_MODEL_TYPE: "xgboost"
  LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prediction-server-config
  namespace: default
data:
  LATENCY_MODEL_TYPE: "xgboost"
  PREDICT_HOST: "0.0.0.0"
  LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"  # Each prediction server loads models from its own local storage
  LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
  LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
  LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"

---
# --- InferencePool ---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp

---
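For context, traffic typically reaches an InferencePool through a Gateway API route whose backendRef names the pool. A minimal sketch, assuming a Gateway named inference-gateway already exists; the route below is illustrative only and not part of this commit:

apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-route            # hypothetical name, for illustration
  namespace: default
spec:
  parentRefs:
  - name: inference-gateway  # assumed Gateway; replace with your own
  rules:
  - backendRefs:
    - group: inference.networking.x-k8s.io
      kind: InferencePool
      name: vllm-llama3-8b-instruct
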
# --- EPP Service ---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
  - name: epp-grpc
    protocol: TCP
    port: 9002
    targetPort: 9002
    appProtocol: http2
  - name: latency-predictor-training
    protocol: TCP
    port: 8000
    targetPort: 8000
  - name: latency-predictor-1
    protocol: TCP
    port: 8001
    targetPort: 8001
  - name: latency-predictor-2
    protocol: TCP
    port: 8002
    targetPort: 8002
  - name: latency-predictor-3
    protocol: TCP
    port: 8003
    targetPort: 8003
  - name: prometheus
    protocol: TCP
    port: 9090
    targetPort: 9090
  type: LoadBalancer

---
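Note that type: LoadBalancer exposes the gRPC, training, prediction, and metrics ports outside the cluster. The EPP reaches the sidecars over localhost, so if external access to the training and prediction ports is not needed, a ClusterIP variant is a minimal alternative sketch (illustrative, not part of this commit):

apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
  - name: epp-grpc
    protocol: TCP
    port: 9002
    targetPort: 9002
    appProtocol: http2
  type: ClusterIP  # in-cluster only; add the training/prediction/metrics ports back if they must be reachable
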
# --- EPP Deployment with Individual Container Volumes ---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1  # Increase to run multiple EPP pods for scaling
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
      # EPP container
      - name: epp
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/epp-ig-latencypredictor:latest
        imagePullPolicy: Always
        args:
        - "-poolName"
        - "vllm-llama3-8b-instruct"
        - "-poolNamespace"
        - "default"
        - "-v"
        - "4"
        - "--zap-encoder"
        - "json"
        - "-grpcPort"
        - "9002"
        - "-grpcHealthPort"
        - "9003"
        - "-enable-latency-predictor"
        env:
        - name: PREDICTION_SERVER_URL
          value: "http://localhost:8001,http://localhost:8002,http://localhost:8003"  # Comma-separated list of in-pod prediction servers
        - name: TRAINING_SERVER_URL
          value: "http://localhost:8000"  # Single training server that receives training data
        - name: LATENCY_MAX_SAMPLE_SIZE
          value: "10000"  # Maximum sample size for latency prediction
        ports:
        - containerPort: 9002
        - containerPort: 9003
        - name: metrics
          containerPort: 9090
        livenessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
        readinessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
      # Training server sidecar container
      - name: training-server
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-training-server:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 8000
          name: training-port
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8000
          initialDelaySeconds: 30
          periodSeconds: 20
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8000
          initialDelaySeconds: 45
          periodSeconds: 10
        resources:
          requests:
            cpu: "2000m"
            memory: "4Gi"
          limits:
            cpu: "4000m"
            memory: "8Gi"
        envFrom:
        - configMapRef:
            name: latency-predictor-config
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: SERVER_TYPE
          value: "training"
        volumeMounts:
        - name: training-server-storage
          mountPath: /models
      # Prediction server sidecar container 1
      - name: prediction-server-1
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
        imagePullPolicy: Always
        command: ["uvicorn"]
        args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8001"]
        ports:
        - containerPort: 8001
          name: predict-port-1
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8001
          initialDelaySeconds: 15
          periodSeconds: 15
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8001
          initialDelaySeconds: 10
          periodSeconds: 5
          failureThreshold: 10
        resources:
          requests:
            cpu: "500m"
            memory: "1Gi"
          limits:
            cpu: "1000m"
            memory: "2Gi"
        envFrom:
        - configMapRef:
            name: prediction-server-config
        env:
        - name: PREDICT_PORT
          value: "8001"
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: SERVER_TYPE
          value: "prediction-1"
        - name: TRAINING_SERVER_URL
          value: "http://localhost:8000"
        volumeMounts:
        - name: prediction-server-1-storage
          mountPath: /server_models
      # Prediction server sidecar container 2
      - name: prediction-server-2
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
        imagePullPolicy: Always
        command: ["uvicorn"]
        args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8002"]
        ports:
        - containerPort: 8002
          name: predict-port-2
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8002
          initialDelaySeconds: 15
          periodSeconds: 15
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8002
          initialDelaySeconds: 10
          periodSeconds: 5
          failureThreshold: 10
        resources:
          requests:
            cpu: "500m"
            memory: "1Gi"
          limits:
            cpu: "1000m"
            memory: "2Gi"
        envFrom:
        - configMapRef:
            name: prediction-server-config
        env:
        - name: PREDICT_PORT
          value: "8002"
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: SERVER_TYPE
          value: "prediction-2"
        - name: TRAINING_SERVER_URL
          value: "http://localhost:8000"
        volumeMounts:
        - name: prediction-server-2-storage
          mountPath: /server_models
      # Prediction server sidecar container 3
      - name: prediction-server-3
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
        imagePullPolicy: Always
        command: ["uvicorn"]
        args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8003"]
        ports:
        - containerPort: 8003
          name: predict-port-3
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8003
          initialDelaySeconds: 15
          periodSeconds: 15
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8003
          initialDelaySeconds: 10
          periodSeconds: 5
          failureThreshold: 10
        resources:
          requests:
            cpu: "500m"
            memory: "1Gi"
          limits:
            cpu: "1000m"
            memory: "2Gi"
        envFrom:
        - configMapRef:
            name: prediction-server-config
        env:
        - name: PREDICT_PORT
          value: "8003"
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: SERVER_TYPE
          value: "prediction-3"
        - name: TRAINING_SERVER_URL
          value: "http://localhost:8000"
        volumeMounts:
        - name: prediction-server-3-storage
          mountPath: /server_models
      volumes:
      - name: training-server-storage
        emptyDir:
          sizeLimit: "20Gi"  # Dedicated volume for the training server
      - name: prediction-server-1-storage
        emptyDir:
          sizeLimit: "10Gi"  # Dedicated volume for prediction server 1
      - name: prediction-server-2-storage
        emptyDir:
          sizeLimit: "10Gi"  # Dedicated volume for prediction server 2
      - name: prediction-server-3-storage
        emptyDir:
          sizeLimit: "10Gi"  # Dedicated volume for prediction server 3

---
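The three prediction-server sidecars differ only in name, port, env, and volume, so scaling the prediction tier is mechanical. A minimal sketch of a hypothetical fourth server under the same image and ConfigMap assumptions; the Service would also need a latency-predictor-4 port, and PREDICTION_SERVER_URL in the epp container would gain http://localhost:8004 (fragment, not part of this commit):

      # Prediction server sidecar container 4 (illustrative)
      - name: prediction-server-4
        image: us-docker.pkg.dev/kaushikmitra-gke-dev/kaushikmitra-docker-repo/latencypredictor-v1-prediction-server:latest
        imagePullPolicy: Always
        command: ["uvicorn"]
        args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "8004"]
        ports:
        - containerPort: 8004
          name: predict-port-4
        envFrom:
        - configMapRef:
            name: prediction-server-config
        env:
        - name: PREDICT_PORT
          value: "8004"
        - name: SERVER_TYPE
          value: "prediction-4"
        - name: TRAINING_SERVER_URL
          value: "http://localhost:8000"
        volumeMounts:
        - name: prediction-server-4-storage
          mountPath: /server_models
      # ...plus a matching entry under volumes:
      - name: prediction-server-4-storage
        emptyDir:
          sizeLimit: "10Gi"
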
# --- RBAC ---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencemodels"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create

---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read-binding
subjects:
- kind: ServiceAccount
  name: default
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: pod-read
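
This binding grants pod-read to the default ServiceAccount of the default namespace. A common hardening step, sketched here as a suggestion rather than part of this commit, is a dedicated ServiceAccount referenced from the Deployment via spec.template.spec.serviceAccountName:

apiVersion: v1
kind: ServiceAccount
metadata:
  name: vllm-llama3-8b-instruct-epp  # hypothetical name, for illustration
  namespace: default
# ...then point the ClusterRoleBinding subject at this ServiceAccount
# and set serviceAccountName in the EPP Deployment's pod spec.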

config/manifests/inferencepool-resources.yaml

Lines changed: 4 additions & 4 deletions
@@ -205,11 +205,11 @@ roleRef:
           periodSeconds: 10
         resources:
           requests:
-            cpu: "1000m"
-            memory: "2Gi"
+            cpu: "8000m"
+            memory: "8Gi"
           limits:
-            cpu: "2000m"
-            memory: "4Gi"
+            cpu: "16000m"
+            memory: "12Gi"
         envFrom:
         - configMapRef:
             name: latency-predictor-config
