# End-to-end "prefill heavy" benchmark of the inference gateway on GKE.
name: GKE Prefill Heavy Test

on:
  # Runs with a PR comment /run-gke-prefill-heavy
  issue_comment:
    types: [created]
  # Manual run against a pull-request number or a branch name.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'Pull-request number or branch name to test'
        required: true
        # Must be an exact ref name: the value is later handed to `git checkout`.
        default: 'main'
        type: string
14+
# Workflow-wide GITHUB_TOKEN scope: repository contents are readable, nothing is writable.
permissions:
  contents: read
17+
jobs:
  # Deploys the model server, InferencePool and Gateway into ${NAMESPACE}, validates the
  # installation, runs the prefill-heavy benchmark job, then collects logs and cleans up.
  deploy_and_validate:
    # Run on manual dispatch, or when a trusted user (owner/member/collaborator) comments
    # /run-gke-prefill-heavy on a pull request.
    # NOTE: the issue_comment payload only carries URL fields under `issue.pull_request`
    # (no `base.ref`), so the PR's target branch cannot be checked in this expression.
    if: >
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/run-gke-prefill-heavy') &&
        (
          github.event.comment.author_association == 'OWNER' ||
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR'
        )
      )
    name: Test on ${{ matrix.accelerator.name }}
    runs-on: ubuntu-latest

    # An explicit `shell: bash` runs scripts with `-eo pipefail`, so a failing
    # `kubectl`/`helm` on the left side of `| tee` fails the step instead of being masked.
    defaults:
      run:
        shell: bash

    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        accelerator:
          - name: GPU
            pod_readiness_sleep_seconds: 180

    env:
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      GKE_CLUSTER_ZONE: us-east5
      NAMESPACE: igw-prefill-heavy
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || github.event.number || 'actions' }}
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      MODEL: meta-llama/Llama-3.1-8B-Instruct
      GSA_EMAIL: ${{ secrets.GCS_WORKLOAD_SA }}
      GCS_BUCKET: igw-e2e-benchmark-results
      KSA_NAME: igw-e2e-benchmark-sa

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          persist-credentials: false

      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        env:
          # workflow_dispatch supplies the input; a comment-triggered run has no inputs,
          # so fall back to the number of the PR that was commented on.
          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number }}
        shell: bash
        run: |
          # Persist the resolved value for all later steps (overrides the job-level default).
          echo "PR_OR_BRANCH=${PR_OR_BRANCH:-actions}" >> "$GITHUB_ENV"
          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          elif [[ "${{ github.event_name }}" = "pull_request" ]]; then
            echo "PR_OR_BRANCH=${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
            echo "is_pr=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_pr=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          git fetch origin pull/"$PR_OR_BRANCH"/head:pr-"$PR_OR_BRANCH"
          git checkout pr-"$PR_OR_BRANCH"

      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        run: |
          # actions/checkout only fetched the ref that triggered the run, so make the
          # requested branch available first. Best-effort: if the fetch fails, the checkout
          # below still succeeds for a branch that is already present locally.
          git fetch --depth=1 origin "+refs/heads/${PR_OR_BRANCH}:refs/remotes/origin/${PR_OR_BRANCH}" \
            || echo "Warning: could not fetch branch '${PR_OR_BRANCH}' from origin" >&2
          git checkout "$PR_OR_BRANCH"

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}"

      - name: Create namespace
        run: |
          kubectl create namespace "${NAMESPACE}" || echo "Namespace already exists"

      - name: Create hf-token secret
        run: |
          # Read the token from the job-level HF_TOKEN variable rather than interpolating
          # the secret into the script text.
          kubectl create secret generic hf-token \
            --from-literal="token=${HF_TOKEN}" \
            --namespace "${NAMESPACE}" \
            --dry-run=client -o yaml | kubectl apply -f -

      - name: Create and Annotate KSA for Workload Identity
        run: |
          kubectl create serviceaccount "${KSA_NAME}" --namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
          kubectl annotate serviceaccount "${KSA_NAME}" \
            "iam.gke.io/gcp-service-account=${GSA_EMAIL}" \
            --overwrite \
            --namespace "${NAMESPACE}"

      - name: Deploy Model Server and CRDs
        run: |
          cd config/manifests/vllm
          echo "Deploying Model Server..."
          # All deployment steps append (tee -a) to one shared log that is uploaded later.
          kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "Installing CRDs"
          kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Deploy InferencePool and Endpoint Picker Extension
        run: |
          export IGW_CHART_VERSION=v1.1.0
          helm install vllm-llama3-8b-instruct \
            --namespace "${NAMESPACE}" \
            --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
            --set provider.name="${GATEWAY_TYPE}" \
            --version "${IGW_CHART_VERSION}" \
            oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-prefill-heavy-deployment.log
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Wait for all pods to be ready
        run: |
          kubectl wait pod \
            --for=condition=Ready \
            --all \
            -n "${NAMESPACE}" \
            --timeout=25m
          sleep ${{ matrix.accelerator.pod_readiness_sleep_seconds }} # TODO: remove this once examples have readiness probes
          echo "✅ All pods are ready."
          kubectl get pods -n "${NAMESPACE}"

      - name: Deploy Gateway
        run: |
          GATEWAY_NAME=inference-gateway
          # Remove leftovers from a previous run before re-applying the manifests.
          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found
          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found
          echo "Deploying Gateway..."
          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/gateway.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "Deploying HTTPRoute..."
          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/httproute.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Wait for gateway to be ready
        run: |
          GATEWAY_NAME=inference-gateway
          kubectl wait gateway/"${GATEWAY_NAME}" \
            --for=condition=Programmed=True \
            -n "${NAMESPACE}" \
            --timeout=500s
          echo "✅ Gateway is ready."
          kubectl get gateway -n "${NAMESPACE}"

      - name: Show deployment status
        run: |
          echo "=== Deployments ==="
          kubectl get deployments -n "${NAMESPACE}"
          echo ""
          echo "=== Pods ==="
          kubectl get pods -n "${NAMESPACE}"
          echo ""
          echo "=== Services ==="
          kubectl get svc -n "${NAMESPACE}"
          echo ""
          echo "=== Helm releases ==="
          helm list -n "${NAMESPACE}" || true
          echo ""
          echo "=== Inference Pools ==="
          kubectl get inferencepools -n "${NAMESPACE}" || true
          echo ""
          echo "=== HTTPRoutes ==="
          kubectl get httproutes -n "${NAMESPACE}" -o yaml || true
          echo ""
          echo "=== Gateway ==="
          kubectl get Gateway -n "${NAMESPACE}" || true
          echo ""

      - name: Verify installation and run validation test
        run: |
          cd .github/scripts/e2e
          ./e2e-validate.sh -n "${NAMESPACE}" -v -m "${MODEL}"

      - name: Run benchmarking test
        run: |
          TIMESTAMP=$(date +"%Y-%m-%d-%H-%M-%S")
          cd benchmarking/single-workload
          # Use GATEWAY_HOST when provided, otherwise the first address of the first Gateway.
          host="${GATEWAY_HOST:-$(kubectl get gateway -n "$NAMESPACE" \
            -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)}"
          if [[ -z "$host" ]]; then
            echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
            exit 1
          fi
          port=80
          svc_host="${host}:${port}"
          helm install prefill-heavy-benchmark ../inference-perf/ -f prefill-heavy-values.yaml \
            --namespace "${NAMESPACE}" \
            --create-namespace \
            --set hfToken="${HF_TOKEN}" \
            --set "config.server.base_url=http://${svc_host}" \
            --set "job.serviceAccountName=$KSA_NAME" \
            --set "job.image.tag=v0.2.0" \
            --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
            --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${TIMESTAMP}" \
            --set "gcsPath=gs://${GCS_BUCKET}/datasets/billsum_conversations.json" \
            --set "config.data.path=/gcsDataset/gcs-dataset.json" \
            --set-string 'job.resources.limits.nvidia\.com/gpu=1'

      - name: Wait for benchmarking job to finish
        run: |
          job_name=prefill-heavy-benchmark-inference-perf-job
          TIMEOUT_DURATION="7200s"
          if ! kubectl wait --for=condition=complete job/"$job_name" -n "$NAMESPACE" --timeout="$TIMEOUT_DURATION"; then
            # Dump diagnostics before failing the step.
            echo "Error: Benchmark job $job_name did not complete successfully within $TIMEOUT_DURATION." >&2
            echo "--- Job Description ---" >&2
            kubectl describe job "$job_name" -n "$NAMESPACE" >&2
            echo "--- Pod Logs (Last 50 lines) ---" >&2
            kubectl logs -l job-name="$job_name" -n "$NAMESPACE" --all-containers=true --tail 50 >&2
            exit 1
          fi
          echo "✅ Benchmarking Job Completed."

      - name: Collect and upload Kubernetes pod logs
        if: always()
        run: |
          mkdir -p pod-logs-inference-prefill-heavy
          cd pod-logs-inference-prefill-heavy
          # Collection is best-effort: a pod whose logs cannot be read must not prevent the
          # remaining files (in particular the deployment log) from reaching the artifact.
          echo "Fetching ${NAMESPACE} pods log..."
          kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
            | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1' \
            || echo "Warning: some pod logs could not be collected" >&2
          echo "Fetching ${NAMESPACE} pods descriptions..."
          kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
            | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1' \
            || echo "Warning: some pod descriptions could not be collected" >&2
          mv ~/igw-prefill-heavy-deployment.log . || true
          mv ~/install-deps.log . || true

      - name: Upload pod logs as artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: igw-pod-logs-inference-prefill-heavy-${{ matrix.accelerator.name }}
          path: pod-logs-inference-prefill-heavy

      - name: Send Google Chat notification on failure
        if: failure()
        uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
        with:
          webhookUrl: ${{ secrets.GOOGLE_CHAT_WEBHOOK }}
          jobStatus: ${{ job.status }}
          title: '${{ github.workflow }} - ${{ matrix.accelerator.name }}'

      - name: Cleanup deployment
        if: always()
        run: |
          GATEWAY_NAME=inference-gateway
          # Attempt every cleanup command even if an earlier one fails, then report
          # failure at the end so leftover resources are still surfaced.
          status=0
          helm uninstall vllm-llama3-8b-instruct -n "${NAMESPACE}" --ignore-not-found || status=1
          helm uninstall prefill-heavy-benchmark -n "${NAMESPACE}" --ignore-not-found || status=1
          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found || status=1
          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found || status=1
          exit "${status}"