[WIP] Add e2e test for tune API with LLM hyperparameter optimization
#256
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end test for the tune API.
# Triggered by pull requests, except those whose changes are confined to the
# UI frontend sources listed under paths-ignore.
name: E2E Test with tune API

on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"

# One active run per workflow and ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120

    strategy:
      # Do not cancel the other matrix entries when one of them fails.
      fail-fast: false
      matrix:
        # Kubernetes versions to test with
        kubernetes-version: ["v1.29.2"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local composite action; receives the Kubernetes version from the matrix.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Training Operator SDK
        shell: bash
        run: |
          pip install "kubeflow-training[huggingface]==1.8.1"

      # Record free disk space before the test so it can be compared with the
      # "after" snapshot below.
      - name: Check Disk Space Before Test
        run: |
          echo "Checking disk space usage before e2e test..."
          df -h

      # Local composite action that runs the actual e2e test.
      - name: Run e2e test with tune API
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      # Everything below is diagnostics: `if: always()` makes each step run
      # even when an earlier step has failed.
      - name: Check Disk Space After Test
        if: always()
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h

      # Describe the tune-example-2 master pod(s), dump their logs and list
      # the related events. This step must never fail the job by itself, so
      # every lookup is allowed to come back empty.
      - name: Fetch Experiment Pod Logs
        if: always()
        run: |
          echo "Fetching logs for experiment pod..."
          kubectl get pods -n default || true
          # grep exits non-zero when nothing matches; "|| true" keeps the
          # default "bash -e" shell from aborting the step in that case.
          POD_NAMES=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master || true)
          if [ -z "$POD_NAMES" ]; then
            # Without this guard an empty name would make
            # "kubectl describe pod" describe every pod in the namespace.
            echo "No tune-example-2 master pod found in namespace default."
          fi
          for POD_NAME in $POD_NAMES; do
            kubectl describe pod "$POD_NAME" -n default || true
            kubectl logs "$POD_NAME" -n default --all-containers=true || true
          done
          kubectl get events -n default | grep "tune-example-2" || true

      # Reading the system journal requires sudo.
      # NOTE(review): this reads the runner host's journal; if the cluster
      # nodes run inside containers the kubelet unit lives there instead -
      # confirm against template-setup-e2e-test.
      - name: Fetch Kubelet Logs
        if: always()
        run: |
          echo "Fetching kubelet logs..."
          sudo journalctl -u kubelet

      - name: Check container runtime logs
        if: always()
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl -u docker