# [WIP] Add e2e test for tune API with LLM hyperparameter optimization (#270)
# Workflow file for this run.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# End-to-end test workflow for the tune API.
# Runs on pull requests (frontend-only changes are ignored), sets up the e2e
# test environment, runs the tune API e2e test, and then collects resource and
# log diagnostics regardless of the test outcome.
name: E2E Test with tune API

on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"

# One run per workflow/ref: a newer push cancels the run still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120

    strategy:
      fail-fast: false
      matrix:
        # Kubernetes versions to test with
        kubernetes-version: ["v1.29.2"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Training Operator SDK
        shell: bash
        run: |
          pip install "kubeflow-training[huggingface]==1.8.1"

      # Reclaim disk space, then report what is left before the test starts.
      - name: Check Disk Space Before Test
        run: |
          # -f skips the interactive confirmation prompt; the runner has no
          # TTY to answer it, so without -f nothing would be pruned.
          docker system prune -a -f
          docker volume prune -f
          echo "Checking disk space usage before e2e test..."
          df -h  # Run 'df' to check free disk space

      - name: Monitor Memory Usage Before Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage Before Run
        if: always()
        run: |
          docker stats --no-stream

      # NOTE(review): restarting Docker after the test environment is set up
      # may disrupt containers created by the setup step - confirm this is
      # intended.
      - name: Restart Docker Service
        run: |
          echo "Restarting Docker service..."
          sudo systemctl restart docker
          echo "Docker service status:"
          sudo systemctl --no-pager -l -o short status docker
          kubectl get pods -n kubeflow

      # always() makes the test run even when an earlier step has failed.
      - name: Run e2e test with tune API
        if: always()
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      - name: Monitor Memory Usage After Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage After Run
        if: always()
        run: |
          docker stats --no-stream

      - name: Check Disk Space After Test
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h  # Run 'df' to check free disk space

      # Describe and dump logs for the tune-example-2 master pod(s), then
      # show the related events.
      - name: Fetch Experiment Pod Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching all the pods in the default namespace..."
          kubectl get pods -n default
          # grep exits non-zero when nothing matches; '|| true' keeps the
          # step (run with 'bash -e') alive so the remaining diagnostics
          # are still collected.
          POD_NAMES=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master || true)
          if [ -z "${POD_NAMES}" ]; then
            echo "No tune-example-2 master pod found in the default namespace."
          fi
          # Pod names contain no whitespace, so word splitting is safe here.
          for POD_NAME in ${POD_NAMES}; do
            echo "Fetching pod description for experiment pod ${POD_NAME}..."
            kubectl describe pod "${POD_NAME}" -n default
            echo "Fetching logs for experiment pod ${POD_NAME}..."
            # Logs are unavailable for pods that never started; do not let
            # that hide the events below.
            kubectl logs "${POD_NAME}" -n default --all-containers || true
          done
          echo "Fetching events for experiment pod..."
          kubectl get events -n default | grep "tune-example-2" || true

      # Requires sudo for accessing kubelet logs.
      - name: Fetch Kubelet Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching kubelet logs..."
          sudo journalctl -u kubelet

      - name: Check container runtime logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl -u docker