diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index ac43b597d15..c7dea530989 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -225,7 +225,7 @@ steps: - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" timeout_in_minutes: 120 commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_bagel_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -260,10 +260,48 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Bagel Function Test with H100" + timeout_in_minutes: 120 + commands: + - pytest -s -v tests/e2e/online_serving/test_bagel_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" timeout_in_minutes: 60 commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_bagel_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: