diff --git a/.github/workflows/flash_attention.yml b/.github/workflows/flash_attention.yml
index c6fc6635..201aaff1 100644
--- a/.github/workflows/flash_attention.yml
+++ b/.github/workflows/flash_attention.yml
@@ -60,7 +60,7 @@ jobs:
pushd fa4
python setup.py install
pip install -e flash_attn/cute/
-
+
echo '
B200' >> /tmp/workspace/fa4_output.txt
nvidia-smi -q -d POWER | grep 'Current Power Limit' | head -1 | cut -d : -f 2 >> /tmp/workspace/fa4_output.txt
echo '
' >> /tmp/workspace/fa4_output.txt
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index ef9cdc1c..f292c5c5 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -327,9 +327,17 @@ jobs:
"${DOCKER_IMAGE}"
)
if [[ "${DEVICE_NAME}" == "cuda" ]]; then
- docker exec -t "${container_name}" bash -c "pip install torchao==0.14.1 fbgemm-gpu-genai==1.4.1"
+ docker exec -t "${container_name}" bash -c "
+ pip install torchao==0.14.1 fbgemm-gpu-genai==1.4.1
+
+ # Quick mitigation for https://github.com/vllm-project/vllm/issues/32373:
+ # drop the CUDA compat entry from the loader config, then refresh the ldconfig cache
+ ldconfig
+ "
fi
- docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh"
+ docker exec -t "${container_name}" bash -c "
+ cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+ "
- name: Authenticate with AWS
# AWS CUDA runners already have access to the bucket via its runner IAM role