Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pr-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ jobs:
- name: Run test
timeout-minutes: 10
run: |
docker exec -w /sglang-checkout/test ci_sglang python3 run_suite.py
docker exec -w /sglang-checkout/test ci_sglang python3 run_suite.py --hw cuda --suite stage-a-test-1

unit-test-backend-1-gpu-amd:
needs: [check-changes, stage-a-test-1-amd]
Expand Down
17 changes: 9 additions & 8 deletions scripts/ci/amd_ci_install_dependency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,38 +12,39 @@ else
fi

# Install the required dependencies in CI.
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang chown -R root:root /sgl-data/pip-cache
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"

case "${GPU_ARCH}" in
mi35x)
echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
docker exec ci_sglang rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
docker exec ci_sglang pip install -e "python[dev_hip]" --no-deps # TODO: only for mi35x
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[dev_hip]" --no-deps # TODO: only for mi35x
# For lmms_evals evaluating MMMU
docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
docker exec -w /lmms-eval ci_sglang pip install -e . --no-deps # TODO: only for mi35x
docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . --no-deps # TODO: only for mi35x
;;
mi30x|mi300|mi325)
echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
docker exec ci_sglang rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
docker exec ci_sglang pip install -e "python[dev_hip]"
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[dev_hip]"
# For lmms_evals evaluating MMMU
docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
docker exec -w /lmms-eval ci_sglang pip install -e .
docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
;;
*)
echo "Runner architecture '${GPU_ARCH}' unrecognised;" >&2
;;
esac

docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/

docker exec ci_sglang pip install huggingface_hub[hf_xet]
docker exec ci_sglang pip install pytest
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet]
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest
24 changes: 17 additions & 7 deletions scripts/ci/amd_ci_start_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
SGLANG_VERSION="v0.5.5" # Default version, will be overridden if version.py is found

TMP_VERSION_FILE=$(mktemp)
if git fetch origin main --quiet; then
if git fetch --depth=1 origin main; then
if git show origin/main:python/sglang/version.py >"$TMP_VERSION_FILE" 2>/dev/null; then
VERSION_FROM_FILE="v$(cat "$SGLANG_VERSION_FILE" | cut -d'"' -f2)"
if [ -n "$VERSION_FROM_FILE" ]; then
Expand All @@ -25,8 +25,9 @@ rm -f "$TMP_VERSION_FILE"


# Default base tags (can be overridden by command line arguments)
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
ROCM_VERSION="rocm700"
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"

# Parse command line arguments
MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
Expand Down Expand Up @@ -116,6 +117,15 @@ find_latest_image() {
fi
done

echo "No recent images found. Searching any cached local images matching ROCm+arch…" >&2
local any_local
any_local=$(docker images --format '{{.Repository}}:{{.Tag}}' --filter "reference=rocm/sgl-dev:*${ROCM_VERSION}*${gpu_arch}*" | sort -r | head -n 1)
if [[ -n "$any_local" ]]; then
echo "Using cached fallback image: ${any_local}" >&2
echo "${any_local}"
return 0
fi

echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
echo "Using hard-coded fallback…" >&2
if [[ "${gpu_arch}" == "mi35x" ]]; then
Expand All @@ -130,9 +140,9 @@ IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"

HF_CACHE_HOST=/home/runner/sgl-data/hf-cache
if [[ -d "$HF_CACHE_HOST" ]]; then
CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home"
CACHE_HOST=/home/runner/sgl-data
if [[ -d "$CACHE_HOST" ]]; then
CACHE_VOLUME="-v $CACHE_HOST:/sgl-data"
else
CACHE_VOLUME=""
fi
Expand All @@ -145,7 +155,7 @@ docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
--shm-size 32g \
--cap-add=SYS_PTRACE \
-e HF_TOKEN="${HF_TOKEN:-}" \
-e HF_HOME=/hf_home \
-e HF_HOME=/sgl-data/hf-cache \
--security-opt seccomp=unconfined \
-w /sglang-checkout \
--name ci_sglang \
Expand Down
2 changes: 1 addition & 1 deletion test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@
# TestFile("hicache/test_hicache.py", 116), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/12575
# TestFile("hicache/test_hicache_mla.py", 127), # Disabled temporarily, # Temporarily disabled, see https://github.com/sgl-project/sglang/issues/12574
# TestFile("hicache/test_hicache_storage.py", 127), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/12575
TestFile("lora/test_lora.py", 150),
TestFile("lora/test_lora.py", 665),
# TestFile("lora/test_lora_backend.py", 99), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
# TestFile("lora/test_lora_cuda_graph.py", 250), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107
TestFile("lora/test_lora_eviction.py", 240),
Expand Down
Loading