From 974151e5db389a9c76e3e080b9d5fcca8327ae3e Mon Sep 17 00:00:00 2001
From: Kay Yan <kay.yan@daocloud.io>
Date: Tue, 2 Dec 2025 10:34:41 +0000
Subject: [PATCH] fix: prevent HuggingFace access when SGLANG_USE_MODELSCOPE is
 enabled

Signed-off-by: Kay Yan <kay.yan@daocloud.io>
---
 .github/workflows/pr-test-npu.yml             |  2 +-
 python/sglang/srt/configs/model_config.py     |  9 ++++++--
 .../sglang/srt/utils/hf_transformers_utils.py | 21 ++++++++++++++-----
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml
index af19e7549531..3e8d7c3e8d4f 100644
--- a/.github/workflows/pr-test-npu.yml
+++ b/.github/workflows/pr-test-npu.yml
@@ -70,7 +70,7 @@ jobs:
         env:
           SGLANG_USE_MODELSCOPE: true
           SGLANG_IS_IN_CI: true
-          HF_ENDPOINT: https://hf-mirror.com
+          HF_ENDPOINT: https://127.0.0.1 # Intentionally unreachable Hugging Face endpoint
           TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
           PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
           STREAMS_PER_DEVICE: 32
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 26dfbe5eb1d5..ef76ba350fd9 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -25,7 +25,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_hip, retry
+from sglang.srt.utils import get_bool_env_var, is_hip, retry
 from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
@@ -521,9 +521,14 @@ def _parse_quant_hf_config(self):
                 import huggingface_hub
 
                 try:
-                    from huggingface_hub import HfApi, hf_hub_download
 
+                    # Pick hub client via SGLANG_USE_MODELSCOPE. NOTE(review): ModelScope branch skips HfApi, used below - confirm.
+                    if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
+                        from modelscope import hf_hub_download
+                    else:
+                        from huggingface_hub import HfApi, hf_hub_download
                     hf_api = HfApi()
+
                     # Retry HF API call up to 3 times
                     file_exists = retry(
                         lambda: hf_api.file_exists(
diff --git a/python/sglang/srt/utils/hf_transformers_utils.py b/python/sglang/srt/utils/hf_transformers_utils.py
index 0e71dfb31383..335267d9f747 100644
--- a/python/sglang/srt/utils/hf_transformers_utils.py
+++ b/python/sglang/srt/utils/hf_transformers_utils.py
@@ -23,12 +23,23 @@
 from typing import Any, Dict, List, Optional, Type, Union
 
 import torch
-from huggingface_hub import snapshot_download
+
+from sglang.srt.utils import get_bool_env_var
+
+# When SGLANG_USE_MODELSCOPE is set, import these names from modelscope instead of the HF packages.
+if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
+    from modelscope import (
+        AutoConfig,
+        AutoProcessor,
+        AutoTokenizer,
+        GenerationConfig,
+        snapshot_download,
+    )
+else:
+    from huggingface_hub import snapshot_download
+    from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
+
 from transformers import (
-    AutoConfig,
-    AutoProcessor,
-    AutoTokenizer,
-    GenerationConfig,
     PretrainedConfig,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,