vllm-project · wangxiyuan · Feb 9, 2026 · Feb 9, 2026
@@ -58,18 +58,14 @@
     "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
     # The path to the HCCL library, which is used by the pyhccl communicator
     # backend. If not set, the default value is libhccl.so.
-    "HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None),
+    "HCCL_SO_PATH": lambda: os.getenv("HCCL_SO_PATH", None),
     # The version of vllm that is installed. This value is intended for
     # developers who installed vllm from source locally. In that case, the
     # version of vllm is usually changed. For example, the version of vllm may
     # be "0.9.0", but when it is installed from source, the version is usually
     # set to "0.9.1". Developers then need to set this value to "0.9.0" to make
     # sure that the correct package is installed.
     "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
-    # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
-    # training, the optimized model may not be suitable. In this case, set this
-    # value to False to disable the optimized model.
-    "USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv("USE_OPTIMIZED_MODEL", "1"))),
     # Whether to enable the MatmulAllReduce fusion kernel when tensor parallel is enabled.
     # This feature is supported on A2, and eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0"))),

@@ -21,8 +21,6 @@
 import torch_npu
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore
 
-import vllm_ascend.envs as envs_ascend
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -93,7 +91,7 @@ def forward_oot(
         # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
         q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
 
-        enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
+        enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
 
         if enable_pad:
             origin_shape = q.shape[-1]