diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 05e4131bf48..9fafb2ed18c 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -58,7 +58,7 @@
     "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
     # The path for HCCL library, it's used by pyhccl communicator backend. If
     # not set, the default value is libhccl.so.
-    "HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None),
+    "HCCL_SO_PATH": lambda: os.getenv("HCCL_SO_PATH", None),
     # The version of vllm is installed. This value is used for developers who
     # installed vllm from source locally. In this case, the version of vllm is
     # usually changed. For example, if the version of vllm is "0.9.0", but when
@@ -66,10 +66,6 @@
     # In this case, developers need to set this value to "0.9.0" to make sure
     # that the correct package is installed.
     "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
-    # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
-    # training, the optimized model may not be suitable. In this case, set this
-    # value to False to disable the optimized model.
-    "USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv("USE_OPTIMIZED_MODEL", "1"))),
     # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
     # this feature is supported in A2, and eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0"))),
diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py
index 7beb7b507a6..85388db023e 100644
--- a/vllm_ascend/ops/mm_encoder_attention.py
+++ b/vllm_ascend/ops/mm_encoder_attention.py
@@ -21,8 +21,6 @@
 import torch_npu
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore
 
-import vllm_ascend.envs as envs_ascend
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight
 
@@ -93,7 +91,7 @@ def forward_oot(
         # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
         q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
 
-        enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
+        enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
         if enable_pad:
             origin_shape = q.shape[-1]