Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions vllm_ascend/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,14 @@
"ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
# The path for HCCL library, it's used by pyhccl communicator backend. If
# not set, the default value is libhccl.so.
"HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None),
"HCCL_SO_PATH": lambda: os.getenv("HCCL_SO_PATH", None),
# The version of vllm that is installed. This value is intended for developers
# who installed vllm from source locally. In that case, the reported version of
# vllm usually differs. For example, the version of vllm may be "0.9.0", but when
# it's installed from source, the version is usually set to "0.9.1".
# In this case, developers need to set this value to "0.9.0" to make sure
# that the correct package is installed.
"VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
# Some models are optimized by vllm ascend. In some cases, e.g. rlhf
# training, the optimized model may not be suitable. In this case, set this
# value to 0 to disable the optimized model (the value is parsed as an integer).
"USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv("USE_OPTIMIZED_MODEL", "1"))),
# Whether to enable the MatmulAllReduce fusion kernel when tensor parallelism is enabled.
# This feature is supported on A2, and eager mode will get better performance.
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0"))),
Expand Down
4 changes: 1 addition & 3 deletions vllm_ascend/ops/mm_encoder_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
import torch_npu
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore

import vllm_ascend.envs as envs_ascend

MIN_PAD_SIZE = 64 # min_size to pad weight
MAX_PAD_SIZE = 128 # max_size to pad weight

Expand Down Expand Up @@ -93,7 +91,7 @@ def forward_oot(
# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)

enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE

if enable_pad:
origin_shape = q.shape[-1]
Expand Down
Loading