4 changes: 2 additions & 2 deletions .github/workflows/labled_doctest.yaml
@@ -44,11 +44,11 @@ jobs:
# Each version should be tested
fail-fast: false
matrix:
-vllm_verison: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler]
+vllm_version: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler]
name: vLLM Ascend test
runs-on: linux-aarch64-a2b3-1
container:
-image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }}
+image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_version }}
steps:
- name: Check NPU/CANN and git info
run: |
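(Note: the misspelled matrix key and the `${{ matrix.vllm_verison }}` reference were consistent with each other, so the job resolved the intended container image even before this change; here and in the nightly workflow below, the rename is a naming cleanup with no behavior change.)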
6 changes: 3 additions & 3 deletions .github/workflows/schedule_nightly_test_a2.yaml
@@ -146,10 +146,10 @@ jobs:
# Each version should be tested
fail-fast: false
matrix:
-vllm_verison: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler]
+vllm_version: [releases-v0.13.0, releases-v0.13.0-openeuler, main, main-openeuler]
runs-on: linux-aarch64-a2b3-1
container:
-image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }}
+image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_version }}
steps:
- name: Check NPU/CANN and git info
run: |
@@ -183,4 +183,4 @@ jobs:

# Run real test
echo "Test:"
-/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh
\ No newline at end of file
+/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -1,16 +1,19 @@
default_install_hook_types:
- pre-commit
- commit-msg
+
default_stages:
- pre-commit # Run locally
- manual # Run in CI
+
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.0
hooks:
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
+
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
@@ -22,6 +25,7 @@ repos:
]
additional_dependencies:
- tomli
+
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
@@ -30,24 +34,28 @@ repos:
"--force-exclude",
"--exclude", "csrc/**"
]
+
# - repo: https://github.com/pre-commit/mirrors-clang-format
# rev: v20.1.3
# hooks:
# - id: clang-format
# files: ^csrc/.*\.(cpp|hpp|cc|hh|cxx|hxx)$
# types_or: [c++]
# args: [--style=google, --verbose]
+
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
hooks:
- id: markdownlint
exclude: '.*\.inc\.md$|.*report_template\.md$|.*contributors\.md$|.*PULL_REQUEST_TEMPLATE\.md$'
stages: [manual] # Only run in CI
+
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
exclude: '.*\.github/workflows/scripts/.*\.ya?ml$'
+
- repo: local
hooks:
- id: png-lint
@@ -25,5 +25,5 @@ msgid "Adding a New Multi-Modal Model"
msgstr "添加新的多模态模型"

#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3
-msgid "**_Comming soon ..._**"
+msgid "**_Coming soon ..._**"
msgstr "**_敬请期待 ..._**"
2 changes: 1 addition & 1 deletion docs/source/user_guide/release_notes.md
@@ -636,7 +636,7 @@ This is the 3rd release candidate of v0.9.1 for vLLM Ascend. Please follow the [
- Fix incorrect req block length in ascend scheduler [#2394](https://github.com/vllm-project/vllm-ascend/pull/2394)
- Fix header include issue in rope [#2398](https://github.com/vllm-project/vllm-ascend/pull/2398)
- Fix mtp config bug [#2412](https://github.com/vllm-project/vllm-ascend/pull/2412)
-- Fix error info and adapt `attn_metedata` refactor [#2402](https://github.com/vllm-project/vllm-ascend/pull/2402)
+- Fix error info and adapt `attn_metadata` refactor [#2402](https://github.com/vllm-project/vllm-ascend/pull/2402)
- Fix torchair runtime error caused by configuration mismatches and `.kv_cache_bytes` file missing [#2312](https://github.com/vllm-project/vllm-ascend/pull/2312)
- Move `with_prefill` allreduce from cpu to npu [#2230](https://github.com/vllm-project/vllm-ascend/pull/2230)

@@ -178,7 +178,7 @@ def abort_prefiller_request(self, server_idx: int, request_id): # Changed to sy
# No lock needed - atomic operation
self.prefillers[server_idx].aborted_requests.add(request_id)

-def aquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous
+def acquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous
"""
Get the set of aborted requests and clear it.
This is used to release kv cache in prefiller node.
@@ -325,7 +325,7 @@ async def send_request_to_service(
max_retries: int = 3,
base_delay: float = 0.2,
):
-proxy_state.aquire_aborted_prefiller_requests(prefiller_id)
+proxy_state.acquire_aborted_prefiller_requests(prefiller_id)
req_data = req_data.copy()
req_data["stream"] = False
req_data["max_tokens"] = 1
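The `aquire` → `acquire` rename touches both the definition and its call sites across the two proxy examples. For readers skimming the diff, the method's docstring describes a take-and-clear handoff; below is a minimal runnable sketch of that pattern, where `PrefillerState` and its fields are assumptions reconstructed from the surrounding context, not the examples' actual classes.

```python
from dataclasses import dataclass, field


@dataclass
class PrefillerState:
    # Assumed container; the real proxy tracks more per-server state.
    url: str = ""
    aborted_requests: set[str] = field(default_factory=set)


class ProxyState:
    def __init__(self, num_prefillers: int):
        self.prefillers = [PrefillerState() for _ in range(num_prefillers)]

    def abort_prefiller_request(self, server_idx: int, request_id: str) -> None:
        # Single attribute mutation; no lock needed, as the diff's comment notes.
        self.prefillers[server_idx].aborted_requests.add(request_id)

    def acquire_aborted_prefiller_requests(self, server_idx: int) -> set[str]:
        # Swap in a fresh set so the caller owns the old one exclusively,
        # matching the docstring: get the aborted set and clear it.
        aborted = self.prefillers[server_idx].aborted_requests
        self.prefillers[server_idx].aborted_requests = set()
        return aborted
```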
@@ -241,7 +241,7 @@ def abort_prefiller_request(self, server_idx: int, request_id): # Changed to sy
return
self.prefillers[server_idx].aborted_requests.add(request_id)

-def aquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous
+def acquire_aborted_prefiller_requests(self, server_idx: int): # Changed to synchronous
"""
Get the set of aborted requests and clear it.
This is used to release kv cache in prefiller node.
@@ -582,7 +582,7 @@ async def send_request_to_service(
max_retries: int = 3,
base_delay: float = 0.2,
):
-aborted_requests = proxy_state.aquire_aborted_prefiller_requests(prefiller_id)
+aborted_requests = proxy_state.acquire_aborted_prefiller_requests(prefiller_id)
req_data = req_data.copy()
req_data["kv_transfer_params"] = {
"do_remote_decode": True,
4 changes: 2 additions & 2 deletions examples/eplb/eplb_strategy.py
@@ -59,7 +59,7 @@ def calculate_average(lst):
return total / count


-def layer_imblance_polt(y_list, label_names, device_num, output_path, file_name):
+def layer_imbalance_plot(y_list, label_names, device_num, output_path, file_name):
plt.rcParams["font.sans-serif"] = ["Arial"]
plt.rcParams["axes.unicode_minus"] = False
x = [i for i in range(58)]
@@ -160,4 +160,4 @@ def deepseek_deploy(workload, num_redundancy_expert, num_groups, num_nodes, num_
save_matrix_to_json(output_path, file_name, np.array(global_deployment))
label_names = ["default deployment max load", "balanced load max load", "balanced load avg load"]
new_file_name = f"{exp_name}_{num_devices}_{num_redundancy_expert}.png"
-layer_imblance_polt(y_list, label_names, num_devices, output_path, new_file_name)
+layer_imbalance_plot(y_list, label_names, num_devices, output_path, new_file_name)
6 changes: 3 additions & 3 deletions examples/external_online_dp/dp_load_balance_proxy_server.py
@@ -283,10 +283,10 @@ async def _select_instance(api: str, req_data: Any, request_length: int):
request_id = await proxy_state.next_req_id()
# Select dp server based on priority score
server_idx = proxy_state.select_server(priority_score)
-choosen_server = proxy_state.dp_servers[server_idx]
-logger.debug(f"Choose server {choosen_server.url} to process request {request_id}")
+chosen_server = proxy_state.dp_servers[server_idx]
+logger.debug(f"Choose server {chosen_server.url} to process request {request_id}")
return InstanceInfo(
-request_id=request_id, server_idx=server_idx, priority_score=priority_score, server_state=choosen_server
+request_id=request_id, server_idx=server_idx, priority_score=priority_score, server_state=chosen_server
)


8 changes: 4 additions & 4 deletions examples/external_online_dp/launch_online_dp.py
@@ -29,11 +29,11 @@ def parse_args():
vllm_start_port = args.vllm_start_port


-def run_command(visiable_devices, dp_rank, vllm_engine_port):
+def run_command(visible_devices, dp_rank, vllm_engine_port):
command = [
"bash",
"./run_dp_template.sh",
-visiable_devices,
+visible_devices,
str(vllm_engine_port),
str(dp_size),
str(dp_rank),
@@ -55,8 +55,8 @@ def run_command(visiable_devices, dp_rank, vllm_engine_port):
for i in range(dp_size_local):
dp_rank = dp_rank_start + i
vllm_engine_port = vllm_start_port + i
-visiable_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size))
-process = multiprocessing.Process(target=run_command, args=(visiable_devices, dp_rank, vllm_engine_port))
+visible_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size))
+process = multiprocessing.Process(target=run_command, args=(visible_devices, dp_rank, vllm_engine_port))
processes.append(process)
process.start()

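The renamed `visible_devices` string carves the accelerators into contiguous tensor-parallel groups, one group per local data-parallel rank. A standalone check of that arithmetic (the `dp_size_local` and `tp_size` values here are illustrative, not taken from the script):

```python
# Illustrative values; the real script parses these from its CLI arguments.
dp_size_local, tp_size = 2, 4

for i in range(dp_size_local):
    # Rank offset i owns devices [i * tp_size, (i + 1) * tp_size).
    visible_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size))
    print(f"dp rank offset {i}: devices {visible_devices}")

# dp rank offset 0: devices 0,1,2,3
# dp rank offset 1: devices 4,5,6,7
```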
2 changes: 1 addition & 1 deletion setup.py
@@ -198,7 +198,7 @@ def run(self):
try:
print("Running bash build_aclnn.sh ...")
subprocess.check_call(["bash", "csrc/build_aclnn.sh", ROOT_DIR, envs.SOC_VERSION])
print("buid_aclnn.sh executed successfully!")
print("build_aclnn.sh executed successfully!")
except subprocess.CalledProcessError as e:
print(f"Error running build_aclnn.sh: {e}")
raise SystemExit(e.returncode)
4 changes: 2 additions & 2 deletions tests/e2e/nightly/multi_node/scripts/multi_node_config.py
@@ -10,7 +10,7 @@
# isort: off
from tests.e2e.nightly.multi_node.scripts.utils import (
CONFIG_BASE_PATH, DEFAULT_SERVER_PORT, get_all_ipv4, get_cluster_ips,
-get_net_interface, setup_logger, get_avaliable_port)
+get_net_interface, setup_logger, get_available_port)
# isort: on
setup_logger()
logger = logging.getLogger(__name__)
@@ -202,7 +202,7 @@ def __init__(
master_ip = (self.disagg_cfg.master_ip_for_node(
self.cur_index, self.nodes)
if self.disagg_cfg else self.nodes[0].ip)
-self.proxy_port = get_avaliable_port()
+self.proxy_port = get_available_port()

self.envs = DistEnvBuilder(
cur_node=self.cur_node,
2 changes: 1 addition & 1 deletion tests/e2e/nightly/multi_node/scripts/utils.py
@@ -75,7 +75,7 @@ def get_cluster_ips(word_size: int = 2) -> list[str]:
return [resolver(dns) for dns in get_cluster_dns_list(word_size)]


-def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int:
+def get_available_port(start_port: int = 6000, end_port: int = 7000) -> int:
import socket
for port in range(start_port, end_port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
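The hunk cuts off before the loop body of `get_available_port`; a plausible completion is sketched below, assuming the function bind-tests each candidate port and returns the first one that is free (the error raised at exhaustion is likewise an assumption):

```python
import socket


def get_available_port(start_port: int = 6000, end_port: int = 7000) -> int:
    # Try to bind each port in turn; the first successful bind is free.
    for port in range(start_port, end_port):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("127.0.0.1", port))
                return port
            except OSError:
                continue
    raise RuntimeError(f"no available port in [{start_port}, {end_port})")
```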
@@ -11,7 +11,7 @@
enable_custom_op()


-class TestDisptachFFNCombine:
+class TestDispatchFFNCombine:

def __init__(self, rank, world_size, port):
self.rank = rank
@@ -208,7 +208,7 @@ def generate_random_tensor(self, size, dtype):


def worker(rank: int, world_size: int, port: int, q: mp.SimpleQueue):
-op = TestDisptachFFNCombine(rank, world_size, port)
+op = TestDispatchFFNCombine(rank, world_size, port)
op.generate_hcom()
out1 = op.run_tensor_list()
q.put(out1)
@@ -11,7 +11,7 @@
enable_custom_op()


-class TestDisptachFFNCombine:
+class TestDispatchFFNCombine:

def __init__(self, rank, world_size, port):
self.rank = rank
@@ -208,7 +208,7 @@ def generate_random_tensor(self, size, dtype):


def worker(rank: int, world_size: int, port: int, q: mp.SimpleQueue):
-op = TestDisptachFFNCombine(rank, world_size, port)
+op = TestDispatchFFNCombine(rank, world_size, port)
op.generate_hcom()
out1 = op.run_tensor_list()
q.put(out1)
@@ -124,10 +124,10 @@ def create_test_data(

logits = torch.randn(num_reqs, vocab_size, device=device, dtype=dtype)

-repetiton_penalty = torch.ones(num_reqs, device=device, dtype=torch.float32)
+repetition_penalty = torch.ones(num_reqs, device=device, dtype=torch.float32)
for i in range(num_reqs):
if torch.rand(1) > 0.3:
-repetiton_penalty[i] = torch.rand(1, device=device).item() * 0.8 + 0.6
+repetition_penalty[i] = torch.rand(1, device=device).item() * 0.8 + 0.6

frequency_penalty = torch.zeros(num_reqs, device=device, dtype=torch.float32)
for i in range(num_reqs):
@@ -168,7 +168,7 @@ def create_test_data(
output_bin_counts[state_idx, token] = count

sampling_metadata = SamplingMetadata(
-repetition_penalty=repetiton_penalty,
+repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
temperature=temperature,
@@ -217,4 +217,3 @@ def test_apply_penalties_and_temperature(
atol = 1e-02
rtol = 1e-02
assert torch.allclose(logits_triton, logits_pytorch_result, atol=atol, rtol=rtol)
-
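For context on the renamed tensor: the test draws per-request repetition penalties uniformly from [0.6, 1.4) via `torch.rand(1) * 0.8 + 0.6`. The sketch below shows the conventional way such a penalty is applied to logits; it illustrates the technique only, not the Triton and PyTorch kernels this test actually compares.

```python
import torch


def apply_repetition_penalty(logits: torch.Tensor, seen_mask: torch.Tensor,
                             penalty: torch.Tensor) -> torch.Tensor:
    # Conventional rule: for tokens already generated, divide positive logits
    # by the penalty and multiply negative logits by it; leave the rest alone.
    p = penalty.unsqueeze(1)
    penalized = torch.where(logits > 0, logits / p, logits * p)
    return torch.where(seen_mask, penalized, logits)


logits = torch.randn(2, 8)
seen = torch.zeros(2, 8, dtype=torch.bool)
seen[0, 3] = True  # pretend request 0 already emitted token 3
penalty = torch.rand(2) * 0.8 + 0.6  # same [0.6, 1.4) range as the test data
out = apply_repetition_penalty(logits, seen, penalty)
assert out.shape == logits.shape
```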
4 changes: 2 additions & 2 deletions tests/ut/ops/test_moe_mlp.py
@@ -18,7 +18,7 @@ def setUpClass(cls):
}

support_combine = [(0, 0), (1, 0), (0, 1)]
-unsupport_combine = [(0, 2), (2, 1), (1, 2)]
+unsupported_combine = [(0, 2), (2, 1), (1, 2)]

def test_cumsum_group_list_supported_conversion(self):
for src_list_type, dst_list_type in self.support_combine:
@@ -38,7 +38,7 @@ def test_cumsum_group_list_invalid_type_valueerror(self):

def test_cumsum_group_list_unsupported_conversion_notimplementederror(
self):
-for src_list_type, dst_list_type in self.unsupport_combine:
+for src_list_type, dst_list_type in self.unsupported_combine:
with self.subTest(src=src_list_type, dst=dst_list_type):
with self.assertRaises(NotImplementedError) as excinfo:
cumsum_group_list(self.glist_dict[0], src_list_type,
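The renamed `unsupported_combine` list drives a parametrized negative test. Below is a self-contained sketch of the same `subTest` + `assertRaises` pattern, using a stand-in `convert` function since `cumsum_group_list` is not shown in full here:

```python
import unittest


def convert(src_list_type: int, dst_list_type: int) -> int:
    # Stand-in for cumsum_group_list's conversion dispatch; only the
    # combinations the real code supports succeed here.
    if (src_list_type, dst_list_type) not in {(0, 0), (1, 0), (0, 1)}:
        raise NotImplementedError(
            f"unsupported conversion {src_list_type} -> {dst_list_type}")
    return dst_list_type


class TestConversion(unittest.TestCase):
    unsupported_combine = [(0, 2), (2, 1), (1, 2)]

    def test_unsupported_conversion_raises(self):
        # subTest reports every failing pair instead of stopping at the first.
        for src, dst in self.unsupported_combine:
            with self.subTest(src=src, dst=dst):
                with self.assertRaises(NotImplementedError):
                    convert(src, dst)


if __name__ == "__main__":
    unittest.main()
```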
6 changes: 3 additions & 3 deletions typos.toml
@@ -1,6 +1,6 @@
[files]
# these files may be written in non english words
-extend-exclude = []
+extend-exclude = [".pre-commit-config.yaml",]
ignore-hidden = true
ignore-files = true
ignore-dot = true
@@ -17,9 +17,9 @@ ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = [".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*UE8M0.*", ".*(UE4M3|ue4m3).*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
-extend-ignore-words-re = ["CANN", "cann","ND","alog"]
+extend-ignore-words-re = ["CANN", "cann","ND","alog","nd","BA","datas","ful","udo",]
extend-ignore-re = []

[default.extend-identifiers]
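The bracket-to-parenthesis change in `extend-ignore-identifiers-re` is a real regex fix, not a spelling correction: `[UE4M3|ue4m3]` is a character class matching any single one of those characters (including the literal `|`), while `(UE4M3|ue4m3)` is the intended alternation of the two tokens. A quick Python check of the difference (typos uses Rust regexes, but the semantics are the same here):

```python
import re

over_broad = re.compile(r".*[UE4M3|ue4m3].*")  # class: U, E, 4, M, 3, |, u, e, m
intended = re.compile(r".*(UE4M3|ue4m3).*")    # alternation of the two tokens

assert over_broad.match("pipe|delimited")      # matches on a stray '|' or 'e'
assert not intended.match("pipe|delimited")    # no longer over-matches
assert intended.match("cast_to_ue4m3_format")  # still catches the intended token
```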
6 changes: 3 additions & 3 deletions vllm_ascend/ascend_config.py
@@ -144,14 +144,14 @@ def _construct_weight_prefetch_config(self, additional_config):
if os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", "0") == "1":
MAX_PREFETCH_WEIGHT_SIZE: int = 18 * 1024 * 1024
gate_up_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
-down_prefetch_szie = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
+down_prefetch_size = int(os.getenv("VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE))
self.weight_prefetch_config.set_mlp_pre_version_compatibale_config(
-gate_up_prefetch_size, down_prefetch_szie
+gate_up_prefetch_size, down_prefetch_size
)
logger.info_once(
f"MLP weight prefetch enabled from env variable VLLM_ASCEND_ENABLE_PREFETCH_MLP."
f"gate_up_prefetch_size={gate_up_prefetch_size}, "
f"down_prefetch_szie={down_prefetch_szie}."
f"down_prefetch_size={down_prefetch_size}."
)
warnings.warn(
"VLLM_ASCEND_ENABLE_PREFETCH_MLP is deprecated and will be removed in a v0.16.0 version. "
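A small aside on the corrected lines: `os.getenv` returns its default unchanged when the variable is unset, so `int(...)` receives either the `int` default or the string from the environment, and both convert cleanly. A minimal reproduction, with a hypothetical variable name standing in for the real one:

```python
import os

MAX_PREFETCH_WEIGHT_SIZE = 18 * 1024 * 1024  # 18 MiB, matching the diff

# Unset: os.getenv passes the int default through; int() is a no-op conversion.
assert int(os.getenv("DEMO_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE)) == MAX_PREFETCH_WEIGHT_SIZE

# Set: environment values are always strings, which int() parses.
os.environ["DEMO_PREFETCH_SIZE"] = str(1 << 20)
assert int(os.getenv("DEMO_PREFETCH_SIZE", MAX_PREFETCH_WEIGHT_SIZE)) == 1 << 20
```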
@@ -34,13 +34,13 @@
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass

# computation-communication tiling block is 512
-ALLREDUCE_NORM_FUSE_THREHOLD = 512
+ALLREDUCE_NORM_FUSE_THRESHOLD = 512


def get_compile_range_and_extra_stream_check():
def check_func(match: Match) -> bool:
compile_range = get_pass_context().compile_range
-return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD
+return extra_stream_scope_check(match) and compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD

return check_func

@@ -176,5 +176,5 @@ def is_applicable_for_range(self, compile_range: Range) -> bool:
"""
Check if the pass is applicable for the current configuration.
"""
-applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THREHOLD
+applicable = compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD
return applicable
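The corrected constant gates whether the fusion pass applies: fusion is only worthwhile when the compile range starts above the 512-element computation-communication tiling block. A reduced sketch of that predicate, with `Range` simplified to a start/end pair (an assumption; the real class comes from vLLM's compilation machinery):

```python
from dataclasses import dataclass

# Computation-communication tiling block is 512, as in the module above.
ALLREDUCE_NORM_FUSE_THRESHOLD = 512


@dataclass
class Range:
    start: int
    end: int


def is_applicable_for_range(compile_range: Range) -> bool:
    # Fusing allreduce + norm only pays off once the compiled shape range
    # starts beyond the tiling block size.
    return compile_range.start > ALLREDUCE_NORM_FUSE_THRESHOLD


assert not is_applicable_for_range(Range(512, 1024))
assert is_applicable_for_range(Range(513, 1024))
```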