20 changes: 17 additions & 3 deletions docs/source/user_guide/configuration/additional_config.md
@@ -27,14 +27,13 @@ The following table lists additional configuration options available in vLLM Asc
| Name | Type | Default | Description |
|-------------------------------------|------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------|
| `xlite_graph_config` | dict | `{}` | Configuration options for xlite graph mode |
| `finegrained_tp_config` | dict | `{}` | Configuration options for module tensor parallelism |
| `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
| `refresh` | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by RLHF or UT/E2E test cases. |
| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
| `kv_cache_dtype` | str | `None` | When using the KV cache quantization method, KV cache dtype needs to be set, currently only int8 is supported. |
| `enable_shared_expert_dp` | bool | `False` | When the expert is shared in DP, it delivers better performance but consumes more memory. Currently only DeepSeek series models are supported. |
| `lmhead_tensor_parallel_size` | int | `None` | The custom tensor parallel size of lmhead. |
| `oproj_tensor_parallel_size` | int | `None` | The custom tensor parallel size of oproj. |
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on MoE models with shared experts. |
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on MoE models with shared experts. |
| `dynamic_eplb` | bool | `False` | Whether to enable dynamic EPLB. |
| `num_iterations_eplb_update` | int | `400` | Forward iterations when EPLB begins. |
| `gate_eplb` | bool | `False` | Whether to enable EPLB only once. |
@@ -58,6 +57,15 @@ The details of each configuration option are as follows:
| `enabled` | bool | `False` | Whether to enable weight prefetch. |
| `prefetch_ratio` | dict | `{"attn": {"qkv": 1.0, "o": 1.0}, "moe": {"gate_up": 0.8}}` | Prefetch ratio of each weight. |

**finegrained_tp_config**

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `lmhead_tensor_parallel_size` | int | `0` | The custom tensor parallel size of lmhead. |
| `oproj_tensor_parallel_size` | int | `0` | The custom tensor parallel size of oproj. |
| `embedding_tensor_parallel_size` | int | `0` | The custom tensor parallel size of embedding. |
| `mlp_tensor_parallel_size` | int | `0` | The custom tensor parallel size of mlp. |

### Example

An example of additional configuration is as follows:
@@ -76,6 +84,12 @@ An example of additional configuration is as follows:
}
},
},
"finegrained_tp_config": {
"lmhead_tensor_parallel_size": 8,
"oproj_tensor_parallel_size": 8,
"embedding_tensor_parallel_size": 8,
"mlp_tensor_parallel_size": 8,
},
"multistream_overlap_shared_expert": True,
"refresh": False,
}
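
For reference, the options above can also be passed through the offline API. The following is a minimal sketch, assuming `additional_config` is accepted as an argument to `vllm.LLM` (as in the other examples in this guide); the model name and parallel sizes are illustrative only:

```python
from vllm import LLM

# Minimal sketch: model name and parallel sizes are illustrative.
# Each module TP size must evenly divide data_parallel_size;
# oproj_tensor_parallel_size additionally requires graph mode and a
# KV-consumer (decode) node, so it is omitted here.
llm = LLM(
    model="Qwen/Qwen3-30B-A3B",
    tensor_parallel_size=1,
    data_parallel_size=8,
    additional_config={
        "finegrained_tp_config": {
            "lmhead_tensor_parallel_size": 8,
            "embedding_tensor_parallel_size": 8,
            "mlp_tensor_parallel_size": 8,
        },
    },
)
```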
15 changes: 9 additions & 6 deletions tests/ut/distributed/test_parallel_state.py
@@ -12,15 +12,17 @@

@pytest.fixture
def parallel_config():
return ParallelConfig(data_parallel_size=2,
tensor_parallel_size=2,
pipeline_parallel_size=2)
return ParallelConfig(
data_parallel_size=2,
tensor_parallel_size=4,
pipeline_parallel_size=2,
)


@pytest.fixture
def mock_distributed():
with patch('torch.distributed.is_initialized', return_value=True), \
patch('torch.distributed.get_world_size', return_value=8), \
patch('torch.distributed.get_world_size', return_value=16), \
patch('torch.distributed.get_backend', return_value='nccl'), \
patch('vllm_ascend.distributed.parallel_state.get_world_group') as mock_group, \
patch('vllm_ascend.distributed.parallel_state.get_tp_group') as mock_tp_group, \
@@ -36,8 +38,9 @@ def mock_distributed():

def test_init_ascend_model_parallel(mock_distributed, parallel_config):
mock_ascend_config = MagicMock()
mock_ascend_config.lmhead_tensor_parallel_size = 2
mock_ascend_config.oproj_tensor_parallel_size = 2
mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2
mock_ascend_config.flashcomm2_oproj_tensor_parallel_size = 2
mock_ascend_config.pd_tp_ratio = 2
mock_ascend_config.num_head_replica = 0
19 changes: 14 additions & 5 deletions tests/ut/ops/test_linear.py
@@ -1,4 +1,3 @@
import os
import unittest
from unittest import mock
from unittest.mock import MagicMock, patch
@@ -26,7 +25,8 @@ def setUp(self):
parallel_state._OTP = self.mock_group

self.mock_ascend_config = MagicMock()
self.mock_ascend_config.oproj_tensor_parallel_size = 2
self.mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
self.mock_ascend_config.finegrained_tp_config.mlp_tensor_parallel_size = 2

self.patches = [
patch("vllm_ascend.ascend_config.get_ascend_config",
@@ -81,7 +81,11 @@ def test_process_weights_after_loading_disable_nz(self, mock_format_cast,
class TestAscendRowParallelLinear(BaseLinearTest):

def test_mlp_optimize(self):
os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"

ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False

linear = AscendRowParallelLinear(
input_size=16,
@@ -98,8 +102,9 @@ def test_oproj_tp(self):
config._current_vllm_config = MagicMock()

ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
ascend_config._ASCEND_CONFIG.finegrained_tp_config.oproj_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False

linear = AscendRowParallelLinear(
input_size=16,
@@ -115,7 +120,11 @@ class TestAscendMergedColumnParallelLinear(BaseLinearTest):
class TestAscendMergedColumnParallelLinear(BaseLinearTest):

def test_merged_mlp_tp_init(self):
os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"

ascend_config._ASCEND_CONFIG = MagicMock()
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False

linear = AscendMergedColumnParallelLinear(
input_size=16,
29 changes: 27 additions & 2 deletions tests/ut/ops/test_vocab_parallel_embedding.py
@@ -14,11 +14,12 @@
# Adapted from vllm/tests/lora/test_layers.py

import unittest
from unittest import mock
from unittest.mock import MagicMock, patch

import torch

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.distributed import parallel_state
from vllm_ascend.ops.vocab_parallel_embedding import (
AscendLogitsProcessor, AscendParallelLMHead, AscendVocabParallelEmbedding)

@@ -32,9 +33,33 @@ def setUp(self):
self.embedding_dim = 10
self.org_num_embeddings = 40
self.padding_size = 8

self.mock_group = mock.MagicMock()
self.mock_group.world_size = 2
self.mock_group.rank_in_group = 0

parallel_state._MLP_TP = self.mock_group
parallel_state._OTP = self.mock_group

mock_vllm_config = MagicMock()
mock_vllm_config.additional_config = {}
init_ascend_config(mock_vllm_config)
self.mock_ascend_config = MagicMock()
self.mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
self.mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2

self.patches = [
patch("vllm_ascend.utils.get_ascend_config",
return_value=self.mock_ascend_config),
patch("vllm_ascend.distributed.parallel_state.get_lmhead_tp_group",
return_value=self.mock_group),
patch(
"vllm.distributed.parallel_state.get_tp_group",
return_value=self.mock_group,
),
]

for p in self.patches:
p.start()

def _create_layer(self):
# Patch methods and dependencies for VocabParallelEmbedding
88 changes: 60 additions & 28 deletions vllm_ascend/ascend_config.py
@@ -67,6 +67,11 @@ def __init__(self, vllm_config):
self.ascend_compilation_config = AscendCompilationConfig(
**ascend_compilation_config)

finegrained_tp_config = additional_config.get("finegrained_tp_config",
{})
self.finegrained_tp_config = FinegrainedTPConfig(
finegrained_tp_config, vllm_config)

# Dump / PrecisionDebugger configuration
dump_config_path = additional_config.get("dump_config", None)
self.dump_config = DumpConfig(dump_config_path)
@@ -103,34 +108,6 @@ def __init__(self, vllm_config):
"multistream_overlap_shared_expert", False)
self.recompute_scheduler_enable = additional_config.get(
"recompute_scheduler_enable", False)
self.lmhead_tensor_parallel_size = additional_config.get(
"lmhead_tensor_parallel_size", None)
if self.lmhead_tensor_parallel_size is not None:
logger.info(
f"Enable lmhead_tensor_parallel_size={self.lmhead_tensor_parallel_size} in pure DP scenario"
)
if vllm_config.parallel_config.tensor_parallel_size != 1:
raise AssertionError(
"lmhead_tensor_parallel_size is only supported in the pure DP scenario"
)
self.oproj_tensor_parallel_size = additional_config.get(
"oproj_tensor_parallel_size", None)
if self.oproj_tensor_parallel_size is not None:
logger.info(
f"Enable oproj_tensor_parallel_size={self.oproj_tensor_parallel_size} in pure DP scenario"
)
if vllm_config.parallel_config.tensor_parallel_size != 1:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in the pure DP scenario"
)
if vllm_config.model_config.enforce_eager is True:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in graph mode"
)
if vllm_config.kv_transfer_config is None or not vllm_config.kv_transfer_config.is_kv_consumer:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in pd scenario and can only be used in D node."
)
self.enable_cpu_binding = additional_config.get(
"enable_cpu_binding", False)

@@ -181,6 +158,61 @@ def __init__(self, vllm_config):
kv_cfg._engine_id_patched = True


class FinegrainedTPConfig:
"""
Configuration Object for finegrained_tp_config from additional_config
"""

def __init__(self, finegrained_tp_config: dict, vllm_config):
self.oproj_tensor_parallel_size = finegrained_tp_config.get(
"oproj_tensor_parallel_size", 0)
self.lmhead_tensor_parallel_size = finegrained_tp_config.get(
"lmhead_tensor_parallel_size", 0)
self.embedding_tensor_parallel_size = finegrained_tp_config.get(
"embedding_tensor_parallel_size", 0)
self.mlp_tensor_parallel_size = finegrained_tp_config.get(
"mlp_tensor_parallel_size", 0)

enabled_configs = []
if self.oproj_tensor_parallel_size > 0:
enabled_configs.append(
f"oproj_tensor_parallel_size={self.oproj_tensor_parallel_size}"
)
# dummy_run does not run the entire attention module in eager mode, so the o_proj tp split can only be used in graph mode.
if vllm_config.model_config.enforce_eager is True:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in graph mode"
)
if vllm_config.kv_transfer_config is None or not vllm_config.kv_transfer_config.is_kv_consumer:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in pd scenario and can only be used in D node."
)
if self.lmhead_tensor_parallel_size > 0:
enabled_configs.append(
f"lmhead_tensor_parallel_size={self.lmhead_tensor_parallel_size}"
)
if self.embedding_tensor_parallel_size > 0:
enabled_configs.append(
f"embedding_tensor_parallel_size={self.embedding_tensor_parallel_size}"
)
if self.mlp_tensor_parallel_size > 0:
enabled_configs.append(
f"mlp_tensor_parallel_size={self.mlp_tensor_parallel_size}")
module_tp_sizes = [
self.oproj_tensor_parallel_size,
self.lmhead_tensor_parallel_size,
self.embedding_tensor_parallel_size,
self.mlp_tensor_parallel_size,
]
for module_tp_size in module_tp_sizes:
if module_tp_size > 0 and vllm_config.parallel_config.data_parallel_size % module_tp_size != 0:
raise AssertionError(
"module tp sizes must divide data_parallel_size")
if any(size > 0 for size in module_tp_sizes) and enabled_configs:
logger.info(
f"finegrained_tp_config enabled: {', '.join(enabled_configs)}")


class AscendCompilationConfig:
"""
Configuration for controlling the behavior of Ascend graph optimization.
Expand Down