tensorrt_llm/_torch/auto_deploy/config/default.yaml (3 additions, 0 deletions)
@@ -163,6 +163,9 @@ transforms:
   ############################################################################################
   fuse_causal_conv_activation:
     stage: compile
+  multi_stream_moe:
+    stage: compile
+    enabled: false
   compile_model:
     stage: compile
     run_per_gm: false
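Note that `default.yaml` ships the new transform disabled. A user config that opts in would only need to flip the flag, following the same shape as the surrounding entries. A minimal sketch reusing only the keys visible in this diff; the override mechanism itself is not part of this change:

```yaml
transforms:
  multi_stream_moe:
    stage: compile
    enabled: true  # default.yaml above sets this to false
```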
tests/integration/defs/accuracy/test_llm_api_autodeploy.py (7 additions, 0 deletions)
@@ -181,6 +181,10 @@ def get_default_kwargs(self):
"sharding_source": ['factory', 'heuristic'],
"sharding_dims": ['ep', 'bmm'],
},
"multi_stream_moe": {
"stage": "compile",
"enabled": True,
},
# NOTE: some accuracy benchmarks may require fp32 precision for mamba cache
# "insert_cached_ssm_attention": {
# "cache_config": {
@@ -201,6 +205,9 @@ def get_default_sampling_params(self):
     @pytest.mark.skip_less_device_memory(32000)
     def test_bf16(self):
         kwargs = self.get_default_kwargs()
+        # TODO: multi-stream MoE seems to increase memory usage
+        kwargs["max_batch_size"] = 32
+        kwargs["free_mem_ratio"] = 0.5
         sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
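The TODO about memory is consistent with what the transform name suggests: running MoE branches on separate CUDA streams keeps both branches' activations and workspaces alive at the same time, so the allocator cannot reuse one branch's buffers for the other. A generic sketch of the pattern using only standard PyTorch stream APIs; this illustrates the technique, not the TensorRT-LLM implementation, which this diff does not show:

```python
import torch

side_stream = torch.cuda.Stream()

def overlapped_moe(x, shared_expert, routed_experts):
    """Illustrative two-stream MoE overlap (hypothetical helper)."""
    # The side stream must wait for work already queued on the default
    # stream before it reads `x`.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        shared_out = shared_expert(x)   # overlaps with routed dispatch
    routed_out = routed_experts(x)      # runs on the default stream
    # Re-join: the default stream waits for the side stream's result.
    torch.cuda.current_stream().wait_stream(side_stream)
    return shared_out + routed_out
```

Under this pattern the peak footprint grows roughly by the concurrently-live branch, which would explain why the test compensates by lowering `max_batch_size` and `free_mem_ratio`.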