From 293854a83997d73d7c8157c5c44292f56bf2c90e Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sat, 1 Nov 2025 21:12:22 +0800 Subject: [PATCH 01/12] remove PyTorchConfig completely Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../lm-eval-harness/lm_eval_tensorrt_llm.py | 6 +- tensorrt_llm/_torch/auto_deploy/llm.py | 2 +- tensorrt_llm/_torch/auto_deploy/llm_args.py | 7 - .../_torch/auto_deploy/shim/ad_executor.py | 2 - tensorrt_llm/_torch/pyexecutor/_util.py | 44 +++--- tensorrt_llm/_torch/pyexecutor/config.py | 139 ------------------ .../_torch/pyexecutor/model_engine.py | 3 +- .../_torch/pyexecutor/model_loader.py | 32 +++- .../_torch/pyexecutor/py_executor_creator.py | 65 +++----- tensorrt_llm/llmapi/llm_args.py | 80 +--------- .../test_modeling_llama_min_latency.py | 7 +- 11 files changed, 82 insertions(+), 305 deletions(-) delete mode 100644 tensorrt_llm/_torch/pyexecutor/config.py diff --git a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py index 4fcaf806db2..1738242267d 100644 --- a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py +++ b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py @@ -34,10 +34,10 @@ import tensorrt_llm from tensorrt_llm import LLM as TORCH_LLM from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.bindings.executor import DecodingConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig from tensorrt_llm.llmapi import RequestOutput, SamplingParams +from tensorrt_llm.llmapi.llm_args import MoeConfig logger = logging.getLogger(__name__) @@ -98,10 +98,8 @@ def __init__( pytorch_config_params = { 'cuda_graph_config': {} if use_cuda_graph else None, "print_iter_log": False, + 'moe_config': MoeConfig(backend=self.moe_backend) } - if hasattr(PyTorchConfig, "moe_backend"): - pytorch_config_params["moe_backend"] = self.moe_backend - print(f"Info: moe_backend is set to {self.moe_backend}") # stop words not currently supported by torch backend self.use_stop_words = False diff --git a/tensorrt_llm/_torch/auto_deploy/llm.py b/tensorrt_llm/_torch/auto_deploy/llm.py index 5062ee04054..30d46c81be4 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm.py +++ b/tensorrt_llm/_torch/auto_deploy/llm.py @@ -175,7 +175,7 @@ def __init__(self, **kwargs): self._executor = DemoGenerationExecutor( world_size=self.args.world_size, tokenizer=self.tokenizer, - ad_config=self.args.get_pytorch_backend_config(), + ad_config=self.args, ) def __del__(self): diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index 6f75150cba3..efa8a4c367f 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -403,13 +403,6 @@ def validate_and_init_tokenizer(self): """Skip tokenizer initialization in config. We do this in the AutoDeploy LLM class.""" return self - ### UTILITY METHODS ############################################################################ - # TODO: Remove this after the PyTorch backend is fully migrated to LlmArgs from ExecutorConfig - def get_pytorch_backend_config(self) -> "LlmArgs": - """Return the LlmArgs (self) object.""" - # TODO: can we just pass through self directly?? 
- return type(self)(**self.to_llm_kwargs()) - def to_dict(self) -> Dict: """Convert model to a dictionary such that cls(**self.to_dict()) == self.""" self_dict = super().to_dict() diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 527d1f145d6..0b6ba4921b7 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -326,8 +326,6 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer dist.initialize_or_skip(rank, world_size, port) # some config - msg = "pytorch_backend_config must be an AD LlmArgs object" - assert isinstance(ad_config, LlmArgs), msg assert ad_config.max_beam_width <= 1, "_autodeploy + beam_search is not supported" max_num_sequences = ad_config.max_batch_size * dist_mapping.pp_size diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index b5770b30efe..389de7300cf 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -25,7 +25,6 @@ from ..attention_backend import get_sparse_attn_kv_cache_manager from ..model_config import ModelConfig from ..speculative import get_num_extra_kv_tokens, get_spec_decoder -from .config import PyTorchConfig from .config_utils import is_mla, is_nemotron_hybrid, is_qwen3_next from .guided_decoder import GuidedDecoder from .kv_cache_connector import KvCacheConnectorManager @@ -73,7 +72,7 @@ def __init__( max_seq_len: int, max_batch_size: int, kv_cache_config: KvCacheConfig, - pytorch_backend_config: PyTorchConfig, + llm_args: TorchLlmArgs, speculative_config: SpeculativeConfig, sparse_attention_config: SparseAttentionConfig, profiling_stage_data: Optional[dict], @@ -86,7 +85,7 @@ def __init__( self._max_num_tokens = max_num_tokens self._max_beam_width = max_beam_width self._kv_connector_manager = kv_connector_manager - self._pytorch_backend_config = pytorch_backend_config + self._llm_args = llm_args self._speculative_config = speculative_config self._sparse_attention_config = sparse_attention_config self._tokens_per_block = tokens_per_block @@ -248,9 +247,8 @@ def _get_token_num_for_estimation(self) -> int: # estimate_max_kv_cache_tokens submits self._dummy_reqs num_cache_blocks = 0 num_extra_tokens_per_seq = 1 # account for generated tokens - pytorch_backend_config = self._pytorch_backend_config spec_cfg = self._speculative_config - if not pytorch_backend_config.disable_overlap_scheduler: + if not self._llm_args.disable_overlap_scheduler: num_extra_tokens_per_seq = num_extra_tokens_per_seq + 1 if spec_cfg is not None: num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens @@ -653,7 +651,7 @@ def create_py_executor_instance( dist, resources, mapping, - pytorch_backend_config, + llm_args, ctx_chunk_config, model_engine, start_worker, @@ -679,7 +677,7 @@ def create_py_executor_instance( f"max_seq_len={max_seq_len}, max_num_requests={max_batch_size}, max_num_tokens={max_num_tokens}, max_batch_size={max_batch_size}" ) - for key, value in pytorch_backend_config.extra_resource_managers.items(): + for key, value in llm_args.extra_resource_managers.items(): if key in resources: raise ValueError( f"Cannot overwrite existing resource manager {key}.") @@ -804,8 +802,7 @@ def create_py_executor_instance( drafter=drafter, dist=dist, max_num_sequences=max_num_sequences, - disable_overlap_scheduler=pytorch_backend_config. 
- disable_overlap_scheduler, + disable_overlap_scheduler=llm_args.disable_overlap_scheduler, max_batch_size=max_batch_size, max_beam_width=max_beam_width, max_draft_len=spec_config.max_draft_len @@ -840,13 +837,11 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int, ) -def instantiate_sampler(engine: PyTorchModelEngine, - pytorch_backend_config: PyTorchConfig, mapping: Mapping, - max_batch_size: int, max_beam_width: int, - max_seq_len: int, mm_encoder_only: bool, - speculative_config: SpeculativeConfig, - decoding_config: trtllm.DecodingConfig, - kv_cache_config: KvCacheConfig): +def instantiate_sampler( + engine: PyTorchModelEngine, llm_args: TorchLlmArgs, mapping: Mapping, + max_batch_size: int, max_beam_width: int, max_seq_len: int, + mm_encoder_only: bool, speculative_config: SpeculativeConfig, + decoding_config: trtllm.DecodingConfig, kv_cache_config: KvCacheConfig): sampler_args = create_torch_sampler_args( mapping, max_seq_len=engine.max_seq_len, @@ -856,7 +851,7 @@ def instantiate_sampler(engine: PyTorchModelEngine, decoding_mode = get_decoding_mode(decoding_config=decoding_config, max_beam_width=max_beam_width) if mapping.cp_config.get('cp_type') == CpType.STAR: - assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'" + assert llm_args.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'" return TorchSampler(sampler_args) if engine.spec_config is not None and engine.spec_config.spec_dec_mode.has_spec_decoder( ): @@ -865,15 +860,15 @@ def instantiate_sampler(engine: PyTorchModelEngine, if mm_encoder_only: # NOTE: handle model outputs specially for mm encoder executor/engine return EarlyStopWithMMResult() - if pytorch_backend_config.sampler_type == SamplerType.TRTLLMSampler or ( - pytorch_backend_config.sampler_type == SamplerType.auto + if llm_args.sampler_type == SamplerType.TRTLLMSampler or ( + llm_args.sampler_type == SamplerType.auto and decoding_mode.isBeamSearch()): logger.debug(f"DecodingMode: {decoding_mode.name}") return TRTLLMSampler(engine.model, engine.dtype, mapping, decoding_mode, - pytorch_backend_config.disable_overlap_scheduler, + llm_args.disable_overlap_scheduler, max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_beam_width=max_beam_width, @@ -935,7 +930,12 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int: return num_experts -def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig): +def _adjust_torch_mem_fraction(): + # If true, adjust PyTorch CUDA memory fraction to correspond to the + # total GPU memory minus the statically allocated engine memory. + # If false, set the PyTorch CUDA memory fraction to 1.0. + _limit_torch_cuda_mem_fraction: bool = True + # FIXME: PyTorch only uses the garbage_collection_threshold setting # if a memory fraction is set, cf. # https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357 @@ -964,7 +964,7 @@ def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig): # lead PyTorch to release all unused memory before hitting the set fraction. This # still mitigates OOM, although at a higher performance impact, because it # effectively resets the allocator cache. 
- if not pytorch_backend_config._limit_torch_cuda_mem_fraction: + if not _limit_torch_cuda_mem_fraction: return mem_reserved = torch.cuda.memory_reserved() mem_free, mem_total = torch.cuda.mem_get_info() diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py deleted file mode 100644 index 44b1df26d63..00000000000 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ /dev/null @@ -1,139 +0,0 @@ -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Union - -from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ - BaseCheckpointLoader - -from ...llmapi.llm_args import LoadFormat, SamplerType -from ..model_config import MoeLoadBalancerConfig -from .resource_manager import BaseResourceManager - - -@dataclass -class PyTorchConfig: - """ - Extra arguments for the pytorch backend. - """ - - # Extra resource managers to use in addition to the KV cache manager. - # Each manager's prepare_resources method is called before the forward pass, - # and update_resources() is called after the pass finishes. free_resources() - # is called when a request finishes. - # The KV cache manager is guaranteed to be invoked after all of these extra - # managers in all stages. - extra_resource_managers: Dict[str, BaseResourceManager] = field( - default_factory=dict) - - # If true, use CUDA graphs for decoding. CUDA graphs are only created - # for the batch sizes in cuda_graph_batch_sizes, and are enabled for - # batches that consist of decoding requests *only* (the reason is that - # it's hard to capture a single graph with prefill requests since the - # input shapes are a function of the sequence lengths). - # Note that each CUDA graph can use up to 200 MB of extra memory. - use_cuda_graph: bool = True - cuda_graph_batch_sizes: Optional[List[int]] = None - cuda_graph_max_batch_size: int = 0 - # If true, batches are rounded up to the nearest cuda_graph_batch_size. - # This is usually a net win for performance. - cuda_graph_padding_enabled: bool = False - disable_overlap_scheduler: bool = False - # If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. - # If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used. - moe_max_num_tokens: Optional[int] = None - moe_load_balancer: Optional[Union[MoeLoadBalancerConfig, dict, str]] = None - - attention_dp_enable_balance: bool = False - attention_dp_time_out_iters: int = 50 - attention_dp_batching_wait_iters: int = 10 - - max_num_tokens: int = 8192 - - batch_wait_timeout_ms: float = 0 - # Iterations to wait before scheduling context even if token budget not reached (0 disables). - batch_wait_timeout_iters: int = 0 - # Threshold ratio of max_num_tokens for token accumulation before scheduling context. - # Value range: [0, 1] (0 disables). - batch_wait_max_tokens_ratio: float = 0.0 - - attn_backend: str = 'TRTLLM' - moe_backend: str = 'CUTLASS' - - moe_disable_finalize_fusion: bool = False - use_low_precision_moe_combine: bool = False - - sampler_type: SamplerType = SamplerType.auto - """ - The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. - Defaults to auto, which will use TorchSampler unless BeamSearch is requested. 
- """ - - kv_cache_dtype: str = "auto" - mamba_ssm_cache_dtype: str = "auto" - - enable_iter_perf_stats: bool = False - # If true, enables per request stats per iteration - # Must also set enable_iter_perf_stats to true to get request stats - enable_iter_req_stats: bool = False - print_iter_log: bool = False - - torch_compile_enabled: bool = False - torch_compile_fullgraph: bool = True - torch_compile_inductor_enabled: bool = False - torch_compile_piecewise_cuda_graph: bool = False - torch_compile_piecewise_cuda_graph_num_tokens: Optional[List[int]] = None - # When torch compile is enabled, userbuffers is enabled by default - torch_compile_enable_userbuffers: bool = True - torch_compile_max_num_streams: int = 1 - - # Enable autotuner only when torch compile is enabled - # TODO: after it can be work stable in warmup stage - enable_autotuner: bool = True - - # If true, enable layerwise nvtx marker - enable_layerwise_nvtx_marker: bool = False - # How to load the model weights. By default, detect the weight type - # from the model checkpoint. - load_format: Union[str, LoadFormat] = 'auto' - - # If true, enable min-latency mode. Currently only used for Llama4. - enable_min_latency: bool = False - allreduce_strategy: str = "AUTO" - - # The iteration interval to create responses under the streaming mode. - # TODO: make this a per-request parameter - stream_interval: int = 1 - - force_dynamic_quantization: bool = False - - # If true, ONLY the vision encoder part of the full model is loaded/executed. - mm_encoder_only: bool = False - - # If true, adjust PyTorch CUDA memory fraction to correspond to the - # total GPU memory minus the statically allocated engine memory. - # If false, set the PyTorch CUDA memory fraction to 1.0. - _limit_torch_cuda_mem_fraction: bool = True - - -def _construct_checkpoint_loader( - backend: str, checkpoint_loader: Optional[BaseCheckpointLoader], - checkpoint_format: Optional[str]) -> Optional[BaseCheckpointLoader]: - if backend == "_autodeploy": - return None - - from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ - BaseCheckpointLoader - from tensorrt_llm._torch.models.modeling_utils import ( - get_checkpoint_weight_loader, get_config_loader) - - if checkpoint_loader is None: - checkpoint_weight_loader = get_checkpoint_weight_loader( - checkpoint_format)() - config_loader = get_config_loader(checkpoint_format)() - - checkpoint_loader = BaseCheckpointLoader.get( - checkpoint_format=checkpoint_format, - weight_loader=checkpoint_weight_loader, - weight_mapper=None, - config_loader=config_loader) - - return checkpoint_loader diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index e6da9fc216a..e3c12e36b49 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -54,13 +54,12 @@ from ..utils import (get_model_extra_attrs, set_per_request_piecewise_cuda_graph_flag, set_torch_compiling, with_model_extra_attrs) -from .config import _construct_checkpoint_loader from .config_utils import is_mla from .cuda_graph_runner import CUDAGraphRunner from .guided_decoder import CapturableGuidedDecoder from .layerwise_nvtx_marker import LayerwiseNvtxMarker from .llm_request import get_draft_token_length -from .model_loader import ModelLoader +from .model_loader import ModelLoader, _construct_checkpoint_loader from .resource_manager import (BaseResourceManager, KVCacheManager, ResourceManager, ResourceManagerType) from .sampler import 
SampleStateTensors diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 7f5ade11655..f4eff586cd2 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -6,6 +6,8 @@ import torch +from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ + BaseCheckpointLoader from tensorrt_llm._utils import str_dtype_to_torch from tensorrt_llm.llmapi.llm_args import TorchLlmArgs from tensorrt_llm.logger import logger @@ -14,6 +16,7 @@ from tensorrt_llm.models.modeling_utils import QuantAlgo from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2 +from ...llmapi.llm_args import LoadFormat from ..model_config import ModelConfig from ..models import AutoModelForCausalLM from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader @@ -63,7 +66,7 @@ def validate_and_set_kv_cache_quant(model_config: ModelConfig, if not valid_pyt_quant: raise ValueError( "Overriding KV cache quantization with an invalid type " - f'"PyTorchConfig.kv_cache_dtype="{pyt_kv_cache_dtype}" ' + f'"llm_args.KvCacheConfig.dtype="{pyt_kv_cache_dtype}" ' f'Accepted types are "{_VALID_KV_CACHE_DTYPES}".') # If we get to this point we have a valid quantization setting, but if @@ -71,7 +74,7 @@ def validate_and_set_kv_cache_quant(model_config: ModelConfig, if kv_cache_quant is not None and mapped_pyt_quant != kv_cache_quant: raise RuntimeError( "Attempting to override KV cache quantization " - f'"{kv_cache_quant}" with PyTorchConfig.kv_cache_dtype=' + f'"{kv_cache_quant}" with llm_args.KvCacheConfig.dtype=' f'"{pyt_kv_cache_dtype}". You cannot override a checkpoint with a ' "pre-quantized KV cache that doesn't match.") @@ -151,6 +154,31 @@ def get_rank_model_storage(model): return total_bytes +def _construct_checkpoint_loader( + backend: str, checkpoint_loader: Optional[BaseCheckpointLoader], + checkpoint_format: Optional[str]) -> Optional[BaseCheckpointLoader]: + if backend == "_autodeploy": + return None + + from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ + BaseCheckpointLoader + from tensorrt_llm._torch.models.modeling_utils import ( + get_checkpoint_weight_loader, get_config_loader) + + if checkpoint_loader is None: + checkpoint_weight_loader = get_checkpoint_weight_loader( + checkpoint_format)() + config_loader = get_config_loader(checkpoint_format)() + + checkpoint_loader = BaseCheckpointLoader.get( + checkpoint_format=checkpoint_format, + weight_loader=checkpoint_weight_loader, + weight_mapper=None, + config_loader=config_loader) + + return checkpoint_loader + + class ModelLoader: """ Handles the loading, configuration, and weight initialization of a PyTorch model. 
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index c7258aa7157..4f5c885d490 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -32,7 +32,6 @@ from ._util import (KvCacheCreator, _adjust_torch_mem_fraction, create_py_executor_instance, instantiate_sampler, is_mla, validate_feature_combination) -from .config import PyTorchConfig from .config_utils import is_mla from .guided_decoder import CapturableGuidedDecoder, GuidedDecoder from .kv_cache_connector import KvCacheConnectorManager @@ -210,10 +209,6 @@ def create_py_executor( lora_config = llm_args.lora_config kv_connector_config = llm_args.kv_connector_config - pytorch_backend_config = llm_args.get_pytorch_backend_config() - if pytorch_backend_config is None: - pytorch_backend_config = PyTorchConfig() - scheduler_config = llm_args.scheduler_config # Since peft_cache_config may be subject to change, avoid these changes propagate back @@ -242,23 +237,19 @@ def create_py_executor( ) = llm_args.get_runtime_sizes() tokens_per_block = kv_cache_config.tokens_per_block - if pytorch_backend_config.attn_backend == "VANILLA": + if llm_args.attn_backend == "VANILLA": tokens_per_block = max_num_tokens - if pytorch_backend_config.attn_backend in [ - "FLASHINFER", "FLASHINFER_STAR_ATTENTION" - ]: + if llm_args.attn_backend in ["FLASHINFER", "FLASHINFER_STAR_ATTENTION"]: # Workaround for flashinfer and star attention if kv_cache_config.enable_block_reuse: logger.warning( - f"Disabling block reuse for {pytorch_backend_config.attn_backend} backend" - ) + f"Disabling block reuse for {llm_args.attn_backend} backend") kv_cache_config.enable_block_reuse = False - if pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION" and enable_chunked_context: + if llm_args.attn_backend == "FLASHINFER_STAR_ATTENTION" and enable_chunked_context: logger.warning( - f"Disabling chunked context for {pytorch_backend_config.attn_backend} backend" - ) + f"Disabling chunked context for {llm_args.attn_backend} backend") enable_chunked_context = False spec_config = llm_args.speculative_config @@ -266,30 +257,23 @@ def create_py_executor( from tensorrt_llm._torch.speculative import suggest_spec_config spec_config = suggest_spec_config(max_batch_size) - if not pytorch_backend_config.disable_overlap_scheduler and spec_config is not None: + if not llm_args.disable_overlap_scheduler and spec_config is not None: if not spec_config.spec_dec_mode.support_overlap_scheduler(): logger.warning( f"Disable overlap scheduler for speculation mode {spec_config.spec_dec_mode.name}" ) - # TODO(qijun): clean up pytorch_backend_config later - pytorch_backend_config.disable_overlap_scheduler = True - llm_args.disable_overlap_scheduler = True if mm_encoder_only: - # TODO(qijun): clean up pytorch_backend_config later - pytorch_backend_config.mm_encoder_only = True - pytorch_backend_config.load_format = LoadFormat.VISION_ONLY + llm_args.mm_encoder_only = True + llm_args.load_format = LoadFormat.VISION_ONLY + llm_args.disable_overlap_scheduler = True + # Disable overlap scheduler for multimodal encoder-only mode logger.warning( "Disabling overlap scheduler for multimodal encoder-only mode. 
" "The overlap scheduler is designed for generation models and is not needed " "when only processing vision encoder inputs.") - pytorch_backend_config.disable_overlap_scheduler = True - - llm_args.mm_encoder_only = True - llm_args.load_format = LoadFormat.VISION_ONLY - llm_args.disable_overlap_scheduler = True mapping = _get_mapping(llm_args.parallel_config.to_mapping()) if mpi_disabled(): @@ -326,19 +310,17 @@ def create_py_executor( spec_config=spec_config, ) - validate_feature_combination(llm_args, model_engine, - pytorch_backend_config.sampler_type) + validate_feature_combination(llm_args, model_engine, llm_args.sampler_type) if has_draft_model_engine: with mem_monitor.observe_creation_stage( _ExecutorCreationStage.MODEL_ENGINE_DRAFT): draft_spec_config = copy.copy(spec_config) - use_chain_drafter = ( - guided_decoding_config is None - and draft_spec_config._allow_chain_drafter - and draft_spec_config._allow_greedy_draft_tokens - and pytorch_backend_config.attn_backend == "TRTLLM") + use_chain_drafter = (guided_decoding_config is None + and draft_spec_config._allow_chain_drafter and + draft_spec_config._allow_greedy_draft_tokens + and llm_args.attn_backend == "TRTLLM") logger.debug(f"USE CHAIN DRAFTER: {use_chain_drafter}") if use_chain_drafter: @@ -353,11 +335,8 @@ def drafting_loop_wrapper(model): else: drafting_loop_wrapper = None - # TODO(qijun): clean up pytorch_backend_config later - draft_pytorch_backend_config = copy.copy(pytorch_backend_config) draft_llm_args = copy.copy(llm_args) if spec_config.load_format == "dummy": - draft_pytorch_backend_config.load_format = LoadFormat.DUMMY draft_llm_args.load_format = LoadFormat.DUMMY draft_model_engine = PyTorchModelEngine( @@ -382,7 +361,7 @@ def drafting_loop_wrapper(model): # PyTorchModelEngine modifies these fields, update them model_engine_max_seq_len = model_engine.max_seq_len net_max_seq_len = model_engine_max_seq_len - if not pytorch_backend_config.disable_overlap_scheduler: + if not llm_args.disable_overlap_scheduler: model_engine_max_seq_len = model_engine.max_seq_len + 1 if spec_config is not None: model_engine_max_seq_len += spec_config.max_total_draft_tokens @@ -483,7 +462,7 @@ def drafting_loop_wrapper(model): with mem_monitor.observe_creation_stage(_ExecutorCreationStage.SAMPLER): sampler = instantiate_sampler(model_engine, - pytorch_backend_config, + llm_args, mapping, max_batch_size=max_batch_size, max_beam_width=max_beam_width, @@ -561,7 +540,7 @@ def drafting_loop_wrapper(model): max_seq_len=max_seq_len, max_batch_size=max_batch_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_backend_config, + llm_args=llm_args, speculative_config=spec_config, profiling_stage_data=profiling_stage_data, sparse_attention_config=sparse_attention_config, @@ -600,7 +579,7 @@ def drafting_loop_wrapper(model): dist=dist, resources=resources, mapping=mapping, - pytorch_backend_config=pytorch_backend_config, + llm_args=llm_args, ctx_chunk_config=ctx_chunk_config, model_engine=model_engine, start_worker=False, @@ -647,7 +626,7 @@ def drafting_loop_wrapper(model): if eng is None: continue if eng.attn_metadata is not None: - if pytorch_backend_config.use_cuda_graph: + if llm_args.cuda_graph_config is not None: eng._release_cuda_graphs() eng.attn_metadata = None @@ -657,7 +636,7 @@ def drafting_loop_wrapper(model): dist=dist, resources=resources, mapping=mapping, - pytorch_backend_config=pytorch_backend_config, + llm_args=llm_args, ctx_chunk_config=ctx_chunk_config, model_engine=model_engine, start_worker=False, @@ -677,7 
+656,7 @@ def drafting_loop_wrapper(model): cache_transceiver_config=cache_transceiver_config, ) - _adjust_torch_mem_fraction(pytorch_backend_config) + _adjust_torch_mem_fraction() py_executor.start_worker() return py_executor diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 01e0a4c745c..77ef96a49d6 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -8,9 +8,8 @@ from dataclasses import dataclass from enum import Enum, EnumMeta from pathlib import Path -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional, - Set, Tuple, Type, TypeAlias, TypeVar, Union, get_args, - get_origin) +from typing import (Any, ClassVar, Dict, List, Literal, Optional, Set, Tuple, + Type, TypeAlias, TypeVar, Union, get_args, get_origin) import torch import yaml @@ -25,9 +24,6 @@ from .._utils import mpi_rank -if TYPE_CHECKING: - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - # yapf: disable # isort: off from ..bindings.executor import (BatchingType as _BatchingType, @@ -2831,78 +2827,6 @@ def get_executor_config( executor_config.mm_encoder_only = self.mm_encoder_only return executor_config - # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig - def get_pytorch_backend_config(self) -> "PyTorchConfig": - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - - return PyTorchConfig( - extra_resource_managers=self.extra_resource_managers, - use_cuda_graph=bool(self.cuda_graph_config is not None), - cuda_graph_batch_sizes=self.cuda_graph_config.batch_sizes - if self.cuda_graph_config else - CudaGraphConfig.model_fields['batch_sizes'].default, - cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size - if self.cuda_graph_config else - CudaGraphConfig.model_fields['max_batch_size'].default, - cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding - if self.cuda_graph_config else - CudaGraphConfig.model_fields['enable_padding'].default, - disable_overlap_scheduler=self.disable_overlap_scheduler, - moe_max_num_tokens=self.moe_config.max_num_tokens, - moe_load_balancer=self.moe_config.load_balancer, - attn_backend=self.attn_backend, - moe_backend=self.moe_config.backend, - use_low_precision_moe_combine=self.moe_config. - use_low_precision_moe_combine, - sampler_type=self.sampler_type, - kv_cache_dtype=self.kv_cache_config.dtype, - mamba_ssm_cache_dtype=self.kv_cache_config.mamba_ssm_cache_dtype, - enable_iter_perf_stats=self.enable_iter_perf_stats, - enable_iter_req_stats=self.enable_iter_req_stats, - print_iter_log=self.print_iter_log, - torch_compile_enabled=bool(self.torch_compile_config is not None), - torch_compile_fullgraph=self.torch_compile_config.enable_fullgraph - if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_fullgraph'].default, - torch_compile_inductor_enabled=self.torch_compile_config. - enable_inductor if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_inductor'].default, - torch_compile_piecewise_cuda_graph=self.torch_compile_config. - enable_piecewise_cuda_graph - if self.torch_compile_config is not None else TorchCompileConfig. - model_fields['enable_piecewise_cuda_graph'].default, - torch_compile_piecewise_cuda_graph_num_tokens=self. 
- torch_compile_config.capture_num_tokens - if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['capture_num_tokens'].default, - torch_compile_enable_userbuffers=self.torch_compile_config. - enable_userbuffers if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_userbuffers'].default, - torch_compile_max_num_streams=self.torch_compile_config. - max_num_streams if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['max_num_streams'].default, - enable_autotuner=self.enable_autotuner, - enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker, - load_format=self.load_format, - enable_min_latency=self.enable_min_latency, - moe_disable_finalize_fusion=self.moe_config.disable_finalize_fusion, - stream_interval=self.stream_interval, - force_dynamic_quantization=self.force_dynamic_quantization, - allreduce_strategy=self.allreduce_strategy, - attention_dp_enable_balance=bool( - self.attention_dp_config is not None - and self.attention_dp_config.enable_balance), - attention_dp_time_out_iters=self.attention_dp_config.timeout_iters - if self.attention_dp_config is not None else - AttentionDpConfig.model_fields['timeout_iters'].default, - attention_dp_batching_wait_iters=self.attention_dp_config. - batching_wait_iters if self.attention_dp_config is not None else - AttentionDpConfig.model_fields['batching_wait_iters'].default, - batch_wait_timeout_ms=self.batch_wait_timeout_ms, - batch_wait_timeout_iters=self.batch_wait_timeout_iters, - batch_wait_max_tokens_ratio=self.batch_wait_max_tokens_ratio, - ) - def update_llm_args_with_extra_dict( llm_args: Dict, diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py index 9f96f146b8d..fac1e283d76 100644 --- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py +++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py @@ -20,7 +20,6 @@ Llama4HfWeightMapper from tensorrt_llm._torch.models.modeling_llama import \ Llama4ForConditionalGeneration -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm.bindings.executor import KvCacheConfig @@ -158,8 +157,7 @@ def test_llama_sanity(self, scenario: SanityScenario): with torch.device(device), default_dtype(dtype): model_config = ModelConfig(pretrained_config=llama_config, quant_config=quant_config) - model_config.pytorch_backend_config = PyTorchConfig( - enable_min_latency=enable_min_latency) + model_config.enable_min_latency = enable_min_latency llama = Llama4ForConditionalGeneration(model_config) input_ids = torch.tensor([100, 200, 300, 100, 200, 100, 400, 500], @@ -291,8 +289,7 @@ def test_llama_allclose_to_hf(self, scenario: AllCloseScenario) -> None: model_config = ModelConfig(pretrained_config=llama_config, attn_backend=attention_backend) - model_config.pytorch_backend_config = PyTorchConfig( - enable_min_latency=enable_min_latency) + model_config.enable_min_latency = enable_min_latency llama = Llama4ForConditionalGeneration(model_config) weight_mapper = Llama4HfWeightMapper() weight_mapper.init_model_and_config(llama, model_config) From c649d35796aa4656ff8f29a213b9f26e70657a1a Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sat, 1 Nov 2025 21:16:45 +0800 Subject: [PATCH 02/12] clean 
Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/base_worker.py | 2 +- .../auto_deploy/unit/singlegpu/shim/test_llm_config.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index ff6d3402bb6..62d19eee1b2 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -128,7 +128,7 @@ def _create_py_executor(): create_autodeploy_executor create_executor = create_autodeploy_executor assert isinstance(self.llm_args, ADLlmArgs) - args["ad_config"] = self.llm_args.get_pytorch_backend_config() + args["ad_config"] = self.llm_args args["tokenizer"] = self._tokenizer else: raise ValueError(f"Unsupported backend config: {self._backend}") diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py index 12f711d7267..04fa1f91fb6 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/shim/test_llm_config.py @@ -65,12 +65,6 @@ def get_transform_config(free_mem_ratio): InferenceOptimizer(None, get_transform_config(1.1)) -def test_get_pytorch_backend_config(): - """Test that get_pytorch_backend_config returns self.""" - args = LlmArgs(model="test-model") - assert args.get_pytorch_backend_config() == args - - # ================================ # Config Flow Tests # ================================ From ebb9b733f92789799bf2247e1ffbac99f9b33fe8 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sun, 2 Nov 2025 13:24:58 +0800 Subject: [PATCH 03/12] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index f4eff586cd2..b9c1377cd98 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -23,7 +23,6 @@ from ..models.modeling_utils import MetaInitMode, timing from ..modules.fused_moe.moe_load_balancer import ( MoeLoadBalancer, maybe_create_moe_load_balancer) -from .config import LoadFormat _KV_CACHE_MAP = { "fp8": QuantAlgo.FP8.value, From 8910a7b8eed8b01e79a026fd0957389967a2d7f5 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sun, 2 Nov 2025 15:32:29 +0800 Subject: [PATCH 04/12] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2600235395b..1514c9afcbe 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -19,9 +19,9 @@ from defs.conftest import get_sm_version from tensorrt_llm import LLM +from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ IS_TRITON_KERNELS_AVAILABLE -from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig, EagleDecodingConfig, KvCacheConfig, MoeConfig, MTPDecodingConfig, NGramDecodingConfig, From 
9b4fefaa47c031c5d1913506d6ce229d14dbc835 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sun, 2 Nov 2025 18:57:47 +0800 Subject: [PATCH 05/12] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/configuration.py | 5 ----- tensorrt_llm/executor/base_worker.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py index 742862134b7..06d25b5df85 100755 --- a/tensorrt_llm/bench/dataclasses/configuration.py +++ b/tensorrt_llm/bench/dataclasses/configuration.py @@ -8,7 +8,6 @@ model_validator) import tensorrt_llm.bindings.executor as trtllm -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import (BatchingType, CapacitySchedulerPolicy, ContextChunkingPolicy, DynamicBatchConfig, ExtendedRuntimePerfKnobConfig, KvCacheConfig, @@ -74,7 +73,6 @@ def get_llm_args(self) -> Dict: } backend_config_map = { - "pytorch": self.performance_options.get_pytorch_perf_config, "_autodeploy": self.performance_options.get_autodeploy_perf_config } @@ -126,9 +124,6 @@ def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: return config - def get_pytorch_perf_config(self) -> PyTorchConfig: - return self.pytorch_config - def get_autodeploy_perf_config(self) -> Dict: AutoDeployPerfConfig = dict ad_config = AutoDeployPerfConfig() diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index 62d19eee1b2..7c01d795664 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -137,7 +137,7 @@ def _create_py_executor(): self.mapping = self.llm_args.parallel_config.to_mapping() self.checkpoint_loader = None if self._backend == "pytorch": - from tensorrt_llm._torch.pyexecutor.config import \ + from tensorrt_llm._torch.pyexecutor.model_loader import \ _construct_checkpoint_loader self.checkpoint_loader = _construct_checkpoint_loader( self.llm_args.backend, self.llm_args.checkpoint_loader, From 2d812d92b3386636633ab9b520dc26764e3ffe2e Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Sun, 2 Nov 2025 20:48:36 +0800 Subject: [PATCH 06/12] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/configuration.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py index 06d25b5df85..a2e28381d78 100755 --- a/tensorrt_llm/bench/dataclasses/configuration.py +++ b/tensorrt_llm/bench/dataclasses/configuration.py @@ -12,6 +12,7 @@ ContextChunkingPolicy, DynamicBatchConfig, ExtendedRuntimePerfKnobConfig, KvCacheConfig, SchedulerConfig) +from tensorrt_llm.llmapi.llm_args import CudaGraphConfig from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode @@ -87,9 +88,9 @@ def get_llm_args(self) -> Dict: llm_args, self.extra_llm_api_options) if self.backend == "pytorch": - cuda_graph_config = updated_llm_args.pop( - "cuda_graph_config", llm_args["cuda_graph_config"]) - if cuda_graph_config: + cuda_graph_config = updated_llm_args.pop("cuda_graph_config", + CudaGraphConfig()) + if cuda_graph_config is not None: # Use runtime max_batch_size as cuda_graph_config.max_batch_size # if both max_batch_size and batch_sizes are not set. 
batch_sizes_set = cuda_graph_config.get("batch_sizes", @@ -114,7 +115,6 @@ class PerformanceOptions: cuda_graphs: bool = False multi_block_mode: bool = True cuda_graph_cache_size: int = 1000 - pytorch_config: Dict[str, Any] = Field(default_factory=dict) def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: config = ExtendedRuntimePerfKnobConfig() From 189396d05d09c65364f158fd1591758f73e5647f Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:22:24 +0800 Subject: [PATCH 07/12] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_llama_min_latency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py index 027eeeace20..540613d1161 100644 --- a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py +++ b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py @@ -23,7 +23,7 @@ WeightsLoadingConfig) from ..modules.multi_stream_utils import maybe_execute_in_parallel from ..speculative import SpecMetadata -from ..utils import Fp4QuantizedTensor +from ..utils import AuxStreamType, Fp4QuantizedTensor from .modeling_llama import Llama4Attention, Llama4DecoderLayer, Llama4MoE # Perf heuristics thresholds. @@ -452,7 +452,7 @@ def __init__( dtype=dtype, reduce_results=reduce_results, model_config=model_config, - aux_stream=aux_stream, + aux_stream_dict={AuxStreamType.Attention: aux_stream}, weight_loading_mode=weight_loading_mode, apply_router_weight_on_input=apply_router_weight_on_input, ) From a3e1264a7ec3953d9b65d9375a1bfa4b4179799e Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 5 Nov 2025 09:48:00 +0800 Subject: [PATCH 08/12] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/configuration.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py index a2e28381d78..63c5970fd0a 100755 --- a/tensorrt_llm/bench/dataclasses/configuration.py +++ b/tensorrt_llm/bench/dataclasses/configuration.py @@ -12,7 +12,6 @@ ContextChunkingPolicy, DynamicBatchConfig, ExtendedRuntimePerfKnobConfig, KvCacheConfig, SchedulerConfig) -from tensorrt_llm.llmapi.llm_args import CudaGraphConfig from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode @@ -74,6 +73,7 @@ def get_llm_args(self) -> Dict: } backend_config_map = { + "pytorch": self.performance_options.get_pytorch_perf_config, "_autodeploy": self.performance_options.get_autodeploy_perf_config } @@ -88,9 +88,9 @@ def get_llm_args(self) -> Dict: llm_args, self.extra_llm_api_options) if self.backend == "pytorch": - cuda_graph_config = updated_llm_args.pop("cuda_graph_config", - CudaGraphConfig()) - if cuda_graph_config is not None: + cuda_graph_config = updated_llm_args.pop( + "cuda_graph_config", llm_args["cuda_graph_config"]) + if cuda_graph_config: # Use runtime max_batch_size as cuda_graph_config.max_batch_size # if both max_batch_size and batch_sizes are not set. 
batch_sizes_set = cuda_graph_config.get("batch_sizes", @@ -115,6 +115,7 @@ class PerformanceOptions: cuda_graphs: bool = False multi_block_mode: bool = True cuda_graph_cache_size: int = 1000 + pytorch_config: Dict[str, Any] = Field(default_factory=dict) def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: config = ExtendedRuntimePerfKnobConfig() @@ -124,6 +125,9 @@ def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: return config + def get_pytorch_perf_config(self): + return self.pytorch_config + def get_autodeploy_perf_config(self) -> Dict: AutoDeployPerfConfig = dict ad_config = AutoDeployPerfConfig() From 74d31834fcafa1fe3a748b863f8787971848e09b Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:30:59 +0800 Subject: [PATCH 09/12] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_llama_min_latency.py | 4 ++-- .../_torch/modeling/test_modeling_llama_min_latency.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py index 540613d1161..41ffe3b63ac 100644 --- a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py +++ b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py @@ -23,7 +23,7 @@ WeightsLoadingConfig) from ..modules.multi_stream_utils import maybe_execute_in_parallel from ..speculative import SpecMetadata -from ..utils import AuxStreamType, Fp4QuantizedTensor +from ..utils import Fp4QuantizedTensor from .modeling_llama import Llama4Attention, Llama4DecoderLayer, Llama4MoE # Perf heuristics thresholds. @@ -452,7 +452,7 @@ def __init__( dtype=dtype, reduce_results=reduce_results, model_config=model_config, - aux_stream_dict={AuxStreamType.Attention: aux_stream}, + aux_stream_dict=aux_stream, weight_loading_mode=weight_loading_mode, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py index fac1e283d76..ac2575d22ce 100644 --- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py +++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py @@ -158,6 +158,8 @@ def test_llama_sanity(self, scenario: SanityScenario): model_config = ModelConfig(pretrained_config=llama_config, quant_config=quant_config) model_config.enable_min_latency = enable_min_latency + # TODO: enable llama4 min latency test + model_config.enable_min_latency = False llama = Llama4ForConditionalGeneration(model_config) input_ids = torch.tensor([100, 200, 300, 100, 200, 100, 400, 500], From b868895921aafd5f1e9d0bf7ea19d193bf676d96 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:34:22 +0800 Subject: [PATCH 10/12] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../unittest/_torch/modeling/test_modeling_llama_min_latency.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py index ac2575d22ce..367dee787a0 100644 --- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py +++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py @@ -292,6 +292,8 @@ def test_llama_allclose_to_hf(self, scenario: AllCloseScenario) -> None: model_config = 
ModelConfig(pretrained_config=llama_config, attn_backend=attention_backend) model_config.enable_min_latency = enable_min_latency + # TODO: enable llama4 min latency test + model_config.enable_min_latency = False llama = Llama4ForConditionalGeneration(model_config) weight_mapper = Llama4HfWeightMapper() weight_mapper.init_model_and_config(llama, model_config) From 86dece804a994c7b0a7b81eaee97fd1d3a97b953 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:16:52 +0800 Subject: [PATCH 11/12] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_llama_min_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py index 41ffe3b63ac..027eeeace20 100644 --- a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py +++ b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py @@ -452,7 +452,7 @@ def __init__( dtype=dtype, reduce_results=reduce_results, model_config=model_config, - aux_stream_dict=aux_stream, + aux_stream=aux_stream, weight_loading_mode=weight_loading_mode, apply_router_weight_on_input=apply_router_weight_on_input, ) From 3481c82b7d69bd6f5d0aa3a025826afa692bdd76 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:34:29 +0800 Subject: [PATCH 12/12] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/config.py | 142 ----------------------- 1 file changed, 142 deletions(-) delete mode 100644 tensorrt_llm/_torch/pyexecutor/config.py diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py deleted file mode 100644 index 83f4aa909f1..00000000000 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ /dev/null @@ -1,142 +0,0 @@ -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Union - -from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ - BaseCheckpointLoader - -from ...llmapi.llm_args import LoadFormat, SamplerType -from ..model_config import MoeLoadBalancerConfig -from .resource_manager import BaseResourceManager - - -@dataclass -class PyTorchConfig: - """ - Extra arguments for the pytorch backend. - """ - - # Extra resource managers to use in addition to the KV cache manager. - # Each manager's prepare_resources method is called before the forward pass, - # and update_resources() is called after the pass finishes. free_resources() - # is called when a request finishes. - # The KV cache manager is guaranteed to be invoked after all of these extra - # managers in all stages. - extra_resource_managers: Dict[str, BaseResourceManager] = field( - default_factory=dict) - - # If true, use CUDA graphs for decoding. CUDA graphs are only created - # for the batch sizes in cuda_graph_batch_sizes, and are enabled for - # batches that consist of decoding requests *only* (the reason is that - # it's hard to capture a single graph with prefill requests since the - # input shapes are a function of the sequence lengths). - # Note that each CUDA graph can use up to 200 MB of extra memory. - use_cuda_graph: bool = True - cuda_graph_batch_sizes: Optional[List[int]] = None - cuda_graph_max_batch_size: int = 0 - # If true, batches are rounded up to the nearest cuda_graph_batch_size. - # This is usually a net win for performance. 
- cuda_graph_padding_enabled: bool = False - disable_overlap_scheduler: bool = False - # If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. - # If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used. - moe_max_num_tokens: Optional[int] = None - moe_load_balancer: Optional[Union[MoeLoadBalancerConfig, dict, str]] = None - - attention_dp_enable_balance: bool = False - attention_dp_time_out_iters: int = 50 - attention_dp_batching_wait_iters: int = 10 - - max_num_tokens: int = 8192 - - batch_wait_timeout_ms: float = 0 - # Iterations to wait before scheduling context even if token budget not reached (0 disables). - batch_wait_timeout_iters: int = 0 - # Threshold ratio of max_num_tokens for token accumulation before scheduling context. - # Value range: [0, 1] (0 disables). - batch_wait_max_tokens_ratio: float = 0.0 - - attn_backend: str = 'TRTLLM' - moe_backend: str = 'CUTLASS' - - moe_disable_finalize_fusion: bool = False - use_low_precision_moe_combine: bool = False - - sampler_type: SamplerType = SamplerType.auto - """ - The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. - Defaults to auto, which will use TorchSampler unless BeamSearch is requested. - """ - - kv_cache_dtype: str = "auto" - mamba_ssm_cache_dtype: str = "auto" - - enable_iter_perf_stats: bool = False - # If true, enables per request stats per iteration - # Must also set enable_iter_perf_stats to true to get request stats - enable_iter_req_stats: bool = False - print_iter_log: bool = False - - torch_compile_enabled: bool = False - torch_compile_fullgraph: bool = True - torch_compile_inductor_enabled: bool = False - torch_compile_piecewise_cuda_graph: bool = False - torch_compile_piecewise_cuda_graph_num_tokens: Optional[List[int]] = None - # When torch compile is enabled, userbuffers is enabled by default - torch_compile_enable_userbuffers: bool = True - torch_compile_max_num_streams: int = 1 - - # Enable autotuner only when torch compile is enabled - # TODO: after it can be work stable in warmup stage - enable_autotuner: bool = True - - # If true, enable layerwise nvtx marker - enable_layerwise_nvtx_marker: bool = False - # How to load the model weights. By default, detect the weight type - # from the model checkpoint. - load_format: Union[str, LoadFormat] = 'auto' - - # If true, enable min-latency mode. Currently only used for Llama4. - enable_min_latency: bool = False - allreduce_strategy: str = "AUTO" - - # The iteration interval to create responses under the streaming mode. - # TODO: make this a per-request parameter - stream_interval: int = 1 - - force_dynamic_quantization: bool = False - - # If true, ONLY the vision encoder part of the full model is loaded/executed. - mm_encoder_only: bool = False - - # Enable extra setup to support sleep feature. - enable_sleep: bool = False - - # If true, adjust PyTorch CUDA memory fraction to correspond to the - # total GPU memory minus the statically allocated engine memory. - # If false, set the PyTorch CUDA memory fraction to 1.0. 
- _limit_torch_cuda_mem_fraction: bool = True - - -def _construct_checkpoint_loader( - backend: str, checkpoint_loader: Optional[BaseCheckpointLoader], - checkpoint_format: Optional[str]) -> Optional[BaseCheckpointLoader]: - if backend == "_autodeploy": - return None - - from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \ - BaseCheckpointLoader - from tensorrt_llm._torch.models.modeling_utils import ( - get_checkpoint_weight_loader, get_config_loader) - - if checkpoint_loader is None: - checkpoint_weight_loader = get_checkpoint_weight_loader( - checkpoint_format)() - config_loader = get_config_loader(checkpoint_format)() - - checkpoint_loader = BaseCheckpointLoader.get( - checkpoint_format=checkpoint_format, - weight_loader=checkpoint_weight_loader, - weight_mapper=None, - config_loader=config_loader) - - return checkpoint_loader
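
With PyTorchConfig removed, the settings it used to carry are supplied directly through the LLM API arguments (TorchLlmArgs), as the lm-eval change in PATCH 01 shows. Below is a minimal sketch of that configuration style, not a definitive usage of the library: the model path is a placeholder, and the keyword names are assumed to match the TorchLlmArgs fields referenced in these diffs (cuda_graph_config, moe_config, attn_backend, disable_overlap_scheduler, kv_cache_config, print_iter_log).

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig
    from tensorrt_llm.llmapi.llm_args import MoeConfig

    # Sketch only: field names mirror the TorchLlmArgs attributes used in this series.
    llm = LLM(
        model="/path/to/hf/checkpoint",               # placeholder path
        cuda_graph_config={},                         # enable CUDA graphs with defaults; pass None to disable
        moe_config=MoeConfig(backend="CUTLASS"),      # replaces PyTorchConfig.moe_backend
        attn_backend="TRTLLM",                        # replaces PyTorchConfig.attn_backend
        disable_overlap_scheduler=False,              # replaces PyTorchConfig.disable_overlap_scheduler
        kv_cache_config=KvCacheConfig(dtype="auto"),  # replaces PyTorchConfig.kv_cache_dtype
        print_iter_log=False,
    )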