 import tensorrt_llm.bindings.executor as trtllm
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
+from tensorrt_llm.bindings.executor import \
+    CacheTransceiverConfig as _CacheTransceiverConfig
 from tensorrt_llm.bindings.executor import DecodingMode, ExecutorConfig
+from tensorrt_llm.bindings.executor import PeftCacheConfig as _PeftCacheConfig
+from tensorrt_llm.bindings.executor import SchedulerConfig as _SchedulerConfig
 from tensorrt_llm.llmapi.llm_args import PeftCacheConfig, SamplerType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import (LoraConfig,
@@ -504,7 +508,6 @@ def create_py_executor_instance(
         resources,
         mapping,
         pytorch_backend_config,
-        executor_config,
         ctx_chunk_config,
         model_engine,
         start_worker,
@@ -515,13 +518,19 @@ def create_py_executor_instance(
         garbage_collection_gen0_threshold: Optional[int] = None,
         kv_connector_manager: Optional[KvCacheConnectorManager] = None,
         max_seq_len: Optional[int] = None,
+        max_batch_size: Optional[int] = None,
+        max_beam_width: Optional[int] = None,
+        max_num_tokens: Optional[int] = None,
+        peft_cache_config: Optional[_PeftCacheConfig] = None,
+        scheduler_config: Optional[_SchedulerConfig] = None,
+        cache_transceiver_config: Optional[_CacheTransceiverConfig] = None,
 ) -> PyExecutor:
     kv_cache_manager = resources.get(ResourceManagerType.KV_CACHE_MANAGER, None)

     spec_config = model_engine.spec_config

     logger.info(
-        f"max_seq_len={executor_config.max_seq_len}, max_num_requests={executor_config.max_batch_size}, max_num_tokens={executor_config.max_num_tokens}, max_batch_size={executor_config.max_batch_size}"
+        f"max_seq_len={max_seq_len}, max_num_requests={max_batch_size}, max_num_tokens={max_num_tokens}, max_batch_size={max_batch_size}"
     )

     for key, value in pytorch_backend_config.extra_resource_managers.items():
@@ -578,16 +587,15 @@ def create_py_executor_instance(
             len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)

         peft_cache_config_model = PeftCacheConfig.from_pybind(
-            executor_config.peft_cache_config
-        ) if executor_config.peft_cache_config is not None else PeftCacheConfig(
-        )
+            peft_cache_config
+        ) if peft_cache_config is not None else PeftCacheConfig()
         if lora_config.max_loras is not None:
             peft_cache_config_model.num_device_module_layer = \
                 max_lora_rank * num_lora_modules * lora_config.max_loras
         if lora_config.max_cpu_loras is not None:
             peft_cache_config_model.num_host_module_layer = \
                 max_lora_rank * num_lora_modules * lora_config.max_cpu_loras
-        executor_config.peft_cache_config = peft_cache_config_model._to_pybind()
+        peft_cache_config = peft_cache_config_model._to_pybind()

         from tensorrt_llm.bindings import WorldConfig
         world_config = WorldConfig(
@@ -598,7 +606,7 @@ def create_py_executor_instance(
             gpus_per_node=dist.mapping.gpus_per_node,
         )
         peft_cache_manager = PeftCacheManager(
-            peft_cache_config=executor_config.peft_cache_config,
+            peft_cache_config=peft_cache_config,
             lora_config=lora_config,
             model_config=model_binding_config,
             world_config=world_config,
@@ -609,7 +617,7 @@ def create_py_executor_instance(
             lora_config.trtllm_modules_to_hf_modules,
             lora_config.swap_gate_up_proj_lora_b_weight)

-    max_num_sequences = executor_config.max_batch_size * mapping.pp_size
+    max_num_sequences = max_batch_size * mapping.pp_size

     resources[ResourceManagerType.SEQ_SLOT_MANAGER] = SeqSlotManager(
         max_num_sequences)
@@ -632,17 +640,15 @@ def create_py_executor_instance(
         scheduler_capacity,
         kv_cache_manager.impl if kv_cache_manager is not None else None,
         peft_cache_manager.impl if peft_cache_manager is not None else None,
-        executor_config.scheduler_config.capacity_scheduler_policy,
+        scheduler_config.capacity_scheduler_policy,
         two_step_lookahead=mapping.has_pp())
-    mb_scheduler = BindMicroBatchScheduler(executor_config.max_batch_size,
-                                           executor_config.max_num_tokens,
+    mb_scheduler = BindMicroBatchScheduler(max_batch_size, max_num_tokens,
                                            ctx_chunk_config)
     scheduler = SimpleScheduler(capacity_scheduler, mb_scheduler)

     config = model_engine.model.model_config.pretrained_config
     attention_type = AttentionTypeCpp.MLA if is_mla(
         config) else AttentionTypeCpp.DEFAULT
-    cache_transceiver_config = executor_config.cache_transceiver_config
     kv_cache_transceiver = create_kv_cache_transceiver(
         mapping, kv_cache_manager, attention_type, cache_transceiver_config)
     return PyExecutor(
@@ -655,16 +661,17 @@ def create_py_executor_instance(
         max_num_sequences=max_num_sequences,
         disable_overlap_scheduler=pytorch_backend_config.
         disable_overlap_scheduler,
-        max_batch_size=executor_config.max_batch_size,
-        max_beam_width=executor_config.max_beam_width,
+        max_batch_size=max_batch_size,
+        max_beam_width=max_beam_width,
         max_draft_len=spec_config.max_draft_len
         if spec_config is not None else 0,
         kv_cache_transceiver=kv_cache_transceiver,
         guided_decoder=guided_decoder,
         start_worker=start_worker,
         garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
         kv_connector_manager=kv_connector_manager,
-        max_seq_len=max_seq_len)
+        max_seq_len=max_seq_len,
+        peft_cache_config=peft_cache_config)


 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
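
For context, here is a minimal, self-contained sketch of the pattern this diff applies to create_py_executor_instance: replace the monolithic executor_config parameter with explicit, optional keyword arguments for the handful of fields the function actually reads (max_batch_size, max_num_tokens, and so on). The stand-in names below (GrabBagConfig, build_scheduler_old, build_scheduler_new) are hypothetical illustrations of the refactor, not TensorRT-LLM APIs:

# Hypothetical stand-ins illustrating the refactor; not TensorRT-LLM APIs.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class GrabBagConfig:
    """Stand-in for the old monolithic executor_config object."""
    max_batch_size: int = 8
    max_num_tokens: int = 8192


def build_scheduler_old(config: GrabBagConfig) -> Tuple[int, int]:
    # Before: the function receives the whole config and reads two fields,
    # hiding its real dependencies from the caller.
    return (config.max_batch_size, config.max_num_tokens)


def build_scheduler_new(max_batch_size: Optional[int] = None,
                        max_num_tokens: Optional[int] = None) -> Tuple[int, int]:
    # After: dependencies are explicit keyword parameters, Optional with a
    # None default so existing call sites can migrate one argument at a time.
    return (max_batch_size, max_num_tokens)


config = GrabBagConfig()
assert build_scheduler_old(config) == build_scheduler_new(
    config.max_batch_size, config.max_num_tokens)

Defaulting the new parameters to None, as the diff does, lets call sites migrate incrementally; once every caller passes the explicit arguments, the grab-bag object can be dropped from the signature entirely, which is what removing the executor_config parameter above accomplishes for this function.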