4 changes: 1 addition & 3 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -243,9 +243,7 @@ def forward(
         return {"logits": logits_flat}


-def create_autodeploy_executor(
-    executor_config: ExecutorConfig, checkpoint_dir: str = None, engine_dir: str = None
-):
+def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir: str = None):
     """Create an AutoDeploy executor from the given configuration and checkpoint directory.

     This is the entrypoint API to the _autodeploy backend.
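With the `engine_dir` parameter gone, the AutoDeploy executor is built from an executor config and a checkpoint directory alone. A minimal sketch of the updated call site, with an illustrative path (the config object is assumed to be prepared by the caller):

from tensorrt_llm._torch.auto_deploy.shim.ad_executor import create_autodeploy_executor

# executor_config is an ExecutorConfig built elsewhere; the path is a placeholder.
executor = create_autodeploy_executor(
    executor_config,
    checkpoint_dir="/models/llama-3-8b",
)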
3 changes: 0 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/config.py
@@ -95,7 +95,6 @@ class PyTorchConfig:
     'tokens_per_block',
     'mapping',
     'hf_model_dir',
-    'trt_engine_dir',
 ]


@@ -107,7 +106,6 @@ def update_executor_config(
         build_config: Optional[BuildConfig] = None,
         speculative_config: Optional[SpecConfig] = None,
         hf_model_dir: Optional[str] = None,
-        trt_engine_dir: Optional[str] = None,
         max_input_len: Optional[int] = None,
         max_seq_len: Optional[int] = None):
     if backend is None:
@@ -131,7 +129,6 @@
     executor_config.tokens_per_block = executor_config.tokens_per_block or build_config.plugin_config.tokens_per_block

     executor_config.hf_model_dir = hf_model_dir
-    executor_config.trt_engine_dir = trt_engine_dir

     if max_input_len is not None:
         executor_config.max_input_len = max_input_len
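`update_executor_config` now forwards only the HF model directory; there is no engine directory to mirror onto the config. A hedged sketch of a call site after this change (argument values are placeholders; the keyword set is taken from the hunk above):

update_executor_config(
    executor_config,
    backend="pytorch",
    hf_model_dir="/models/llama-3-8b",  # the only model-location knob left
    max_input_len=2048,
    max_seq_len=4096,
)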
2 changes: 0 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -282,8 +282,6 @@ def __init__(
         self.is_cuda_graph_dummy = False
         self.py_lora_task_layer_module_configs = None

-        self.py_tokens = super().get_tokens()
-
         self.py_return_log_probs = return_log_probs
         self.py_return_context_logits = return_context_logits
         self.py_return_generation_logits = return_generation_logits
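Dropping the cached `py_tokens` means the request no longer snapshots the base class's tokens at construction time; readers presumably go through the inherited accessor so they always see tokens appended after `__init__`. A short illustration of that access pattern (the `request` object is assumed):

# Before: request.py_tokens was a copy taken in __init__ and could go stale.
# After: read the live token state through the inherited accessor.
tokens = request.get_tokens()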
1 change: 0 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -180,7 +180,6 @@ def _get_mapping(executor_config: ExecutorConfig) -> Mapping:
 def create_py_executor(
         executor_config: ExecutorConfig,
         checkpoint_dir: str = None,
-        engine_dir: str = None,
         lora_config: Optional[LoraConfig] = None,
         garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
     _mangle_executor_config(executor_config)
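The PyTorch executor factory now matches the AutoDeploy one: config, checkpoint, and the optional LoRA and garbage-collection knobs, with no engine directory. A sketch of the reduced call, with placeholder values:

py_executor = create_py_executor(
    executor_config,
    checkpoint_dir="/models/llama-3-8b",      # illustrative path
    lora_config=None,                         # optional LoraConfig
    garbage_collection_gen0_threshold=20000,  # optional; the value here is arbitrary
)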
2 changes: 0 additions & 2 deletions tensorrt_llm/executor/worker.py
@@ -119,7 +119,6 @@ def _create_engine():
             args = {
                 "executor_config": executor_config,
                 "checkpoint_dir": executor_config.hf_model_dir,
-                "engine_dir": executor_config.trt_engine_dir,
             }
             if executor_config.backend == "pytorch":
                 from tensorrt_llm._torch.pyexecutor.py_executor_creator import \
@@ -135,7 +134,6 @@ def _create_engine():
             else:
                 raise ValueError(
                     f"Unsupported backend config: {executor_config.backend}")
-
             return create_executor(**args)

         self.engine = _create_engine()
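Both factories now accept the same keyword set, so the worker can assemble one `args` dict and pick the factory by backend. A condensed sketch of that dispatch; the names come from this hunk, while the `_autodeploy` branch and its import are inferred from the shim module's docstring:

args = {
    "executor_config": executor_config,
    "checkpoint_dir": executor_config.hf_model_dir,
}
if executor_config.backend == "pytorch":
    from tensorrt_llm._torch.pyexecutor.py_executor_creator import create_py_executor
    create_executor = create_py_executor
elif executor_config.backend == "_autodeploy":  # assumed backend string
    from tensorrt_llm._torch.auto_deploy.shim.ad_executor import create_autodeploy_executor
    create_executor = create_autodeploy_executor
else:
    raise ValueError(f"Unsupported backend config: {executor_config.backend}")
engine = create_executor(**args)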
1 change: 0 additions & 1 deletion tensorrt_llm/llmapi/llm.py
@@ -688,7 +688,6 @@ def _build_model(self):
             if self._on_trt_backend else None,
             speculative_config=self.args.speculative_config,
             hf_model_dir=self._hf_model_dir,
-            trt_engine_dir=self._engine_dir,
             max_input_len=self.args.max_input_len,
             max_seq_len=max_seq_len)
         self._executor_config.llm_parallel_config = self.args.parallel_config
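None of this changes the user-facing API: `LLM` still resolves the checkpoint itself, and on the PyTorch backend there is no engine directory to point at. A hedged usage sketch with an illustrative model name:

from tensorrt_llm import LLM

# PyTorch-backend flow: only the HF checkpoint matters; no TRT engine dir is involved.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
outputs = llm.generate(["Hello, world"])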