Commit e9a1f59

trtllm-serve + autodeploy integration
Signed-off-by: Suyog Gupta <[email protected]>
1 parent e5e4170 commit e9a1f59

File tree: 4 files changed, +21 -6 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
Lines changed: 0 additions & 1 deletion

@@ -198,7 +198,6 @@ def prepare_flashinfer_metadata(
         flashinfer.get_seq_lens(paged_kv_indptr, paged_kv_last_page_len, page_size),
         position_ids.numel(),
     )
-
     # return metadata
     return (
         qo_indptr,

tensorrt_llm/_torch/auto_deploy/llm_args.py
Lines changed: 10 additions & 0 deletions

@@ -274,6 +274,16 @@ def quant_config(self, value: QuantConfig):
         self._quant_config = value

     ### VALIDATION #################################################################################
+    @field_validator("max_seq_len", mode="before")
+    @classmethod
+    def ensure_max_seq_len(cls, value: Any, info: ValidationInfo) -> Any:
+        if value is None:
+            # Fallback to the AutoDeployConfig default when not provided
+            return AutoDeployConfig.model_fields["max_seq_len"].get_default(
+                call_default_factory=True
+            )
+        return value
+
     @field_validator("build_config", mode="before")
     @classmethod
     def ensure_no_build_config(cls, value: Any, info: ValidationInfo) -> Any:
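The new ensure_max_seq_len validator combines two Pydantic v2 features: field_validator(..., mode="before") intercepts the raw value before coercion, and FieldInfo.get_default(call_default_factory=True) pulls the default off another model's field. Below is a minimal, self-contained sketch of that pattern; the class names and the 512 default are illustrative rather than taken from the TensorRT-LLM sources, and it uses validate_default=True so the fallback also fires when the field is omitted entirely (the real field definition may differ).

from typing import Any, Optional

from pydantic import BaseModel, Field, ValidationInfo, field_validator


class BaseConfig(BaseModel):
    # Stands in for AutoDeployConfig: it owns the canonical default.
    max_seq_len: int = 512


class DerivedArgs(BaseModel):
    # Stands in for the LlmArgs side; validate_default=True makes the
    # "before" validator run even when the field is omitted entirely.
    max_seq_len: Optional[int] = Field(default=None, validate_default=True)

    @field_validator("max_seq_len", mode="before")
    @classmethod
    def ensure_max_seq_len(cls, value: Any, info: ValidationInfo) -> Any:
        if value is None:
            # call_default_factory=True also covers fields whose default
            # comes from a default_factory rather than a plain value.
            return BaseConfig.model_fields["max_seq_len"].get_default(
                call_default_factory=True)
        return value


print(DerivedArgs().max_seq_len)                  # 512: fell back to BaseConfig
print(DerivedArgs(max_seq_len=2048).max_seq_len)  # 2048: explicit value wins

Running the sketch prints the fallback for an omitted or None value and the explicit value otherwise, which mirrors how the AutoDeploy LlmArgs fall back to AutoDeployConfig's max_seq_len default.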

tensorrt_llm/_torch/auto_deploy/shim/demollm.py
Lines changed: 2 additions & 0 deletions

@@ -293,6 +293,8 @@ def _unpack(inputs) -> GenerationRequest:
         del inputs, request_list, outs

         del engine
+        breakpoint()
+        print("in _run_engine, after del engine")
         gc.collect()

     def shutdown(self):

tensorrt_llm/commands/serve.py
Lines changed: 9 additions & 5 deletions

@@ -12,6 +12,7 @@
 from torch.cuda import device_count

 from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._torch.auto_deploy.llm import LLM as AutoDeployLLM
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm._utils import mpi_rank

@@ -109,7 +110,7 @@ def get_llm_args(model: str,
         capacity_scheduler_policy=CapacitySchedulerPolicy.GUARANTEED_NO_EVICT,
         dynamic_batch_config=dynamic_batch_config,
     )
-
+    backend = backend if backend in ["pytorch", "_autodeploy"] else None
     llm_args = {
         "model":
         model,

@@ -140,7 +141,7 @@ def get_llm_args(model: str,
         "kv_cache_config":
         kv_cache_config,
         "backend":
-        backend if backend == "pytorch" else None,
+        backend,
         "num_postprocess_workers":
         num_postprocess_workers,
         "postprocess_tokenizer_dir":

@@ -162,9 +163,12 @@ def launch_server(host: str,

     backend = llm_args["backend"]
     model = llm_args["model"]
-
     if backend == 'pytorch':
         llm = PyTorchLLM(**llm_args)
+    elif backend == '_autodeploy':
+        print(f"Using AutoDeploy backend with args: {llm_args}")
+        del llm_args["build_config"]
+        llm = AutoDeployLLM(**llm_args)
     else:
         llm = LLM(**llm_args)

@@ -205,9 +209,9 @@ def launch_mm_encoder_server(
               help="Hostname of the server.")
 @click.option("--port", type=int, default=8000, help="Port of the server.")
 @click.option("--backend",
-              type=click.Choice(["pytorch", "trt"]),
+              type=click.Choice(["pytorch", "trt", "_autodeploy"]),
               default="pytorch",
-              help="Set to 'pytorch' for pytorch path. Default is cpp path.")
+              help="Set to 'pytorch' for pytorch path and '_autodeploy' for autodeploy path. Default is pytorch path.")
 @click.option('--log_level',
               type=click.Choice(severity_map.keys()),
               default='info',
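
With the new choice wired through get_llm_args and launch_server, the AutoDeploy path should be selectable directly from the CLI. A hedged usage sketch (the model argument is a placeholder, not part of this diff):

trtllm-serve <model-or-checkpoint-dir> --backend _autodeploy

Any other value, including "trt", falls outside ["pytorch", "_autodeploy"] and is normalized to None in get_llm_args, so it still dispatches to the TensorRT engine LLM in launch_server.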
