Commit e9a1f59

trtllm-serve + autodeploy integration
Signed-off-by: Suyog Gupta <[email protected]>
1 parent e5e4170 commit e9a1f59

File tree: 4 files changed, +21 -6 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
Lines changed: 0 additions & 1 deletion

@@ -198,7 +198,6 @@ def prepare_flashinfer_metadata(
         flashinfer.get_seq_lens(paged_kv_indptr, paged_kv_last_page_len, page_size),
         position_ids.numel(),
     )
-
     # return metadata
     return (
         qo_indptr,

tensorrt_llm/_torch/auto_deploy/llm_args.py
Lines changed: 10 additions & 0 deletions

@@ -274,6 +274,16 @@ def quant_config(self, value: QuantConfig):
         self._quant_config = value

     ### VALIDATION #################################################################################
+    @field_validator("max_seq_len", mode="before")
+    @classmethod
+    def ensure_max_seq_len(cls, value: Any, info: ValidationInfo) -> Any:
+        if value is None:
+            # Fallback to the AutoDeployConfig default when not provided
+            return AutoDeployConfig.model_fields["max_seq_len"].get_default(
+                call_default_factory=True
+            )
+        return value
+
     @field_validator("build_config", mode="before")
     @classmethod
     def ensure_no_build_config(cls, value: Any, info: ValidationInfo) -> Any:
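The new ensure_max_seq_len validator combines two Pydantic v2 features: field_validator(..., mode="before") intercepts the raw value before coercion, and FieldInfo.get_default(call_default_factory=True) pulls the default off another model's field. Below is a minimal, self-contained sketch of that pattern; the class names and the 512 default are illustrative rather than taken from the TensorRT-LLM sources, and it uses validate_default=True so the fallback also fires when the field is omitted entirely (the real field definition may differ).

from typing import Any, Optional

from pydantic import BaseModel, Field, ValidationInfo, field_validator


class BaseConfig(BaseModel):
    # Stands in for AutoDeployConfig: it owns the canonical default.
    max_seq_len: int = 512


class DerivedArgs(BaseModel):
    # Stands in for the LlmArgs side; validate_default=True makes the
    # "before" validator run even when the field is omitted entirely.
    max_seq_len: Optional[int] = Field(default=None, validate_default=True)

    @field_validator("max_seq_len", mode="before")
    @classmethod
    def ensure_max_seq_len(cls, value: Any, info: ValidationInfo) -> Any:
        if value is None:
            # call_default_factory=True also covers fields whose default
            # comes from a default_factory rather than a plain value.
            return BaseConfig.model_fields["max_seq_len"].get_default(
                call_default_factory=True)
        return value


print(DerivedArgs().max_seq_len)                  # 512: fell back to BaseConfig
print(DerivedArgs(max_seq_len=2048).max_seq_len)  # 2048: explicit value wins

Running the sketch prints the fallback for an omitted or None value and the explicit value otherwise, which mirrors how the AutoDeploy LlmArgs fall back to AutoDeployConfig's max_seq_len default.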

tensorrt_llm/_torch/auto_deploy/shim/demollm.py
Lines changed: 2 additions & 0 deletions

@@ -293,6 +293,8 @@ def _unpack(inputs) -> GenerationRequest:
         del inputs, request_list, outs

         del engine
+        breakpoint()
+        print("in _run_engine, after del engine")
         gc.collect()

     def shutdown(self):

tensorrt_llm/commands/serve.py
Lines changed: 9 additions & 5 deletions

@@ -12,6 +12,7 @@
 from torch.cuda import device_count

 from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._torch.auto_deploy.llm import LLM as AutoDeployLLM
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm._utils import mpi_rank

@@ -109,7 +110,7 @@ def get_llm_args(model: str,
         capacity_scheduler_policy=CapacitySchedulerPolicy.GUARANTEED_NO_EVICT,
         dynamic_batch_config=dynamic_batch_config,
     )
-
+    backend = backend if backend in ["pytorch", "_autodeploy"] else None
     llm_args = {
         "model":
         model,

@@ -140,7 +141,7 @@ def get_llm_args(model: str,
         "kv_cache_config":
         kv_cache_config,
         "backend":
-        backend if backend == "pytorch" else None,
+        backend,
         "num_postprocess_workers":
         num_postprocess_workers,
         "postprocess_tokenizer_dir":

@@ -162,9 +163,12 @@ def launch_server(host: str,

     backend = llm_args["backend"]
     model = llm_args["model"]
-
     if backend == 'pytorch':
         llm = PyTorchLLM(**llm_args)
+    elif backend == '_autodeploy':
+        print(f"Using AutoDeploy backend with args: {llm_args}")
+        del llm_args["build_config"]
+        llm = AutoDeployLLM(**llm_args)
     else:
         llm = LLM(**llm_args)

@@ -205,9 +209,9 @@ def launch_mm_encoder_server(
               help="Hostname of the server.")
 @click.option("--port", type=int, default=8000, help="Port of the server.")
 @click.option("--backend",
-              type=click.Choice(["pytorch", "trt"]),
+              type=click.Choice(["pytorch", "trt", "_autodeploy"]),
               default="pytorch",
-              help="Set to 'pytorch' for pytorch path. Default is cpp path.")
+              help="Set to 'pytorch' for pytorch path and '_autodeploy' for autodeploy path. Default is pytorch path.")
 @click.option('--log_level',
               type=click.Choice(severity_map.keys()),
               default='info',
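
With the new choice wired through get_llm_args and launch_server, the AutoDeploy path should be selectable directly from the CLI. A hedged usage sketch (the model argument is a placeholder, not part of this diff):

trtllm-serve <model-or-checkpoint-dir> --backend _autodeploy

Any other value, including "trt", falls outside ["pytorch", "_autodeploy"] and is normalized to None in get_llm_args, so it still dispatches to the TensorRT engine LLM in launch_server.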
