We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent a8b366e · commit 280e06d · Copy full SHA for 280e06d
docs/source/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.md
@@ -47,6 +47,7 @@ stream_interval: 20
47
num_postprocess_workers: 4
48
kv_cache_config:
49
enable_block_reuse: false
50
+ free_gpu_memory_fraction: 0.6
51
EOF
52
```
53
@@ -60,10 +61,10 @@ trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking \
60
61
--host 0.0.0.0 \
62
--port 8000 \
63
--backend pytorch \
- --max_batch_size 1 \
64
+ --max_batch_size 720 \
65
--max_num_tokens 4096 \
- --kv_cache_free_gpu_memory_fraction 0.6 \
66
--tp_size 4 \
67
+ --pp_size 1 \
68
--ep_size 4 \
69
--trust_remote_code \
70
--extra_llm_api_options ${EXTRA_LLM_API_FILE}
0 commit comments