5 files changed: +25 −16 lines changed. Changed paths include examples/wide_ep/slurm_scripts and tests/integration/test_lists.
lines changed Original file line number Diff line number Diff line change 11#  This file defines code ownership rules for the repository.
22
3- #  The following rule should only be uncommented on release branches (e.g., release/0.19).
4- #  The rule below requires that any PR to release/**/* branches must be approved by at least one member
5- #  of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
6- #  Without approval from a member of this team, PRs cannot be merged to release branches.
7- *  @ NVIDIA/trt-llm-release-branch-approval 
83
94#  TensorRT-LLM Pytorch backend
105/tensorrt_llm /_torch  @ NVIDIA/trt-llm-torch-devs 
@@ -155,3 +150,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
155150#  from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
156151/tests /unittest /api_stability / @ NVIDIA/trt-llm-noncommitted-api-review-committee 
157152/tests /unittest /api_stability /references_committed / @ NVIDIA/trt-llm-committed-api-review-committee 
153+ 
154+ #  The following rule should only be uncommented on release branches (e.g., release/0.19).
155+ #  The rule below requires that any PR to release/**/* branches must be approved by at least one member
156+ #  of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
157+ #  Without approval from a member of this team, PRs cannot be merged to release branches.
158+ *  @ NVIDIA/trt-llm-release-branch-approval 
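Note on the move: CODEOWNERS resolves ownership by the *last* matching pattern, so a catch-all `*` rule only takes effect when it appears after the more specific rules; at the top of the file it would be shadowed by every later match. A minimal sketch of that resolution order (simplified glob matching via fnmatch, not GitHub's exact implementation; patterns and teams taken from the diff):

```python
from fnmatch import fnmatch

# Rules in file order; the LAST matching pattern decides ownership.
rules = [
    ("tensorrt_llm/_torch/*", "@NVIDIA/trt-llm-torch-devs"),
    ("*", "@NVIDIA/trt-llm-release-branch-approval"),  # catch-all placed last
]

def owners_for(path: str):
    owner = None
    for pattern, team in rules:
        if fnmatch(path, pattern):
            owner = team  # a later match overrides an earlier one
    return owner

# With the catch-all at the bottom, it wins for every file:
assert owners_for("tensorrt_llm/_torch/attention.py") == "@NVIDIA/trt-llm-release-branch-approval"
```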
@@ -11,7 +11,7 @@ workdir=<workdir>  # Path to disaggr_torch.slurm
model_dir=<model_dir>  # Path to the model checkpoint

mtp_size=0
-ntasks_per_node=4  # 4 GPUs per GB200 node
+ntasks_per_node=4  # 4 GPUs per GB200 node, 8 GPUs per B200 node

isl=1024
osl=1024
@@ -22,8 +22,9 @@ streaming=true
for b in 1 64 1024; do
    for eplb_num_slots in 0 256 288; do
        concurrency=$((b * 16))
-        ctx_num=$(((concurrency + 5499)/5500))
-        total_node_num=$((ctx_num + 4))
+        ctx_node_num=$(((concurrency + 5499)/5500))  # $(((concurrency + 10999)/11000)) for B200
+        ctx_num=${ctx_node_num}  # $((ctx_node_num * 2)) for B200
+        total_node_num=$((ctx_node_num + 4))  # $((ctx_node_num + 2)) for B200
        ntasks=$((total_node_num * ntasks_per_node))

        args=(
@@ -56,8 +57,9 @@
# dep32 eplb288
for b in 512; do
    concurrency=$((b * 32))
-    ctx_num=$(((concurrency + 5499)/5500))
-    total_node_num=$((ctx_num + 8))
+    ctx_node_num=$(((concurrency + 5499)/5500))  # $(((concurrency + 10999)/11000)) for B200
+    ctx_num=${ctx_node_num}  # $((ctx_node_num * 2)) for B200
+    total_node_num=$((ctx_node_num + 8))  # $((ctx_node_num + 4)) for B200
    ntasks=$((total_node_num * ntasks_per_node))
    eplb_num_slots=288
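The node arithmetic above is bash integer ceiling division. A standalone Python sketch of the same computation, assuming (per the script's comments) a capacity of roughly 5500 concurrent requests per GB200 context node and 11000 per B200 node; `plan_nodes` and `gen_node_num` are illustrative names, not part of the script:

```python
import math

def plan_nodes(concurrency: int, is_b200: bool = False,
               gen_node_num: int = 4) -> tuple[int, int, int]:
    """Sketch of the node-count arithmetic above (assumed capacity:
    ~5500 concurrent requests per GB200 context node, ~11000 per
    B200 node, which packs twice as many GPUs)."""
    per_node = 11000 if is_b200 else 5500
    # $(((concurrency + 5499)/5500)) in bash is ceiling division
    ctx_node_num = math.ceil(concurrency / per_node)
    # B200 nodes host two context servers each; GB200 nodes host one
    ctx_num = ctx_node_num * 2 if is_b200 else ctx_node_num
    total_node_num = ctx_node_num + (gen_node_num // 2 if is_b200 else gen_node_num)
    return ctx_node_num, ctx_num, total_node_num

# b=64 in the dep16 loop: concurrency = 64 * 16 = 1024
print(plan_nodes(1024))                # (1, 1, 5) on GB200
print(plan_nodes(1024, is_b200=True))  # (1, 2, 3) on B200
```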
@@ -182,6 +182,9 @@ def _forward_nope(
                                        attention_mask=attention_mask,
                                        mrope_config=mrope_config)

+        if isinstance(attn_output, tuple):
+            attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
+
        attn_output = self.o_proj(attn_output,
                                  all_reduce_params=all_reduce_params)
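For context on the new branch: some attention paths return FP4-quantized output as a (data, scales) tuple, and the added check wraps that pair into a single Fp4QuantizedTensor before it reaches o_proj. A minimal sketch of the pattern; the dataclass below is a stand-in with illustrative field names, not the real TensorRT-LLM class:

```python
from dataclasses import dataclass
import torch

@dataclass
class Fp4QuantizedTensor:          # stand-in for TensorRT-LLM's class
    fp4_tensor: torch.Tensor       # packed FP4 activation data
    scaling_factor: torch.Tensor   # per-block scale factors

def normalize_attn_output(attn_output):
    # Backends that quantize their output return (data, scales); wrap the
    # pair so downstream layers receive a single argument either way.
    if isinstance(attn_output, tuple):
        attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
    return attn_output
```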
@@ -234,10 +234,11 @@
    help="Path where per request information is written to.",
)
@optgroup.option(
-    "--enable_chunked_context/--disable_chunked_context",
-    default=True,
-    help=
-    "Enable/disable chunking in prefill stage for enhanced throughput benchmark. "
+    "--enable_chunked_context",
+    is_flag=True,
+    default=None,
+    help="Enable chunking in prefill stage for enhanced throughput benchmark. "
+    "Default is False for PyTorch/AutoDeploy backend, True for TensorRT backend.",
)
@optgroup.option(
    "--scheduler_policy",
@@ -348,8 +349,11 @@ def throughput_command(
    kv_cache_percent = params.get("kv_cache_free_gpu_mem_fraction")
    beam_width = params.get("beam_width")
    streaming: bool = params.get("streaming")
-    enable_chunked_context: bool = params.get("enable_chunked_context")
    scheduler_policy: str = params.get("scheduler_policy")
+    enable_chunked_context: bool = params.get("enable_chunked_context")
+    if enable_chunked_context is None:
+        # Set default based on backend: True for TensorRT, False for others
+        enable_chunked_context = backend.lower() == "tensorrt"

    # Update configuration with runtime options
    exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
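The option change turns a paired on/off flag (which always yielded True or False) into a tri-state flag: with is_flag=True and default=None, an absent flag is distinguishable from an explicit one, so the command body can pick a backend-dependent default. A standalone sketch of the pattern with plain click (the real code uses click-option-group's @optgroup.option; the flag semantics are the same):

```python
import click

@click.command()
@click.option("--enable_chunked_context", is_flag=True, default=None,
              help="Enable chunking in the prefill stage.")
@click.option("--backend", default="pytorch")
def throughput(enable_chunked_context, backend):
    if enable_chunked_context is None:
        # Flag absent on the command line: default to True only for TensorRT.
        enable_chunked_context = backend.lower() == "tensorrt"
    click.echo(f"enable_chunked_context={enable_chunked_context}")

if __name__ == "__main__":
    throughput()
```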
@@ -243,7 +243,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-re
examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414)
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)
-test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5410399)
unittest/trt/attention/test_gpt_attention.py -k "partition0" SKIP (https://nvbugs/5412456)
unittest/trt/attention/test_gpt_attention.py -k "partition1" SKIP (https://nvbugs/5412456)