diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index 086696735bf..dd8705ce629 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -76,6 +76,7 @@ worker_config: - 2048 - 256 print_iter_log: true + trust_remote_code: true kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.8 @@ -102,6 +103,7 @@ worker_config: enable_attention_dp: true pipeline_parallel_size: 1 print_iter_log: true + trust_remote_code: true cuda_graph_config: null disable_overlap_scheduler: true kv_cache_config: diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark.sh b/examples/disaggregated/slurm/benchmark/run_benchmark.sh index 3651082d656..457548a34e8 100644 --- a/examples/disaggregated/slurm/benchmark/run_benchmark.sh +++ b/examples/disaggregated/slurm/benchmark/run_benchmark.sh @@ -52,6 +52,7 @@ for concurrency in ${concurrency_list}; do --dataset-path ${dataset_file} \ --num-prompts ${num_prompts} \ --max-concurrency ${concurrency} \ + --trust-remote-code \ --ignore-eos \ --no-test-input \ --save-result \ diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh index 0efce0c52ce..1e2943b8ebd 100644 --- a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh +++ b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh @@ -75,6 +75,7 @@ for concurrency in ${concurrency_list}; do --dataset-name random \ --num-prompts "${num_prompts}" \ --max-concurrency "${concurrency}" \ + --trust-remote-code \ --ignore-eos \ --random-input-len "${input_seq_len}" \ --random-output-len "${output_seq_len}" \ diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 476a5a33f9d..37efd0764b3 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -105,8 +105,13 @@ def submit_job(config): log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}") # Get eplb num_slots for gen worker - eplb_num_slots = (config['worker_config']['gen'].get('moe_config', {}).get( - 'load_balancer', {}).get('num_slots', 0)) + load_balancer_config = config['worker_config']['gen'].get( + 'moe_config', {}).get('load_balancer', {}) + if isinstance(load_balancer_config, str): + with open(load_balancer_config, 'r') as f: + load_balancer_config = yaml.safe_load(f) + eplb_num_slots = load_balancer_config.get('num_slots', 0) + # Determine directory suffix based on attention_dp if gen_enable_attention_dp: dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}" diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index d47743cf8f0..4a891c3755e 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -761,7 +761,6 @@ def _handle_response(self, response: "GenerationExecutor.Response"): beam_output.finish_reason = 'stop' beam_output.stop_reason = stop_reason - self.abort() self._done = True break