diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py
index 65423e3f8ed..00f04a1d0a1 100644
--- a/tensorrt_llm/executor/base_worker.py
+++ b/tensorrt_llm/executor/base_worker.py
@@ -125,12 +125,13 @@ def _configure_affinity(self, device_id):
         Note: If the process already has constrained affinity, a warning is
         logged. Configuration is handled as follows:
 
-        TLLM_NUMA_WORKER_AFFINITY =
-            -> affinity is auto-configured only if it is unconstrained
-        TLLM_NUMA_WORKER_AFFINITY = 1
-            -> affinity is unconditionally auto-configured
-        TLLM_NUMA_WORKER_AFFINITY = 0 or any other value
-            -> affinity is unconditionally _not_ auto-configured
+        TLLM_NUMA_AWARE_WORKER_AFFINITY =
+            -> Affinity is auto-configured if it is unconstrained; any
+               constraint imposed externally by the user is removed.
+        TLLM_NUMA_AWARE_WORKER_AFFINITY = 1
+            -> Affinity is unconditionally auto-configured.
+        TLLM_NUMA_AWARE_WORKER_AFFINITY = 0 or any other value
+            -> Affinity is unconditionally _not_ auto-configured.
         '''
 
         # Get the current affinity setting
@@ -141,22 +142,31 @@ def _configure_affinity(self, device_id):
 
         all_cpus = list(range(psutil.cpu_count()))
         constrained_affinity = (cpu_affinity != all_cpus)
+        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")
 
-        # If the process is affined to a constrained set of CPUs, warn the user
-        # so as to ensure that this is what is intended
+        # If affinity is constrained but the user hasn't explicitly
+        # requested NUMA-aware affinity, remove the constraints.
         if constrained_affinity:
             logger.warning(
                 f"Worker process {pid} is affined to run on the following CPUs: "
                 f"{cpu_affinity} (subset of all logical CPUs). This may harm "
                 f"performance if set incorrectly.")
+            if numa_aware_affinity is None:
+                logger.warning(
+                    f"Worker process {pid} has constrained CPU affinity "
+                    f"but `TLLM_NUMA_AWARE_WORKER_AFFINITY` is not set. "
+                    f"Removing CPU affinity constraints.")
+                process.cpu_affinity(all_cpus)
 
         # If affinity is unconstrained and the user hasn't explicitly
         # prohibited it or the user has explicitly requested it, choose the
         # optimal affinity based upon the NUMA topology
-        numa_aware_affinity = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")
         if ((numa_aware_affinity is None and not constrained_affinity)
                 or (numa_aware_affinity == "1")):
             process.cpu_affinity(get_numa_aware_cpu_affinity(device_id))
+            logger.info(
+                f"Worker process {pid} CPU affinity set to "
+                f"{process.cpu_affinity()} for optimal NUMA-aware scheduling.")
 
     def _get_comm_ranks_device_id(self):
         device_id = self.global_rank % torch.cuda.device_count()
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 321501038f3..ce9f3efe912 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -95,7 +95,8 @@ def launch_disaggregated_llm(
         ctx_model: str = None,
         gen_model: str = None,
         server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16):
+        max_workers: int = 16,
+        enable_perf: bool = False):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -104,9 +105,7 @@ def launch_disaggregated_llm(
         print(
             f"Using unified tp parameter for testing is not recommended. Please use server configs instead."
         )
-
-    enable_perf = True
-    perf_max_requests = 10000
+    perf_max_requests = 50
 
     def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
         if not isinstance(cfg, dict):
@@ -120,6 +119,7 @@ def _apply_perf_flags(cfg: Optional[Dict[str, Any]]):
 
     _apply_perf_flags(disaggregated_server_config)
     _apply_perf_flags(ctx_server_config)
    _apply_perf_flags(gen_server_config)
+
     disaggregated_server_config = revise_disaggregated_server_config_urls_with_free_ports(
         disaggregated_server_config)
@@ -366,7 +366,7 @@ def _get_perf_metrics():
         except requests.exceptions.RequestException as e:
             print(f"Error fetching {perf_url}: {e}")
 
-    def _show_kvcache_time(kv_cache_perf_dir, max_lines=1000):
+    def _show_kvcache_time(kv_cache_perf_dir, max_lines=100):
         print(f"kv_cache_perf_dir: {kv_cache_perf_dir}")
         for file in os.listdir(kv_cache_perf_dir):
             print(f"file: {file}")
@@ -475,9 +475,6 @@ def test_auto_dtype(self, disable_overlap_scheduler, ctx_enable_block_reuse,
             "disable_overlap_scheduler": disable_overlap_scheduler,
             "kv_cache_config": {
                 "enable_block_reuse": gen_enable_block_reuse
-            },
-            "cache_transceiver_config": {
-                "backend": "DEFAULT"
             }
         }
         gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 557e5a87cc2..6d321550994 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -349,8 +349,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-b
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854)
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
@@ -370,10 +368,7 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] SKIP
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP (https://nvbugs/5664904)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469)
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] SKIP (https://nvbugs/5670480)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-False] SKIP (https://nvbugs/5670480)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5673578)
 examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
@@ -393,7 +388,6 @@ unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_
 test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5688388)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-True-True] SKIP (https://nvbugs/5670480)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
 unittest/_torch/speculative/test_eagle3.py::test_llama_eagle3[True-FLASHINFER-False-False-False-False-True-False-False] SKIP (https://nvbugs/5691246)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5698897)
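
Reviewer note (illustrative, not part of the patch): the affinity policy added to _configure_affinity above can be exercised in isolation. Below is a minimal sketch of the same control flow, assuming only that psutil is installed; numa_local_cpus is a hypothetical stand-in for TensorRT-LLM's get_numa_aware_cpu_affinity, and the comments paraphrase rather than quote the shipped log messages.

    import os
    import psutil

    def numa_local_cpus(device_id):
        # Hypothetical stand-in for get_numa_aware_cpu_affinity(device_id);
        # a real implementation would return the logical CPUs local to the
        # GPU's NUMA node. Here it simply returns every logical CPU.
        return list(range(psutil.cpu_count()))

    def resolve_affinity(device_id):
        """Sketch of the TLLM_NUMA_AWARE_WORKER_AFFINITY policy."""
        process = psutil.Process()
        all_cpus = list(range(psutil.cpu_count()))
        constrained = process.cpu_affinity() != all_cpus
        mode = os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY")

        # Variable unset + externally constrained: drop the constraint
        # (mirrors the new warning-and-reset branch in the patch).
        if constrained and mode is None:
            process.cpu_affinity(all_cpus)

        # Variable unset + unconstrained, or explicitly enabled with "1":
        # pin the worker to the CPUs nearest the given GPU. Any other
        # value (e.g. "0") leaves affinity untouched.
        if (mode is None and not constrained) or mode == "1":
            process.cpu_affinity(numa_local_cpus(device_id))

        return process.cpu_affinity()

For example, launching a worker with TLLM_NUMA_AWARE_WORKER_AFFINITY=1 forces NUMA-aware pinning even when taskset or numactl has already constrained the process, while TLLM_NUMA_AWARE_WORKER_AFFINITY=0 disables the auto-configuration entirely.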