From 74e28d1d9a6b4f60290dfdb2fa168956a01b5853 Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Thu, 30 Apr 2026 17:04:10 +0300 Subject: [PATCH 1/6] [Perf] Warmup forward_native sampler kernel Signed-off-by: Artem Perevedentsev --- vllm/v1/worker/gpu_model_runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e2af34eecb96..f6a46b355238 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5664,6 +5664,14 @@ def _dummy_sampler_run( sampler_output = self.sampler( logits=logits, sampling_metadata=dummy_metadata ) + # Also warm forward_native (taken when generators dict is non-empty). + self.sampler( + logits=logits, + sampling_metadata=replace( + dummy_metadata, + generators={0: torch.Generator(device=self.device).manual_seed(0)}, + ), + ) except RuntimeError as e: if "out of memory" in str(e): raise RuntimeError( From bd90a871212f01776ac6720258b31a1c0db58c02 Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Thu, 30 Apr 2026 17:22:56 +0300 Subject: [PATCH 2/6] Update vllm/v1/worker/gpu_model_runner.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Artem Perevedentsev --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f6a46b355238..6f289317068a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5662,11 +5662,11 @@ def _dummy_sampler_run( ) try: sampler_output = self.sampler( - logits=logits, sampling_metadata=dummy_metadata + logits=logits.clone(), sampling_metadata=dummy_metadata ) # Also warm forward_native (taken when generators dict is non-empty). self.sampler( - logits=logits, + logits=logits.clone(), sampling_metadata=replace( dummy_metadata, generators={0: torch.Generator(device=self.device).manual_seed(0)}, From 35b8d4f5e71468a235ca3caa1337ccedd731e9cc Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Fri, 1 May 2026 10:38:41 +0300 Subject: [PATCH 3/6] [Bugfix] Skip redundant native sampler warmup when forward is already native Signed-off-by: Artem Perevedentsev --- vllm/v1/worker/gpu_model_runner.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02ede5915d29..c07245118e17 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5667,14 +5667,27 @@ def _dummy_sampler_run( sampler_output = self.sampler( logits=logits.clone(), sampling_metadata=dummy_metadata ) - # Also warm forward_native (taken when generators dict is non-empty). - self.sampler( - logits=logits.clone(), - sampling_metadata=replace( - dummy_metadata, - generators={0: torch.Generator(device=self.device).manual_seed(0)}, - ), - ) + # Also warm forward_native (taken when generators dict is non-empty), + # but only when the main forward path is not already native — in + # that case the extra call is redundant and inflates peak memory + # during profile_run (e.g. logprobs_mode='processed_logprobs', + # where TopKTopPSampler binds forward = forward_native). + # Compare underlying functions: bound methods are recreated on each + # attribute access, so `forward is forward_native` is always False. + topk_topp_sampler = self.sampler.topk_topp_sampler + if ( + topk_topp_sampler.forward.__func__ + is not topk_topp_sampler.forward_native.__func__ + ): + self.sampler( + logits=logits.clone(), + sampling_metadata=replace( + dummy_metadata, + generators={ + 0: torch.Generator(device=self.device).manual_seed(0) + }, + ), + ) except RuntimeError as e: if "out of memory" in str(e): raise RuntimeError( From 64f9108e4378696106e93cd662b9fba9099bb1ba Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Fri, 1 May 2026 11:04:05 +0300 Subject: [PATCH 4/6] Use logprobs_mode in sampler warmup conditional (mypy fix) Signed-off-by: Artem Perevedentsev --- vllm/v1/worker/gpu_model_runner.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a282baab89da..c83b00e89aa9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5702,16 +5702,13 @@ def _dummy_sampler_run( logits=logits.clone(), sampling_metadata=dummy_metadata ) # Also warm forward_native (taken when generators dict is non-empty), - # but only when the main forward path is not already native — in - # that case the extra call is redundant and inflates peak memory - # during profile_run (e.g. logprobs_mode='processed_logprobs', - # where TopKTopPSampler binds forward = forward_native). - # Compare underlying functions: bound methods are recreated on each - # attribute access, so `forward is forward_native` is always False. - topk_topp_sampler = self.sampler.topk_topp_sampler - if ( - topk_topp_sampler.forward.__func__ - is not topk_topp_sampler.forward_native.__func__ + # but skip the extra call in 'processed_logits' / 'processed_logprobs' + # modes — there TopKTopPSampler binds forward = forward_native at + # init time, so the warmup call is redundant and only inflates peak + # memory during profile_run. + if self.sampler.logprobs_mode not in ( + "processed_logits", + "processed_logprobs", ): self.sampler( logits=logits.clone(), From 787bdbfd2ab799b6da130a9341259a1280614e4d Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Fri, 1 May 2026 11:41:25 +0300 Subject: [PATCH 5/6] Trigger CI Signed-off-by: Artem Perevedentsev From 0bab818e10999503dfe59309035b2cae73a699d5 Mon Sep 17 00:00:00 2001 From: Artem Perevedentsev Date: Fri, 1 May 2026 13:27:33 +0300 Subject: [PATCH 6/6] [Bugfix] Drop logits.clone() in dummy sampler run to fix processed_logprobs OOM Signed-off-by: Artem Perevedentsev --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c83b00e89aa9..bcab2ca2d4c2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5699,19 +5699,21 @@ def _dummy_sampler_run( ) try: sampler_output = self.sampler( - logits=logits.clone(), sampling_metadata=dummy_metadata + logits=logits, sampling_metadata=dummy_metadata ) # Also warm forward_native (taken when generators dict is non-empty), # but skip the extra call in 'processed_logits' / 'processed_logprobs' # modes — there TopKTopPSampler binds forward = forward_native at # init time, so the warmup call is redundant and only inflates peak # memory during profile_run. + # No .clone() of logits: warmup output is discarded, so any in-place + # mutation by forward_native does not affect correctness. if self.sampler.logprobs_mode not in ( "processed_logits", "processed_logprobs", ): self.sampler( - logits=logits.clone(), + logits=logits, sampling_metadata=replace( dummy_metadata, generators={