From c08dc0a93ca66069069b0f761483bc474ebb0299 Mon Sep 17 00:00:00 2001 From: sourashis Date: Sun, 18 Aug 2024 02:14:21 +0000 Subject: [PATCH 01/38] Add cuda graph support during decoding for encoder-decoder models --- tests/models/test_bart.py | 8 +++-- vllm/attention/backends/utils.py | 3 ++ vllm/attention/ops/paged_attn.py | 6 ++++ vllm/config.py | 4 +-- vllm/model_executor/models/bart.py | 32 +++++++++++++---- vllm/worker/enc_dec_model_runner.py | 39 +++++++++++++++++--- vllm/worker/model_runner.py | 55 +++++++++++++++++++++++++++-- vllm/worker/utils.py | 6 ++-- 8 files changed, 131 insertions(+), 22 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 9bca5a86f1241..9bddcab971ffa 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -4,6 +4,8 @@ """ from typing import List, Optional, Tuple +from transformers import TransfoXLForSequenceClassification + from vllm.utils import is_cpu if not is_cpu(): @@ -35,7 +37,7 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) - @pytest.mark.parametrize("max_tokens", [64]) + @pytest.mark.parametrize("max_tokens", [2048]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) def test_models( @@ -150,9 +152,11 @@ def test_models( # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + #print('test_case_prompts ' + str(test_case_prompts)) + with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) + print('vllm_outputs ' + str(vllm_outputs)) hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index e6b5f820c5fa0..91f7e62a3259e 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -195,6 +195,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], -1 if cuda graph is not used. batch_size: The maybe padded batch size. 
""" + print('In build') for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -206,8 +207,10 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + if use_captured_graph: + print('cuda_graph_pad_size ' + str(cuda_graph_pad_size)) self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) num_decode_tokens = batch_size diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 92023d5b75f5a..37ffc9de12cef 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -116,6 +116,12 @@ def forward_decode( num_seqs, num_heads, head_size = query.shape max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE) + #print('max_seq_len ' + str(max_seq_len)) + #print('max_num_partitions ' + str(max_num_partitions)) + #print('num_seqs ' + str(num_seqs)) + #print('num_heads ' + str(num_heads)) + #print('seq_lens ' + str(seq_lens)) + #print('block_tables ' + str(block_tables)) # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. Also, if the number of diff --git a/vllm/config.py b/vllm/config.py index 809d6370763dc..839aaaafc8d72 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -180,11 +180,11 @@ def __init__( "decoder models.") self.enforce_eager = True - if not self.enforce_eager: + #if not self.enforce_eager: # Eager mode explicitly disabled by user for an encoder/ # decoder model; however CUDAGRAPH + encoder/decoder is # not currently supported - raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH) + # raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH) elif self.enforce_eager is None: # *Only for decoder-only models*, enforce_eager # defaults to False if unset. 
This is intuitive diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 5066e991f9003..ee67dab23fd75 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -353,6 +353,8 @@ def forward( # (afeldman-nm 2024/07/22) TODO: # Need a more efficient solution for q/k/v + #print('decoder_hidden_states.shape() ' + str(decoder_hidden_states.shape)) + #print('encoder_hidden_states.shape() ' + str(encoder_hidden_states.shape)) qkv_dec, _ = self.qkv_proj(decoder_hidden_states) q, _, _ = qkv_dec.split([self.q_size, self.kv_size, self.kv_size], dim=-1) @@ -363,15 +365,18 @@ def forward( qkv_enc, _ = self.qkv_proj(encoder_hidden_states) _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - + #print('In 10') + #print('Cross Block Table shape ' + str(attn_metadata.cross_block_tables.shape)) + #print('Cross Block Table content ' + str(attn_metadata.cross_block_tables)) attn_output = self.attn(q, k, v, kv_cache, attn_metadata, attn_type=AttentionType.ENCODER_DECODER) - + #print('In 11') output, _ = self.out_proj(attn_output) + #print('In 12') return output @@ -528,18 +533,25 @@ def forward( Decoder layer output torch.Tensor """ residual = decoder_hidden_states + #print('In 4') + #print('Decoder Block Table shape ' + str(attn_metadata.block_tables.shape)) + #print('Decoder Block Table content ' + str(attn_metadata.block_tables)) + # Self Attention hidden_states = self.self_attn(hidden_states=decoder_hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata) + #print('In 5') hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) - + #print('In 6') + # Cross-Attention Block residual = hidden_states + #print('In 7') hidden_states = self.encoder_attn( decoder_hidden_states=hidden_states, @@ -547,6 +559,7 @@ def forward( attn_metadata=attn_metadata, encoder_hidden_states=encoder_hidden_states, ) + #print('In 8') hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -561,6 +574,8 @@ def forward( hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) + #print('In 9') + return hidden_states @@ -712,7 +727,7 @@ def forward(self, decoder_input_ids: torch.Tensor, Returns: Decoder output torch.Tensor """ - + #print('In 1') inputs_embeds = self.embed_tokens(decoder_input_ids) # embed positions @@ -725,6 +740,7 @@ def forward(self, decoder_input_ids: torch.Tensor, hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) + #print('In 2') # decoder layers for idx, decoder_layer in enumerate(self.layers): @@ -734,7 +750,7 @@ def forward(self, decoder_input_ids: torch.Tensor, attn_metadata=attn_metadata, encoder_hidden_states=encoder_hidden_states, ) - + #print('In 3') return hidden_states @@ -846,11 +862,13 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - encoder_input_ids: torch.Tensor, - encoder_positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, ) -> torch.Tensor: r""" Args: diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4e66a04674c2a..7720a6b6c1db3 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,6 +1,6 @@ import dataclasses from typing import Any, Dict, 
List, Optional, Tuple, Type, cast - +import itertools import torch import torch.distributed @@ -21,7 +21,7 @@ from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.worker.model_runner import (_PAD_SLOT_ID, GPUModelRunnerBase, ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) + ModelInputForGPUWithSamplingMetadata, _get_graph_batch_size) from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict) @@ -174,7 +174,22 @@ def execute_model( raise ValueError("num_steps > 1 is not supported in " "EncoderDecoderModelRunner") - model_executable = self.model + if (model_input.attn_metadata.prefill_metadata is None and + model_input.attn_metadata.decode_metadata.use_cuda_graph): + print('Executing as cuda graph') + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = self.graph_runners[model_input.virtual_engine][ + graph_batch_size] + print('encoder_seq_lens_tensor.shape ' + str( + model_input.attn_metadata.encoder_seq_lens_tensor.shape)) + + else: + print('Executing without cuda graph') + model_executable = self.model + + + #model_executable = self.model seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, @@ -211,6 +226,7 @@ def make_model_input_from_broadcasted_tensor_dict( attn_backend=self.attn_backend, ) + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -428,10 +444,23 @@ def _prepare_encoder_model_input_tensors( cross_block_table is None) else cross_block_table) # Convert cross-attention block tables to encoder input tensor + if model_input.attn_metadata.decode_metadata.use_cuda_graph: + #max_len = self.max_seq_len_to_capture + max_len = self.get_max_block_per_batch() + batch_size = len(encoder_seq_lens) + graph_batch_size = _get_graph_batch_size(batch_size) + assert graph_batch_size >= batch_size + cuda_graph_pad_size = graph_batch_size - batch_size + print('cuda_graph_pad_size-1' + str(cuda_graph_pad_size)) + cross_block_tables.extend([[] for _ in range(cuda_graph_pad_size)]) + encoder_seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) + + else: + max_len = max(len(block_table) for block_table in cross_block_tables) + cross_block_tables = make_tensor_with_pad( cross_block_tables, - max_len=max( - len(block_table) for block_table in cross_block_tables), + max_len=max_len, pad=0, dtype=torch.int32, device=self.device, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index cfbbb6698cd8a..3b485a86c211d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -641,6 +641,9 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): def _use_captured_graph(self, batch_size: int, max_decode_seq_len: int) -> bool: + print('batch_size ' + str(batch_size)) + print('max_decode_seq_len ' + str(max_decode_seq_len)) + print('max_seq_len_to_capture ' + str(self.runner.max_seq_len_to_capture)) return (self.decode_only and not self.runner.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture) @@ -686,6 +689,7 @@ def build(self) -> ModelInputForGPU: batch_size = len(input_tokens) use_captured_graph = self._use_captured_graph(batch_size, max_decode_seq_len) + #use_captured_graph = True # If cuda graph can be used, pad tensors accordingly. # See `capture_model` API for more details. 
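For context on how the padded batch size used above is chosen, here is a minimal standalone sketch of the rounding that _get_graph_batch_size performs. It is illustrative only: the alignment constant and capture list mirror _BATCH_SIZE_ALIGNMENT / _BATCH_SIZES_TO_CAPTURE in vllm/worker/model_runner.py as assumed at the time of this patch, and the free-standing function is a stand-in for the real helper, not part of this diff.

_BATCH_SIZE_ALIGNMENT = 8
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
]


def get_graph_batch_size(batch_size: int) -> int:
    """Round a runtime decode batch size up to the nearest captured size."""
    if batch_size <= 2:
        return batch_size
    if batch_size <= 4:
        return 4
    return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
            _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)


# Example: a decode step with 6 sequences replays the graph captured for
# batch size 8, so two rows of padding are appended to every per-sequence
# tensor (empty cross block tables, encoder_seq_lens of 1, PAD_SLOT_ID slots).
batch_size = 6
graph_batch_size = get_graph_batch_size(batch_size)
cuda_graph_pad_size = graph_batch_size - batch_size
assert (graph_batch_size, cuda_graph_pad_size) == (8, 2)
assert graph_batch_size in _BATCH_SIZES_TO_CAPTURE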
@@ -697,6 +701,8 @@ def build(self) -> ModelInputForGPU: cuda_graph_pad_size = graph_batch_size - batch_size batch_size = graph_batch_size + #print('cuda_graph_pad_size ' + str(cuda_graph_pad_size)) + #print('use_captured_graph ' + str(use_captured_graph)) # Tokens and positions. if cuda_graph_pad_size: input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) @@ -1209,6 +1215,11 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: slot_mapping.fill_(_PAD_SLOT_ID) seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() + cross_block_tables = torch.from_numpy(np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), + dtype=np.int32)).cuda() + cross_slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda() + cross_slot_mapping.fill_(_PAD_SLOT_ID) intermediate_inputs = None if not get_pp_group().is_first_rank: intermediate_inputs = self.model.make_empty_intermediate_tensors( @@ -1308,6 +1319,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: prefill_wrapper=None) attn_metadata.begin_forward() else: + #print('Cross Block Tables shape ' + str(cross_block_tables.shape)) attn_metadata = self.attn_backend.make_metadata( num_prefills=0, num_prefill_tokens=0, @@ -1322,6 +1334,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: seq_start_loc=None, context_lens_tensor=None, block_tables=block_tables[:batch_size], + cross_slot_mapping=torch.tensor([], dtype=torch.int).cuda(), + cross_block_tables=cross_block_tables[:batch_size], + #cross_block_tables=None, + encoder_seq_lens=torch.full((batch_size,), self.max_seq_len_to_capture, dtype=torch.int).cuda(), + encoder_seq_lens_tensor=torch.full((batch_size,), self.max_seq_len_to_capture, dtype=torch.int).cuda(), + max_encoder_seq_len=self.max_seq_len_to_capture, use_cuda_graph=True, ) @@ -1374,7 +1392,9 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: "memory_pool": self.graph_memory_pool, "stream": - graph_capture_context.stream + graph_capture_context.stream, + "encoder_input_ids": torch.tensor([], dtype=torch.long).cuda(), + "encoder_positions": torch.tensor([], dtype=torch.long).cuda(), } if self.has_seqlen_agnostic: # Only used by Mamba-based models CUDA graph atm (Jamba) @@ -1634,7 +1654,8 @@ def capture( # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). # Note one iteration is not enough for torch.jit.script - for _ in range(_NUM_WARMUP_ITERS): + for i in range(_NUM_WARMUP_ITERS): + #print('Runnng ' + str(i)) self.model( input_ids, positions, @@ -1644,10 +1665,11 @@ def capture( **kwargs, ) torch.cuda.synchronize() - + #print('Done Runnng') # Capture the graph. 
self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): + #print('Here') output_hidden_or_intermediate_states = self.model( input_ids, positions, @@ -1684,6 +1706,8 @@ def capture( **kwargs, } else: + print('encoder_seq_lens_tensor.shape ' + str( + attn_metadata.decode_metadata.encoder_seq_lens_tensor.shape)) self.input_buffers = { "input_ids": input_ids, "positions": positions, @@ -1692,8 +1716,16 @@ def capture( "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, "block_tables": attn_metadata.decode_metadata.block_tables, + "encoder_seq_lens_tensor": attn_metadata.decode_metadata.encoder_seq_lens_tensor, + "cross_slot_mapping": attn_metadata.decode_metadata.cross_slot_mapping, + "cross_block_tables": attn_metadata.decode_metadata.cross_block_tables, **kwargs, } + + #for key, value in self.input_buffers.items(): + # print(f'Key: {key}') + + #print('self.input_buffers.keys() ' + str(self.input_buffers.keys)) if intermediate_inputs is not None: self.input_buffers.update(intermediate_inputs.tensors) if get_pp_group().is_last_rank: @@ -1721,6 +1753,23 @@ def forward( self.input_buffers["positions"].copy_(positions, non_blocking=True) self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, non_blocking=True) + print('encoder_seq_lens_tensor.shape ' + str(attn_metadata.encoder_seq_lens_tensor.shape)) + print('encoder_seq_lens_tensor.shape ' + str(self.input_buffers["encoder_seq_lens_tensor"].shape)) + + self.input_buffers["encoder_seq_lens_tensor"].copy_( + attn_metadata.encoder_seq_lens_tensor, non_blocking=True) + self.input_buffers["cross_slot_mapping"].copy_( + attn_metadata.cross_slot_mapping, non_blocking=True) + print('cross_block_tables.shape ' + str(attn_metadata.cross_block_tables.shape)) + print('cross_block_tables.shape ' + str(self.input_buffers["cross_block_tables"].shape)) + print('block_tables.shape ' + str(attn_metadata.decode_metadata.block_tables.shape)) + print('block_tables.shape ' + str(self.input_buffers["block_tables"].shape)) + + self.input_buffers["cross_block_tables"].copy_( + attn_metadata.cross_block_tables, non_blocking=True) + self.input_buffers["encoder_input_ids"].copy_(kwargs['encoder_input_ids'], non_blocking=True) + self.input_buffers["encoder_positions"].copy_(kwargs['encoder_positions'], non_blocking=True) + if self.backend_name != "flashinfer": self.input_buffers["seq_lens_tensor"].copy_( attn_metadata.decode_metadata.seq_lens_tensor, diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 8df3c8bc5408b..ea61e512c55c8 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -47,9 +47,9 @@ def assert_enc_dec_mr_supported_scenario( raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) - if not enc_dec_mr.model_config.enforce_eager: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH']) + #if not enc_dec_mr.model_config.enforce_eager: + # raise NotImplementedError( + # STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH']) if enc_dec_mr.prompt_adapter_config is not None: raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ From fbc8837992af907b919cb4d87ed0ba15f7a1874e Mon Sep 17 00:00:00 2001 From: sourashis Date: Wed, 21 Aug 2024 07:30:33 +0000 Subject: [PATCH 02/38] Add logic for CUDA Graph capture --- tests/models/test_bart.py | 2 +- vllm/attention/backends/utils.py | 3 - vllm/attention/backends/xformers.py | 1 - vllm/attention/ops/paged_attn.py | 1 + vllm/config.py | 33 
+--- vllm/model_executor/models/bart.py | 23 +-- vllm/worker/enc_dec_model_runner.py | 96 ++++++++--- vllm/worker/model_runner.py | 240 +++++++++++++++++++++------- 8 files changed, 265 insertions(+), 134 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 9bddcab971ffa..fbddaa24d3378 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -153,7 +153,7 @@ def test_models( # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. #print('test_case_prompts ' + str(test_case_prompts)) - with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) print('vllm_outputs ' + str(vllm_outputs)) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 91f7e62a3259e..e6b5f820c5fa0 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -195,7 +195,6 @@ def build(self, seq_lens: List[int], query_lens: List[int], -1 if cuda graph is not used. batch_size: The maybe padded batch size. """ - print('In build') for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -207,10 +206,8 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens - if use_captured_graph: - print('cuda_graph_pad_size ' + str(cuda_graph_pad_size)) self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) num_decode_tokens = batch_size diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 7e36509bff864..5518f2fe1ba4a 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -621,7 +621,6 @@ def forward( output[:num_prefill_tokens] = out if decode_meta := attn_metadata.decode_metadata: - ( seq_lens_arg, max_seq_len_arg, diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 37ffc9de12cef..842d3e39370fd 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -114,6 +114,7 @@ def forward_decode( output = torch.empty_like(query) block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape + print('max_seq_len ' + str(max_seq_len)) max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE) #print('max_seq_len ' + str(max_seq_len)) diff --git a/vllm/config.py b/vllm/config.py index 839aaaafc8d72..89ef94c160e2e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,9 +12,8 @@ from vllm.model_executor.models import ModelRegistry from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config -from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes, - cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_openvino, is_tpu, is_xpu, +from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, + is_cpu, is_hip, is_neuron, is_openvino, is_tpu, is_xpu, print_warning_once) if TYPE_CHECKING: @@ -163,32 +162,8 @@ def __init__( self.hf_text_config = get_hf_text_config(self.hf_config) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) - # 
Choose a default enforce_eager value if the user did not specify - # a value (enforce_eager is None) - if getattr(self.hf_config, 'is_encoder_decoder', False): - if self.enforce_eager is None: - # *Only for encoder/decoder models* and - # *only if enforce_eager is unset*, override - # to enforce_eager=True - # - # Add a logger message since it is *somewhat* non-intuitive that - # enforce_eager is True when the user has not specified its - # value. - logger.info("Forcing enforce_eager == True because " - "enforce_eager setting was unspecified and " - "CUDAGraph is not supported with encoder/ " - "decoder models.") - self.enforce_eager = True - - #if not self.enforce_eager: - # Eager mode explicitly disabled by user for an encoder/ - # decoder model; however CUDAGRAPH + encoder/decoder is - # not currently supported - # raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH) - elif self.enforce_eager is None: - # *Only for decoder-only models*, enforce_eager - # defaults to False if unset. This is intuitive - # so no logging message needed. + # Set enforce_eager to False if the value is unset. + if self.enforce_eager is None: self.enforce_eager = False if (not self.disable_sliding_window diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index ee67dab23fd75..2a80641f6ee9f 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -353,8 +353,6 @@ def forward( # (afeldman-nm 2024/07/22) TODO: # Need a more efficient solution for q/k/v - #print('decoder_hidden_states.shape() ' + str(decoder_hidden_states.shape)) - #print('encoder_hidden_states.shape() ' + str(encoder_hidden_states.shape)) qkv_dec, _ = self.qkv_proj(decoder_hidden_states) q, _, _ = qkv_dec.split([self.q_size, self.kv_size, self.kv_size], dim=-1) @@ -365,18 +363,13 @@ def forward( qkv_enc, _ = self.qkv_proj(encoder_hidden_states) _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - #print('In 10') - #print('Cross Block Table shape ' + str(attn_metadata.cross_block_tables.shape)) - #print('Cross Block Table content ' + str(attn_metadata.cross_block_tables)) attn_output = self.attn(q, k, v, kv_cache, attn_metadata, attn_type=AttentionType.ENCODER_DECODER) - #print('In 11') output, _ = self.out_proj(attn_output) - #print('In 12') return output @@ -533,25 +526,17 @@ def forward( Decoder layer output torch.Tensor """ residual = decoder_hidden_states - #print('In 4') - #print('Decoder Block Table shape ' + str(attn_metadata.block_tables.shape)) - #print('Decoder Block Table content ' + str(attn_metadata.block_tables)) - - # Self Attention hidden_states = self.self_attn(hidden_states=decoder_hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata) - #print('In 5') hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) - #print('In 6') - + # Cross-Attention Block residual = hidden_states - #print('In 7') hidden_states = self.encoder_attn( decoder_hidden_states=hidden_states, @@ -559,7 +544,6 @@ def forward( attn_metadata=attn_metadata, encoder_hidden_states=encoder_hidden_states, ) - #print('In 8') hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -574,8 +558,6 @@ def forward( hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) - #print('In 9') - return hidden_states @@ -727,7 +709,6 @@ def forward(self, decoder_input_ids: torch.Tensor, Returns: Decoder output torch.Tensor """ - #print('In 1') inputs_embeds = 
self.embed_tokens(decoder_input_ids) # embed positions @@ -740,7 +721,6 @@ def forward(self, decoder_input_ids: torch.Tensor, hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) - #print('In 2') # decoder layers for idx, decoder_layer in enumerate(self.layers): @@ -750,7 +730,6 @@ def forward(self, decoder_input_ids: torch.Tensor, attn_metadata=attn_metadata, encoder_hidden_states=encoder_hidden_states, ) - #print('In 3') return hidden_states diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 7720a6b6c1db3..bf42d0db763e6 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -3,6 +3,7 @@ import itertools import torch import torch.distributed +import torch.nn as nn from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) @@ -19,17 +20,73 @@ from vllm.sequence import (IntermediateTensors, PoolerOutput, SamplerOutput, SequenceGroupMetadata) from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad -from vllm.worker.model_runner import (_PAD_SLOT_ID, GPUModelRunnerBase, +from vllm.worker.model_runner import (_PAD_SLOT_ID, CUDAGraphRunner, + GPUModelRunnerBase, ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata, _get_graph_batch_size) + ModelInputForGPUWithSamplingMetadata, + _get_graph_batch_size) from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) + _add_sampling_metadata_broadcastable_dict, +) from vllm.worker.utils import assert_enc_dec_mr_supported_scenario logger = init_logger(__name__) +class EncoderDecoderCUDAGraphRunner(CUDAGraphRunner): + + def __init__(self, model: nn.Module, backend_name: str): + super().__init__(model=model, backend_name=backend_name) + + def _save_input_buffers( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_inputs: Optional[IntermediateTensors], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ): + super()._save_input_buffers(input_ids=input_ids, + positions=positions, + intermediate_inputs=intermediate_inputs, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + **kwargs) + self.input_buffers["encoder_seq_lens_tensor"] = ( + attn_metadata.decode_metadata.encoder_seq_lens_tensor) + self.input_buffers["cross_slot_mapping"] = ( + attn_metadata.decode_metadata.cross_slot_mapping) + self.input_buffers["cross_block_tables"] = ( + attn_metadata.decode_metadata.cross_block_tables) + + def _populate_input_buffers_from_model_input( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + **kwargs, + ): + super()._populate_input_buffers_from_model_input( + input_ids=input_ids, + positions=positions, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + **kwargs) + self.input_buffers["encoder_seq_lens_tensor"].copy_( + attn_metadata.encoder_seq_lens_tensor, non_blocking=True) + self.input_buffers["cross_slot_mapping"].copy_( + attn_metadata.cross_slot_mapping, non_blocking=True) + self.input_buffers["cross_block_tables"].copy_( + attn_metadata.cross_block_tables, non_blocking=True) + self.input_buffers["encoder_input_ids"].copy_( + kwargs['encoder_input_ids'], non_blocking=True) + self.input_buffers["encoder_positions"].copy_( + kwargs['encoder_positions'], non_blocking=True) + + @dataclasses.dataclass(frozen=True) class 
EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): """ @@ -174,23 +231,18 @@ def execute_model( raise ValueError("num_steps > 1 is not supported in " "EncoderDecoderModelRunner") - if (model_input.attn_metadata.prefill_metadata is None and - model_input.attn_metadata.decode_metadata.use_cuda_graph): + if (model_input.attn_metadata is not None + and model_input.attn_metadata.prefill_metadata is None + and model_input.attn_metadata.decode_metadata.use_cuda_graph): print('Executing as cuda graph') assert model_input.input_tokens is not None graph_batch_size = model_input.input_tokens.shape[0] - model_executable = self.graph_runners[model_input.virtual_engine][ - graph_batch_size] - print('encoder_seq_lens_tensor.shape ' + str( - model_input.attn_metadata.encoder_seq_lens_tensor.shape)) - + model_executable = self.graph_runners[ + model_input.virtual_engine][graph_batch_size] else: print('Executing without cuda graph') model_executable = self.model - - #model_executable = self.model - seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, @@ -226,7 +278,6 @@ def make_model_input_from_broadcasted_tensor_dict( attn_backend=self.attn_backend, ) - def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -250,6 +301,8 @@ def prepare_model_input( encoder_input_positions_tensor, ) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input)) + print('attn_metadata.max_encoder_seq_len 1 ' + + str(attn_metadata.max_encoder_seq_len)) # Inject attn_metadata encoder/cross-attention fields & # encoder input tokens/positions into model_input. @@ -444,19 +497,22 @@ def _prepare_encoder_model_input_tensors( cross_block_table is None) else cross_block_table) # Convert cross-attention block tables to encoder input tensor - if model_input.attn_metadata.decode_metadata.use_cuda_graph: - #max_len = self.max_seq_len_to_capture + if (model_input.attn_metadata is not None + and model_input.attn_metadata.use_cuda_graph): max_len = self.get_max_block_per_batch() batch_size = len(encoder_seq_lens) graph_batch_size = _get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size cuda_graph_pad_size = graph_batch_size - batch_size - print('cuda_graph_pad_size-1' + str(cuda_graph_pad_size)) - cross_block_tables.extend([[] for _ in range(cuda_graph_pad_size)]) - encoder_seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) + cross_block_tables.extend([[] + for _ in range(cuda_graph_pad_size) + ]) + encoder_seq_lens.extend( + itertools.repeat(1, cuda_graph_pad_size)) else: - max_len = max(len(block_table) for block_table in cross_block_tables) + max_len = max( + len(block_table) for block_table in cross_block_tables) cross_block_tables = make_tensor_with_pad( cross_block_tables, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3b485a86c211d..a5f03119b31d0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -237,6 +237,7 @@ def __init__( prefix_cache_hit: bool = False, reinit: bool = False, reinit_use_defaults: bool = False, + encoder_seq_len: int = 0, ): if reinit: assert len(self.seq_ids) == len(seq_ids) # type: ignore @@ -250,6 +251,7 @@ def __init__( self.block_tables = block_tables self.computed_block_nums = computed_block_nums self.n_seqs = n_seqs + self.encoder_seq_len = encoder_seq_len if reinit: if len(self.seq_ids) == 1 and reinit_use_defaults: @@ -622,6 +624,11 @@ def add_seq_group(self, 
seq_group_metadata: SequenceGroupMetadata): assert n_seqs == 1 self.decode_only = False + encoder_seq_len = 0 + + if self.runner.model_config.is_encoder_decoder_model: + encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() + inter_data = self.init_cached_inter_data( request_id=seq_group_metadata.request_id, seq_ids=seq_ids, @@ -629,7 +636,8 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): block_tables=seq_group_metadata.block_tables, computed_block_nums=seq_group_metadata.computed_block_nums, reinit=True, - reinit_use_defaults=True) + reinit_use_defaults=True, + encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) @@ -639,14 +647,18 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): for per_seq_group_fn in self.per_seq_group_compute_fns: per_seq_group_fn(inter_data, seq_group_metadata) - def _use_captured_graph(self, batch_size: int, - max_decode_seq_len: int) -> bool: + def _use_captured_graph(self, + batch_size: int, + max_decode_seq_len: int, + max_encoder_seq_len: int = 0) -> bool: print('batch_size ' + str(batch_size)) print('max_decode_seq_len ' + str(max_decode_seq_len)) - print('max_seq_len_to_capture ' + str(self.runner.max_seq_len_to_capture)) + print('max_seq_len_to_capture ' + + str(self.runner.max_seq_len_to_capture)) return (self.decode_only and not self.runner.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_decode_seq_len <= self.runner.max_seq_len_to_capture) + and max_decode_seq_len <= self.runner.max_seq_len_to_capture + and max_encoder_seq_len <= self.runner.max_seq_len_to_capture) def build(self) -> ModelInputForGPU: """Finalize the builder intermediate data and @@ -669,15 +681,18 @@ def build(self) -> ModelInputForGPU: input_positions.extend(cur_input_positions) seq_lens = [] + query_lens = [] max_decode_seq_len = 0 + max_encoder_seq_len = 0 for inter_data in self.inter_data_list: seq_lens.extend(inter_data.seq_lens) + query_lens.extend(inter_data.query_lens) if not inter_data.is_prompt: max_decode_seq_len = max(max_decode_seq_len, max(inter_data.seq_lens)) - query_lens = [] - for inter_data in self.inter_data_list: - query_lens.extend(inter_data.query_lens) + if self.runner.model_config.is_encoder_decoder_model: + max_encoder_seq_len = max(max_encoder_seq_len, + inter_data.encoder_seq_len) # Mapping from request IDs to sequence IDs. Used for Jamba models # that manages the cache by itself. @@ -687,8 +702,10 @@ def build(self) -> ModelInputForGPU: } batch_size = len(input_tokens) - use_captured_graph = self._use_captured_graph(batch_size, - max_decode_seq_len) + use_captured_graph = self._use_captured_graph( + batch_size, + max_decode_seq_len, + max_encoder_seq_len=max_encoder_seq_len) #use_captured_graph = True # If cuda graph can be used, pad tensors accordingly. 
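The _use_captured_graph change above adds the encoder sequence length as a third length dimension that must fit the captured graph. A hedged, free-standing sketch of that decision follows; the flat function signature and the _MAX_CAPTURED_BATCH_SIZE constant are assumptions made for readability, the real code is a method on the model-input builder.

_MAX_CAPTURED_BATCH_SIZE = 256  # assumed: last entry of _BATCH_SIZES_TO_CAPTURE


def use_captured_graph(decode_only: bool, enforce_eager: bool,
                       batch_size: int, max_decode_seq_len: int,
                       max_encoder_seq_len: int,
                       max_seq_len_to_capture: int) -> bool:
    # Replay is only safe when the batch is pure decode, eager mode is not
    # forced, and every sequence-length dimension fits the captured graph.
    return (decode_only and not enforce_eager
            and batch_size <= _MAX_CAPTURED_BATCH_SIZE
            and max_decode_seq_len <= max_seq_len_to_capture
            and max_encoder_seq_len <= max_seq_len_to_capture)


# An encoder/decoder batch whose encoder prompt exceeds max_seq_len_to_capture
# falls back to eager execution even though the decoder lengths would fit.
assert use_captured_graph(True, False, 8, 100, 1024, 4096)
assert not use_captured_graph(True, False, 8, 100, 8192, 4096)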
@@ -853,6 +870,7 @@ def __init__( self.graph_block_tables = np.zeros( (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), dtype=np.int32) + num_attn_heads = self.model_config.get_num_attention_heads( self.parallel_config) self.attn_backend = get_attn_backend( @@ -1215,11 +1233,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: slot_mapping.fill_(_PAD_SLOT_ID) seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() block_tables = torch.from_numpy(self.graph_block_tables).cuda() - cross_block_tables = torch.from_numpy(np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), - dtype=np.int32)).cuda() - cross_slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda() - cross_slot_mapping.fill_(_PAD_SLOT_ID) intermediate_inputs = None if not get_pp_group().is_first_rank: intermediate_inputs = self.model.make_empty_intermediate_tensors( @@ -1319,7 +1332,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: prefill_wrapper=None) attn_metadata.begin_forward() else: - #print('Cross Block Tables shape ' + str(cross_block_tables.shape)) attn_metadata = self.attn_backend.make_metadata( num_prefills=0, num_prefill_tokens=0, @@ -1334,14 +1346,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: seq_start_loc=None, context_lens_tensor=None, block_tables=block_tables[:batch_size], - cross_slot_mapping=torch.tensor([], dtype=torch.int).cuda(), - cross_block_tables=cross_block_tables[:batch_size], - #cross_block_tables=None, - encoder_seq_lens=torch.full((batch_size,), self.max_seq_len_to_capture, dtype=torch.int).cuda(), - encoder_seq_lens_tensor=torch.full((batch_size,), self.max_seq_len_to_capture, dtype=torch.int).cuda(), - max_encoder_seq_len=self.max_seq_len_to_capture, use_cuda_graph=True, ) + if self.model_config.is_encoder_decoder_model: + # Add the other attention metadata params needed for + # encoder-decoder specific models. + self._update_captured_attn_metadata_for_enc_dec_model( + batch_size, attn_metadata, + self.attn_backend.get_name()) if self.lora_config: lora_mapping = LoRAMapping( @@ -1359,7 +1371,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: set(), prompt_adapter_mapping) graph_runner = CUDAGraphRunner( - self.model, self.attn_backend.get_name()) + self.model, self.attn_backend.get_name(), + self.model_config.is_encoder_decoder_model) if self.attn_backend.get_name() == "flashinfer": graph_runner.flashinfer_indptr_buffer = _indptr_buffer @@ -1370,7 +1383,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: decode_workspace_buffer graph_runner.flashinfer_decode_wrapper = \ decode_wrapper - capture_inputs = { "input_ids": input_tokens[:batch_size], @@ -1392,9 +1404,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: "memory_pool": self.graph_memory_pool, "stream": - graph_capture_context.stream, - "encoder_input_ids": torch.tensor([], dtype=torch.long).cuda(), - "encoder_positions": torch.tensor([], dtype=torch.long).cuda(), + graph_capture_context.stream } if self.has_seqlen_agnostic: # Only used by Mamba-based models CUDA graph atm (Jamba) @@ -1403,6 +1413,11 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model.get_seqlen_agnostic_capture_inputs( batch_size) }) + if self.model_config.is_encoder_decoder_model: + # add the additional inputs to capture for encoder-decoder models. 
+ self._update_inputs_to_capture_for_enc_dec_model( + capture_inputs) + graph_runner.capture(**capture_inputs) self.graph_memory_pool = graph_runner.graph.pool() self.graph_runners[virtual_engine][batch_size] = ( @@ -1413,6 +1428,52 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: # This usually takes < 10 seconds. logger.info("Graph capturing finished in %.0f secs.", elapsed_time) + def _update_inputs_to_capture_for_enc_dec_model(self, + capture_inputs: Dict[str, + Any]): + capture_inputs["encoder_input_ids"] = torch.tensor( + [], dtype=torch.long).cuda() + capture_inputs["encoder_positions"] = torch.tensor( + [], dtype=torch.long).cuda() + + def _update_captured_attn_metadata_for_enc_dec_model( + self, batch_size: int, attn_metadata: AttentionMetadata, + attn_backend_name: str): + """ + Updates the attention metadata parameters for CUDA graph capture in an + encoder-decoder model. + + This method modifies attention-related tensors and metadata required + for CUDA graph capture in encoder-decoder models. Specifically, it + updates the cross-attention and encoder sequence tensors in the + AttentionMetadata object. + + Args: + batch_size (int): The size of the batch for which CUDA graph + capture is being performed. + attn_metadata (AttentionMetadata): The AttentionMetadata object to + be updated with encoder-decoder specific parameters. + """ + # Ensure that attn_metadata is of type XFormersMetadata + assert (attn_backend_name == ''), \ + f"Expected attn_metadata to be of type XFormersMetadata, but got {type(attn_metadata).__name__}" + # During decode phase the cross_slot_mapping will be empty. Hence set + # an empty tensor for CUDA Graph capture. + attn_metadata.cross_slot_mapping = torch.tensor( + [], dtype=torch.int).cuda() + cross_block_tables = torch.from_numpy( + np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), + dtype=np.int32)).cuda() + attn_metadata.cross_block_tables = cross_block_tables[:batch_size] + attn_metadata.encoder_seq_lens = torch.full( + (batch_size, ), self.max_seq_len_to_capture, + dtype=torch.int).cuda() + attn_metadata.encoder_seq_lens_tensor = torch.full( + (batch_size, ), self.max_seq_len_to_capture, + dtype=torch.int).cuda() + attn_metadata.max_encoder_seq_len = self.max_seq_len_to_capture + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -1615,7 +1676,10 @@ def execute_model( class CUDAGraphRunner: - def __init__(self, model: nn.Module, backend_name: str): + def __init__(self, + model: nn.Module, + backend_name: str, + is_encoder_decoder_model: bool = False): self.model = model self.backend_name = backend_name @@ -1630,6 +1694,7 @@ def __init__(self, model: nn.Module, backend_name: str): self.flashinfer_last_page_len_buffer: Optional[torch.Tensor] = None self.flashinfer_decode_wrapper: Optional[ CUDAGraphBatchDecodeWithPagedKVCacheWrapper] = None + self._is_encoder_decoder_model = is_encoder_decoder_model @property def graph(self): @@ -1655,7 +1720,6 @@ def capture( # kernel launches for initial benchmarking (e.g., Triton autotune). # Note one iteration is not enough for torch.jit.script for i in range(_NUM_WARMUP_ITERS): - #print('Runnng ' + str(i)) self.model( input_ids, positions, @@ -1665,11 +1729,8 @@ def capture( **kwargs, ) torch.cuda.synchronize() - #print('Done Runnng') - # Capture the graph. 
self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - #print('Here') output_hidden_or_intermediate_states = self.model( input_ids, positions, @@ -1706,8 +1767,6 @@ def capture( **kwargs, } else: - print('encoder_seq_lens_tensor.shape ' + str( - attn_metadata.decode_metadata.encoder_seq_lens_tensor.shape)) self.input_buffers = { "input_ids": input_ids, "positions": positions, @@ -1716,18 +1775,15 @@ def capture( "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, "block_tables": attn_metadata.decode_metadata.block_tables, - "encoder_seq_lens_tensor": attn_metadata.decode_metadata.encoder_seq_lens_tensor, - "cross_slot_mapping": attn_metadata.decode_metadata.cross_slot_mapping, - "cross_block_tables": attn_metadata.decode_metadata.cross_block_tables, **kwargs, } - - #for key, value in self.input_buffers.items(): - # print(f'Key: {key}') - - #print('self.input_buffers.keys() ' + str(self.input_buffers.keys)) + if intermediate_inputs is not None: self.input_buffers.update(intermediate_inputs.tensors) + + if self._is_encoder_decoder_model: + self._update_input_buffers_for_enc_dec_model(attn_metadata) + if get_pp_group().is_last_rank: self.output_buffers = { "hidden_states": hidden_or_intermediate_states @@ -1753,22 +1809,6 @@ def forward( self.input_buffers["positions"].copy_(positions, non_blocking=True) self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, non_blocking=True) - print('encoder_seq_lens_tensor.shape ' + str(attn_metadata.encoder_seq_lens_tensor.shape)) - print('encoder_seq_lens_tensor.shape ' + str(self.input_buffers["encoder_seq_lens_tensor"].shape)) - - self.input_buffers["encoder_seq_lens_tensor"].copy_( - attn_metadata.encoder_seq_lens_tensor, non_blocking=True) - self.input_buffers["cross_slot_mapping"].copy_( - attn_metadata.cross_slot_mapping, non_blocking=True) - print('cross_block_tables.shape ' + str(attn_metadata.cross_block_tables.shape)) - print('cross_block_tables.shape ' + str(self.input_buffers["cross_block_tables"].shape)) - print('block_tables.shape ' + str(attn_metadata.decode_metadata.block_tables.shape)) - print('block_tables.shape ' + str(self.input_buffers["block_tables"].shape)) - - self.input_buffers["cross_block_tables"].copy_( - attn_metadata.cross_block_tables, non_blocking=True) - self.input_buffers["encoder_input_ids"].copy_(kwargs['encoder_input_ids'], non_blocking=True) - self.input_buffers["encoder_positions"].copy_(kwargs['encoder_positions'], non_blocking=True) if self.backend_name != "flashinfer": self.input_buffers["seq_lens_tensor"].copy_( @@ -1784,6 +1824,9 @@ def forward( if key != "model_execute_time": self.input_buffers[key].copy_(intermediate_tensors[key], non_blocking=True) + if self._is_encoder_decoder_model: + self._populate_input_buffers_from_enc_dec_model_input( + attn_metadata, **kwargs) # Run the graph. self.graph.replay() # Return the output tensor. 
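The capture/replay flow above follows the standard PyTorch CUDA graph pattern: warm up, capture into static buffers, then copy fresh inputs into those same buffers before every replay. A self-contained, generic example of that pattern is sketched below; it uses plain PyTorch on an assumed matmul workload and is not vLLM code.

import torch


def make_graphed_matmul(weight: torch.Tensor, batch_size: int):
    """Capture y = x @ weight.T once, then serve it via graph replay."""
    static_x = torch.zeros(batch_size, weight.shape[1], device="cuda")
    static_y = torch.zeros(batch_size, weight.shape[0], device="cuda")

    # Warm-up iterations on a side stream so one-time kernel work
    # (autotuning, lazy inits) is not baked into the graph, mirroring the
    # _NUM_WARMUP_ITERS loop in CUDAGraphRunner.capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            torch.matmul(static_x, weight.t(), out=static_y)
    torch.cuda.current_stream().wait_stream(s)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        torch.matmul(static_x, weight.t(), out=static_y)

    def run(x: torch.Tensor) -> torch.Tensor:
        # Replay re-executes the captured kernels on whatever data is sitting
        # in static_x, so new inputs must be copied into that same buffer
        # first. This is why CUDAGraphRunner.forward copies runtime tensors
        # into self.input_buffers before calling self.graph.replay().
        static_x.copy_(x, non_blocking=True)
        graph.replay()
        return static_y

    return run


if torch.cuda.is_available():
    w = torch.randn(16, 32, device="cuda")
    graphed = make_graphed_matmul(w, batch_size=8)
    x = torch.randn(8, 32, device="cuda")
    assert torch.allclose(graphed(x), x @ w.t(), atol=1e-5)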
@@ -1792,6 +1835,87 @@ def forward( return self.output_buffers + def _update_input_buffers_for_enc_dec_model( + self, attn_metadata: AttentionMetadata): + self.input_buffers["encoder_seq_lens_tensor"] = ( + attn_metadata.decode_metadata.encoder_seq_lens_tensor) + self.input_buffers["cross_slot_mapping"] = ( + attn_metadata.decode_metadata.cross_slot_mapping) + self.input_buffers["cross_block_tables"] = ( + attn_metadata.decode_metadata.cross_block_tables) + + def _populate_input_buffers_from_enc_dec_model_input( + self, attn_metadata: AttentionMetadata, **kwargs): + self.input_buffers["encoder_seq_lens_tensor"].copy_( + attn_metadata.encoder_seq_lens_tensor, non_blocking=True) + self.input_buffers["cross_slot_mapping"].copy_( + attn_metadata.cross_slot_mapping, non_blocking=True) + self.input_buffers["cross_block_tables"].copy_( + attn_metadata.cross_block_tables, non_blocking=True) + self.input_buffers["encoder_input_ids"].copy_( + kwargs['encoder_input_ids'], non_blocking=True) + self.input_buffers["encoder_positions"].copy_( + kwargs['encoder_positions'], non_blocking=True) + + def _save_input_buffers( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_inputs: Optional[IntermediateTensors], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ): + # Save the input and output buffers. + if self.backend_name == "flashinfer": + self.input_buffers = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "slot_mapping": attn_metadata.slot_mapping, + **kwargs, + } + else: + self.input_buffers = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "slot_mapping": attn_metadata.slot_mapping, + "seq_lens_tensor": + attn_metadata.decode_metadata.seq_lens_tensor, + "block_tables": attn_metadata.decode_metadata.block_tables, + **kwargs, + } + if intermediate_inputs is not None: + self.input_buffers.update(intermediate_inputs.tensors) + + def _populate_input_buffers_from_model_input( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + **kwargs, + ): + self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) + self.input_buffers["positions"].copy_(positions, non_blocking=True) + self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, + non_blocking=True) + if self.backend_name != "flashinfer": + self.input_buffers["seq_lens_tensor"].copy_( + attn_metadata.decode_metadata.seq_lens_tensor, + non_blocking=True) + self.input_buffers["block_tables"].copy_( + attn_metadata.decode_metadata.block_tables, non_blocking=True) + if "seqlen_agnostic_capture_inputs" in self.input_buffers: + self.model.copy_inputs_before_cuda_graphs(self.input_buffers, + **kwargs) + if intermediate_tensors is not None: + for key in intermediate_tensors.tensors: + if key != "model_execute_time": + self.input_buffers[key].copy_(intermediate_tensors[key], + non_blocking=True) + def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) From b3b4e4ab40a5ef16a0758bb8204d32c4c4d284c0 Mon Sep 17 00:00:00 2001 From: sourashis Date: Wed, 21 Aug 2024 07:32:38 +0000 Subject: [PATCH 03/38] Remove extra line --- vllm/attention/backends/xformers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 5518f2fe1ba4a..7e36509bff864 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ 
-621,6 +621,7 @@ def forward( output[:num_prefill_tokens] = out if decode_meta := attn_metadata.decode_metadata: + ( seq_lens_arg, max_seq_len_arg, From bc9c32ee6f4cdddeb7f41005b6a56943df74612c Mon Sep 17 00:00:00 2001 From: sourashis Date: Wed, 21 Aug 2024 07:33:58 +0000 Subject: [PATCH 04/38] Remove debugs --- vllm/attention/ops/paged_attn.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 842d3e39370fd..92023d5b75f5a 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -114,15 +114,8 @@ def forward_decode( output = torch.empty_like(query) block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape - print('max_seq_len ' + str(max_seq_len)) max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE) - #print('max_seq_len ' + str(max_seq_len)) - #print('max_num_partitions ' + str(max_num_partitions)) - #print('num_seqs ' + str(num_seqs)) - #print('num_heads ' + str(num_heads)) - #print('seq_lens ' + str(seq_lens)) - #print('block_tables ' + str(block_tables)) # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. Also, if the number of From 33342fbb8e843ecfee94cfddf6d0ed61ef73ee78 Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 22 Aug 2024 03:26:57 +0000 Subject: [PATCH 05/38] Add comments --- tests/models/test_bart.py | 132 +++++++++++++++++- .../test_encoder_decoder_model_runner.py | 18 ++- vllm/worker/enc_dec_model_runner.py | 63 +-------- vllm/worker/model_runner.py | 128 ++++++++--------- 4 files changed, 208 insertions(+), 133 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index fbddaa24d3378..cfb60caf31d25 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -152,11 +152,137 @@ def test_models( # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. - #print('test_case_prompts ' + str(test_case_prompts)) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) + + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) + @pytest.mark.parametrize("dtype", ["bfloat16"]) + @pytest.mark.parametrize("max_tokens", [2048]) + @pytest.mark.parametrize("num_logprobs", [5]) + @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) + def test_model_with_cuda_graph( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, + decoder_prompt_type: DecoderPromptType, + ) -> None: + ''' + Test the vLLM BART model for a variety of encoder/decoder input prompts, + by validating it against HuggingFace (HF) BART. 
+
+    Arguments:
+
+    * hf_runner: HuggingFace (HF) test model runner
+    * vllm_runner: vLLM test model runner
+    * example_encoder_decoder_prompts: test fixture which provides a
+      dictionary of dummy prompts
+    * model: the HF ID of the specific BART variant under test
+    * dtype: the tensor datatype to employ
+    * max_tokens
+    * num_logprobs
+    * decoder_prompt_type: key into the example_encoder_decoder_prompts
+      dictionary; selects specific encoder/decoder
+      prompt scenarios to test
+
+    A note on using HF BART as a baseline for validating vLLM BART,
+    specifically when the decoder prompt is None.
+
+    The HF GenerationMixin's default behavior is to force the first
+    decoded token to be <BOS> if the prompt does not already contain
+    <BOS> (this is accomplished using a logit
+    processor setting.)
+
+    So when we use HF BART as our baseline for comparison, note that
+    when the user provides a request with a None decoder prompt
+    (i.e. a singleton encoder prompt, or else an explicit encoder/
+    decoder prompt with the decoder sub-prompt set to None), HF and
+    vLLM handle this in different ways:
+
+    * HF will (1) tokenize the None prompt as an empty token-list,
+      (2) append <decoder-start-token> to the beginning, yielding
+      [<decoder-start-token>], (3) pass this token list to the model, and
+      then (4) after computing logits during prefill, override the model
+      logits & force <BOS> to be the first generated token.
+
+    * vLLM will (1) tokenize the None prompt as [], (2) append decoder-
+      start-token to the beginning, yielding [<decoder-start-token>],
+      (3) pass these tokens to the model & proceed with generation.
+
+    The net effect is that compared to vLLM, the list of HF *decoded* tokens
+    will contain one more initial <BOS> than the vLLM generated tokens,
+    because vLLM's <BOS> token is injected into the prompt rather than into
+    the generated output. This is in spite of the fact that overall, the
+    complete sequences (prompt + decoded tokens) produced by vLLM will match
+    HF.
+
+    So when we use HF decoded token output to validate vLLM's decoded token
+    output, the testing process must account for the difference in decoded
+    token sequences between vLLM and HF specifically in the
+    decoder-prompt-is-None case.
+
+    One option is to disable the logit processor feature that forces the
+    <BOS> token to be decoded (forced_bos_token_id = None), eliminating
+    the problem entirely. However this is not "normal" BART usage.
+
+    The other option is - only in the decoder-prompt-is-None case - to
+    discard the first decoded token from the HF output before comparing it
+    to vLLM.
+
+    To that end, when testing the scenario where the decoder prompt is None
+    (and only in that one scenario), this test skips the first HF decoded
+    token during the process of validating the vLLM decoded output.
+ ''' + + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + is_encoder_decoder_model=True) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + with vllm_runner(model, dtype=dtype, + enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) - print('vllm_outputs ' + str(vllm_outputs)) hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 8a2e9b81580fc..9f8bf6b7b4c05 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -11,9 +11,10 @@ # CUDA graph scenarios to test # # Currently CUDA graph is not supported -ENFORCE_EAGER = [True] +ENFORCE_EAGER = [True, False] -BATCH_SIZES = [1, 4, 16, 64, 256] +#BATCH_SIZES = [1, 4, 6, 16, 64, 256] +BATCH_SIZES = [6] def _create_model_runner(model: str, *args, @@ -349,6 +350,13 @@ def test_prepare_decode( encoder_input_tokens = model_input.encoder_input_tokens encoder_input_positions = model_input.encoder_input_positions cross_slot_mapping = attn_metadata.cross_slot_mapping + + # If CUDA Graph is enabled then tensors will be padded + # to fix length. Fix the expected tensors. + if enforce_eager is False: + + + assert return_seq_lens == seq_lens assert len(slot_mapping) == len(input_tokens) assert len(cross_slot_mapping) == len(encoder_input_tokens) @@ -422,8 +430,10 @@ def test_prepare_decode( expected, ) - # Cuda graph should is currently not supported for encoder/decoer. 
- assert attn_metadata.use_cuda_graph is False + # Verify CUDA Graph related setting + assert attn_metadata.use_cuda_graph is not enforce_eager + + # Verify the lengths of input tokens & positions # - Decoder diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index bf42d0db763e6..c10f6a7be9705 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -34,59 +34,6 @@ logger = init_logger(__name__) -class EncoderDecoderCUDAGraphRunner(CUDAGraphRunner): - - def __init__(self, model: nn.Module, backend_name: str): - super().__init__(model=model, backend_name=backend_name) - - def _save_input_buffers( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - **kwargs, - ): - super()._save_input_buffers(input_ids=input_ids, - positions=positions, - intermediate_inputs=intermediate_inputs, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - **kwargs) - self.input_buffers["encoder_seq_lens_tensor"] = ( - attn_metadata.decode_metadata.encoder_seq_lens_tensor) - self.input_buffers["cross_slot_mapping"] = ( - attn_metadata.decode_metadata.cross_slot_mapping) - self.input_buffers["cross_block_tables"] = ( - attn_metadata.decode_metadata.cross_block_tables) - - def _populate_input_buffers_from_model_input( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - **kwargs, - ): - super()._populate_input_buffers_from_model_input( - input_ids=input_ids, - positions=positions, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - **kwargs) - self.input_buffers["encoder_seq_lens_tensor"].copy_( - attn_metadata.encoder_seq_lens_tensor, non_blocking=True) - self.input_buffers["cross_slot_mapping"].copy_( - attn_metadata.cross_slot_mapping, non_blocking=True) - self.input_buffers["cross_block_tables"].copy_( - attn_metadata.cross_block_tables, non_blocking=True) - self.input_buffers["encoder_input_ids"].copy_( - kwargs['encoder_input_ids'], non_blocking=True) - self.input_buffers["encoder_positions"].copy_( - kwargs['encoder_positions'], non_blocking=True) - - @dataclasses.dataclass(frozen=True) class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): """ @@ -496,14 +443,16 @@ def _prepare_encoder_model_input_tensors( cross_block_tables.append([] if ( cross_block_table is None) else cross_block_table) - # Convert cross-attention block tables to encoder input tensor if (model_input.attn_metadata is not None and model_input.attn_metadata.use_cuda_graph): - max_len = self.get_max_block_per_batch() + # We will be using CUDA graph replay for this decode. + max_len_of_block_table = self.get_max_block_per_batch() batch_size = len(encoder_seq_lens) graph_batch_size = _get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size cuda_graph_pad_size = graph_batch_size - batch_size + # extend the cross_block_tables and encoder_seq_lens to match + # the graph_batch_size. 
cross_block_tables.extend([[] for _ in range(cuda_graph_pad_size) ]) @@ -511,12 +460,12 @@ def _prepare_encoder_model_input_tensors( itertools.repeat(1, cuda_graph_pad_size)) else: - max_len = max( + max_len_of_block_table = max( len(block_table) for block_table in cross_block_tables) cross_block_tables = make_tensor_with_pad( cross_block_tables, - max_len=max_len, + max_len=max_len_of_block_table, pad=0, dtype=torch.int32, device=self.device, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a5f03119b31d0..b4d95d4b078b6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1351,6 +1351,9 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: if self.model_config.is_encoder_decoder_model: # Add the other attention metadata params needed for # encoder-decoder specific models. + assert self.attn_backend.get_name() == "xformers", \ + f"Encoder Decoder models only work with xformers backend. " \ + "The current backend is set to '{self.attn_backend.get_name()}'" self._update_captured_attn_metadata_for_enc_dec_model( batch_size, attn_metadata, self.attn_backend.get_name()) @@ -1431,6 +1434,20 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, Any]): + """ + Updates the set of input tensors needed for CUDA graph capture in an + encoder-decoder model. + + This method modifies the provided `capture_inputs` dictionary by adding + tensors specific to encoder-decoder specific models that need to be captured + for CUDA Graph replay. + + Args: + capture_inputs (Dict[str, Any]): A dictionary where input tensors are stored + for CUDA Graph capture. + """ + # During the decode phase encoder_input_ids and encoder_positions are unset. + # Do the same thing for graph capture. capture_inputs["encoder_input_ids"] = torch.tensor( [], dtype=torch.long).cuda() capture_inputs["encoder_positions"] = torch.tensor( @@ -1454,9 +1471,6 @@ def _update_captured_attn_metadata_for_enc_dec_model( attn_metadata (AttentionMetadata): The AttentionMetadata object to be updated with encoder-decoder specific parameters. """ - # Ensure that attn_metadata is of type XFormersMetadata - assert (attn_backend_name == ''), \ - f"Expected attn_metadata to be of type XFormersMetadata, but got {type(attn_metadata).__name__}" # During decode phase the cross_slot_mapping will be empty. Hence set # an empty tensor for CUDA Graph capture. 
attn_metadata.cross_slot_mapping = torch.tensor( @@ -1467,10 +1481,10 @@ def _update_captured_attn_metadata_for_enc_dec_model( dtype=np.int32)).cuda() attn_metadata.cross_block_tables = cross_block_tables[:batch_size] attn_metadata.encoder_seq_lens = torch.full( - (batch_size, ), self.max_seq_len_to_capture, + (batch_size, ), 1, dtype=torch.int).cuda() attn_metadata.encoder_seq_lens_tensor = torch.full( - (batch_size, ), self.max_seq_len_to_capture, + (batch_size, ), 1, dtype=torch.int).cuda() attn_metadata.max_encoder_seq_len = self.max_seq_len_to_capture @@ -1782,7 +1796,7 @@ def capture( self.input_buffers.update(intermediate_inputs.tensors) if self._is_encoder_decoder_model: - self._update_input_buffers_for_enc_dec_model(attn_metadata) + self._save_extra_input_buffers_for_enc_dec_model(attn_metadata) if get_pp_group().is_last_rank: self.output_buffers = { @@ -1835,8 +1849,21 @@ def forward( return self.output_buffers - def _update_input_buffers_for_enc_dec_model( + def _save_extra_input_buffers_for_enc_dec_model( self, attn_metadata: AttentionMetadata): + """ + Saves additional input buffers specific to the encoder-decoder model + from the attention metadata. + + This method extracts and stores encoder-decoder related input buffers + from the `attn_metadata` into the `input_buffers` dictionary. The buffers include + encoder sequence lengths, cross-slot mappings, and cross-block tables, which are + essential for the encoder-decoder model during CUDA graph replay. + + Args: + attn_metadata (AttentionMetadata): The attention metadata object from which the + encoder-decoder-specific input buffers are extracted and saved. + """ self.input_buffers["encoder_seq_lens_tensor"] = ( attn_metadata.decode_metadata.encoder_seq_lens_tensor) self.input_buffers["cross_slot_mapping"] = ( @@ -1846,76 +1873,39 @@ def _update_input_buffers_for_enc_dec_model( def _populate_input_buffers_from_enc_dec_model_input( self, attn_metadata: AttentionMetadata, **kwargs): + """ + Populates input buffers with data from the encoder-decoder model's + input and attention metadata. + + This method fills the input buffers with encoder-decoder specific + tensors. It copies data from the `attn_metadata` and keyword arguments + (`kwargs`) into corresponding buffers in the `input_buffers` dictionary. + The copied data includes attention-related metadata as well as input + IDs and positional information for the encoder. + + Args: + attn_metadata (AttentionMetadata): The attention metadata object + containing encoder-decoder-specific tensors (such as + `encoder_seq_lens_tensor`, `cross_slot_mapping`, and + `cross_block_tables`) that are copied into the input buffers. + + **kwargs: Additional keyword arguments containing encoder + specific input. 
+ """ self.input_buffers["encoder_seq_lens_tensor"].copy_( - attn_metadata.encoder_seq_lens_tensor, non_blocking=True) + attn_metadata.decode_metadata.encoder_seq_lens_tensor, + non_blocking=True) self.input_buffers["cross_slot_mapping"].copy_( - attn_metadata.cross_slot_mapping, non_blocking=True) + attn_metadata.decode_metadata.cross_slot_mapping, + non_blocking=True) self.input_buffers["cross_block_tables"].copy_( - attn_metadata.cross_block_tables, non_blocking=True) + attn_metadata.decode_metadata.cross_block_tables, + non_blocking=True) self.input_buffers["encoder_input_ids"].copy_( kwargs['encoder_input_ids'], non_blocking=True) self.input_buffers["encoder_positions"].copy_( kwargs['encoder_positions'], non_blocking=True) - def _save_input_buffers( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - **kwargs, - ): - # Save the input and output buffers. - if self.backend_name == "flashinfer": - self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - "slot_mapping": attn_metadata.slot_mapping, - **kwargs, - } - else: - self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - "slot_mapping": attn_metadata.slot_mapping, - "seq_lens_tensor": - attn_metadata.decode_metadata.seq_lens_tensor, - "block_tables": attn_metadata.decode_metadata.block_tables, - **kwargs, - } - if intermediate_inputs is not None: - self.input_buffers.update(intermediate_inputs.tensors) - - def _populate_input_buffers_from_model_input( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - **kwargs, - ): - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, - non_blocking=True) - if self.backend_name != "flashinfer": - self.input_buffers["seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.seq_lens_tensor, - non_blocking=True) - self.input_buffers["block_tables"].copy_( - attn_metadata.decode_metadata.block_tables, non_blocking=True) - if "seqlen_agnostic_capture_inputs" in self.input_buffers: - self.model.copy_inputs_before_cuda_graphs(self.input_buffers, - **kwargs) - if intermediate_tensors is not None: - for key in intermediate_tensors.tensors: - if key != "model_execute_time": - self.input_buffers[key].copy_(intermediate_tensors[key], - non_blocking=True) - def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) From 599cd6b34679a482b8652b7f3b1b829b023fc2b9 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 01:07:37 +0000 Subject: [PATCH 06/38] Move logic to backend/utils.py --- tests/models/test_bart.py | 8 +- .../test_encoder_decoder_model_runner.py | 23 +-- vllm/attention/backends/abstract.py | 17 +- vllm/attention/backends/flashinfer.py | 12 +- vllm/attention/backends/utils.py | 114 +++++++++++- vllm/config.py | 5 +- vllm/worker/enc_dec_model_runner.py | 11 +- vllm/worker/model_runner.py | 170 +++++------------- 8 files changed, 190 insertions(+), 170 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index e1e4576bc788b..8f3db94f8d8b4 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -4,8 +4,6 @@ """ from typing import List, Optional, Tuple 
-from transformers import TransfoXLForSequenceClassification - from vllm.utils import is_cpu if not is_cpu(): @@ -153,8 +151,7 @@ def test_models( # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. - with vllm_runner(model, dtype=dtype, - enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) @@ -172,7 +169,6 @@ def test_models( num_outputs_0_skip_tokens=hf_skip_tokens, ) - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [2048]) @@ -271,7 +267,7 @@ def test_model_with_cuda_graph( } with hf_runner(model, dtype=dtype, - is_encoder_decoder_model=True) as hf_model: + auto_cls=AutoModelForSeq2SeqLM) as hf_model: hf_outputs = ( hf_model.generate_encoder_decoder_greedy_logprobs_limit( test_case_prompts, diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 5ce1ae3bfd7e3..94d74df31faf0 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -3,20 +3,18 @@ import pytest import torch +import itertools from vllm.engine.arg_utils import EngineArgs from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, SequenceData, SequenceGroupMetadata) from vllm.utils import is_cpu +from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -# CUDA graph scenarios to test -# -# Currently CUDA graph is not supported -ENFORCE_EAGER = [True, False] +ENFORCE_EAGER = [True] -#BATCH_SIZES = [1, 4, 6, 16, 64, 256] -BATCH_SIZES = [6] +BATCH_SIZES = [1, 4, 6, 16, 64, 256] def _create_model_runner(model: str, *args, @@ -356,13 +354,6 @@ def test_prepare_decode( encoder_input_tokens = model_input.encoder_input_tokens encoder_input_positions = model_input.encoder_input_positions cross_slot_mapping = attn_metadata.cross_slot_mapping - - # If CUDA Graph is enabled then tensors will be padded - # to fix length. Fix the expected tensors. - if enforce_eager is False: - - - assert return_seq_lens == seq_lens assert len(slot_mapping) == len(input_tokens) assert len(cross_slot_mapping) == len(encoder_input_tokens) @@ -436,10 +427,8 @@ def test_prepare_decode( expected, ) - # Verify CUDA Graph related setting - assert attn_metadata.use_cuda_graph is not enforce_eager - - + # Cuda graph should is currently not supported for encoder/decoer. + assert attn_metadata.use_cuda_graph is False # Verify the lengths of input tokens & positions # - Decoder diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index ccfc6b254c1e7..2a981a2fb7c59 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -154,18 +154,27 @@ def graph_clone(self, batch_size: int) -> "AttentionState[T]": ... @abstractmethod - def graph_capture_get_metadata_for_batch(self, batch_size: int) -> T: + def graph_capture_get_metadata_for_batch( + self, + batch_size: int, + is_encoder_decoder_model: bool = False) -> T: """Get attention metadata for CUDA graph capture of batch_size.""" ... 
@abstractmethod - def get_graph_input_buffers(self, attn_metadata: T) -> Dict[str, Any]: + def get_graph_input_buffers( + self, + attn_metadata: T, + is_encoder_decoder_model: bool = False) -> Dict[str, Any]: """Get attention-specific input buffers for CUDA graph capture.""" ... @abstractmethod - def prepare_graph_input_buffers(self, input_buffers: Dict[str, Any], - attn_metadata: T) -> None: + def prepare_graph_input_buffers( + self, + input_buffers: Dict[str, Any], + attn_metadata: T, + is_encoder_decoder_model: bool = False) -> None: """In-place modify input buffers dict for CUDA graph replay.""" ... diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index ce7a7198dc400..cdf6185ad9a11 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -163,7 +163,8 @@ def graph_clone(self, batch_size: int): state._prefill_wrapper = self._get_prefill_wrapper() return state - def graph_capture_get_metadata_for_batch(self, batch_size: int): + def graph_capture_get_metadata_for_batch( + self, batch_size: int, is_encoder_decoder_model: bool = False): assert self._is_graph_capturing _indptr_buffer = self._graph_indptr_buffer[:batch_size + 1] _last_page_len_buffer = self._graph_last_page_len_buffer[:batch_size] @@ -219,12 +220,17 @@ def graph_capture_get_metadata_for_batch(self, batch_size: int): attn_metadata.begin_forward() return attn_metadata - def get_graph_input_buffers(self, attn_metadata): + def get_graph_input_buffers(self, + attn_metadata, + is_encoder_decoder_model: bool = False): return { "slot_mapping": attn_metadata.slot_mapping, } - def prepare_graph_input_buffers(self, input_buffers, attn_metadata): + def prepare_graph_input_buffers(self, + input_buffers, + attn_metadata, + is_encoder_decoder_model: bool = False): return def begin_forward(self, model_input): diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 0375d3488eb15..b079725dd205d 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -304,7 +304,8 @@ def graph_clone(self, batch_size: int) -> "CommonAttentionState": assert self._is_graph_capturing return self.__class__(self.runner) - def graph_capture_get_metadata_for_batch(self, batch_size: int): + def graph_capture_get_metadata_for_batch( + self, batch_size: int, is_encoder_decoder_model: bool = False): assert self._is_graph_capturing attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, @@ -322,21 +323,122 @@ def graph_capture_get_metadata_for_batch(self, batch_size: int): block_tables=self._graph_block_tables[:batch_size], use_cuda_graph=True, ) + if is_encoder_decoder_model: + print('Here !!!') + # The encoder decoder model works only with XFormers backend. + # Assert the same. 
+ assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" + self._update_captured_metadata_for_enc_dec_model( + batch_size=batch_size, attn_metadata=attn_metadata) + return attn_metadata - def get_graph_input_buffers(self, attn_metadata) -> Dict[str, Any]: - return { + def get_graph_input_buffers( + self, + attn_metadata, + is_encoder_decoder_model: bool = False) -> Dict[str, Any]: + input_buffers = { "slot_mapping": attn_metadata.slot_mapping, "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, "block_tables": attn_metadata.decode_metadata.block_tables, } - - def prepare_graph_input_buffers(self, input_buffers, - attn_metadata) -> None: + if is_encoder_decoder_model: + # The encoder decoder model works only with XFormers backend. + # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" + self._add_additonal_input_buffers_for_enc_dec_model( + attn_metadata=attn_metadata, input_buffers=input_buffers) + return input_buffers + + def prepare_graph_input_buffers( + self, + input_buffers, + attn_metadata, + is_encoder_decoder_model: bool = False) -> None: input_buffers["seq_lens_tensor"].copy_( attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) input_buffers["block_tables"].copy_( attn_metadata.decode_metadata.block_tables, non_blocking=True) + if is_encoder_decoder_model: + # The encoder decoder model works only with XFormers backend. + # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" + self._prepare_input_buffers_for_enc_dec_model( + attn_metadata, input_buffers) def begin_forward(self, model_input) -> None: return + + def _update_captured_metadata_for_enc_dec_model(self, batch_size: int, + attn_metadata): + """ + Updates the attention metadata parameters for CUDA graph capture in an + encoder-decoder model. + + This method modifies attention-related tensors and metadata required + for CUDA graph capture in encoder-decoder models. Specifically, it + updates the cross-attention and encoder sequence tensors in the + AttentionMetadata object. + """ + # During decode phase the cross_slot_mapping will be empty. Hence set + # an empty tensor for CUDA Graph capture. + attn_metadata.cross_slot_mapping = torch.tensor( + [], dtype=torch.int).cuda() + attn_metadata.cross_block_tables = torch.full( + (batch_size, self.runner.get_max_block_per_batch()), + 1, + dtype=torch.int).cuda() + attn_metadata.encoder_seq_lens = torch.full((batch_size, ), + 1, + dtype=torch.int).cuda() + attn_metadata.encoder_seq_lens_tensor = torch.full( + (batch_size, ), 1, dtype=torch.int).cuda() + attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture + + def _add_additonal_input_buffers_for_enc_dec_model( + self, attn_metadata, input_buffers: Dict[str, Any]): + """ + Saves additional input buffers specific to the encoder-decoder model + from the attention metadata. + + This method extracts and stores encoder-decoder related input buffers + from the `attn_metadata` into the `input_buffers` dictionary. The + buffers include encoder sequence lengths, cross-slot mappings, and + cross-block tables, which are essential for the encoder-decoder model + during CUDA graph replay. 
+ """ + input_buffers["encoder_seq_lens_tensor"] = ( + attn_metadata.decode_metadata.encoder_seq_lens_tensor) + input_buffers["cross_slot_mapping"] = ( + attn_metadata.decode_metadata.cross_slot_mapping) + input_buffers["cross_block_tables"] = ( + attn_metadata.decode_metadata.cross_block_tables) + + def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata, + input_buffers: Dict[str, + Any]): + """ + Populates input buffers with data from the encoder-decoder model's + attention metadata. + + This method fills the input buffers with encoder-decoder specific + tensors. It copies data from the `attn_metadata` and keyword arguments + (`kwargs`) into corresponding buffers in the `input_buffers` dictionary. + The copied data includes attention-related metadata as well as input + IDs and positional information for the encoder. + """ + input_buffers["encoder_seq_lens_tensor"].copy_( + attn_metadata.decode_metadata.encoder_seq_lens_tensor, + non_blocking=True) + input_buffers["cross_slot_mapping"].copy_( + attn_metadata.decode_metadata.cross_slot_mapping, + non_blocking=True) + input_buffers["cross_block_tables"].copy_( + attn_metadata.decode_metadata.cross_block_tables, + non_blocking=True) diff --git a/vllm/config.py b/vllm/config.py index 84355f3172f4b..bf4238260bef4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -16,9 +16,8 @@ from vllm.transformers_utils.config import (get_config, get_hf_image_processor_config, get_hf_text_config) -from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes, - cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_openvino, is_xpu, +from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, + is_cpu, is_hip, is_neuron, is_openvino, is_xpu, print_warning_once) if TYPE_CHECKING: diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 0cc00aa6cce37..d09c1ad651a4b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,9 +1,9 @@ import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, cast import itertools +from typing import Any, Dict, List, Optional, Tuple, Type, cast + import torch import torch.distributed -import torch.nn as nn from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) @@ -28,8 +28,7 @@ _get_graph_batch_size) from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, -) + _add_sampling_metadata_broadcastable_dict) from vllm.worker.utils import assert_enc_dec_mr_supported_scenario logger = init_logger(__name__) @@ -243,16 +242,12 @@ def prepare_model_input( """ model_input = self._prepare_model_input_tensors( seq_group_metadata_list, finished_requests_ids) - ( attn_metadata, encoder_input_tokens_tensor, encoder_input_positions_tensor, ) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input)) - print('attn_metadata.max_encoder_seq_len 1 ' + - str(attn_metadata.max_encoder_seq_len)) - # Inject attn_metadata encoder/cross-attention fields & # encoder input tokens/positions into model_input. 
# Frozen dataclass fields cannot be modified, so use diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e1bdb022f324b..6ebf91edfc5e3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -640,10 +640,6 @@ def _use_captured_graph(self, batch_size: int, max_decode_seq_len: int, max_encoder_seq_len: int = 0) -> bool: - print('batch_size ' + str(batch_size)) - print('max_decode_seq_len ' + str(max_decode_seq_len)) - print('max_seq_len_to_capture ' + - str(self.runner.max_seq_len_to_capture)) return (self.decode_only and not self.runner.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture @@ -707,8 +703,6 @@ def build(self) -> ModelInputForGPU: cuda_graph_pad_size = graph_batch_size - batch_size batch_size = graph_batch_size - #print('cuda_graph_pad_size ' + str(cuda_graph_pad_size)) - #print('use_captured_graph ' + str(use_captured_graph)) # Tokens and positions. if cuda_graph_pad_size: input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) @@ -1242,10 +1236,30 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: for virtual_engine in range( self.parallel_config.pipeline_parallel_size): for batch_size in reversed(batch_size_capture_list): - + attn_metadata = ( + self.attn_state.graph_capture_get_metadata_for_batch( + batch_size, + is_encoder_decoder_model=self.model_config. + is_encoder_decoder_model)) + + if self.lora_config: + lora_mapping = LoRAMapping( + **dict(index_mapping=[0] * batch_size, + prompt_mapping=[0] * batch_size, + is_prefill=False)) + self.set_active_loras(set(), lora_mapping) + + if self.prompt_adapter_config: + prompt_adapter_mapping = PromptAdapterMapping( + [-1] * batch_size, + [-1] * batch_size, + ) + self.set_active_prompt_adapters( + set(), prompt_adapter_mapping) graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), - self.attn_state.graph_clone(batch_size)) + self.attn_state.graph_clone(batch_size), + self.model_config.is_encoder_decoder_model) capture_inputs = { "input_ids": @@ -1278,7 +1292,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: batch_size) }) if self.model_config.is_encoder_decoder_model: - # add the additional inputs to capture for encoder-decoder models. + # add the additional inputs to capture for + # encoder-decoder models. self._update_inputs_to_capture_for_enc_dec_model( capture_inputs) @@ -1299,56 +1314,17 @@ def _update_inputs_to_capture_for_enc_dec_model(self, Updates the set of input tensors needed for CUDA graph capture in an encoder-decoder model. - This method modifies the provided `capture_inputs` dictionary by adding - tensors specific to encoder-decoder specific models that need to be captured - for CUDA Graph replay. - - Args: - capture_inputs (Dict[str, Any]): A dictionary where input tensors are stored - for CUDA Graph capture. + This method modifies the provided `capture_inputs` dictionary by + adding tensors specific to encoder-decoder specific models that + need to be captured for CUDA Graph replay. """ - # During the decode phase encoder_input_ids and encoder_positions are unset. - # Do the same thing for graph capture. + # During the decode phase encoder_input_ids and encoder_positions are + # unset. Do the same thing for graph capture. 
capture_inputs["encoder_input_ids"] = torch.tensor( [], dtype=torch.long).cuda() capture_inputs["encoder_positions"] = torch.tensor( [], dtype=torch.long).cuda() - def _update_captured_attn_metadata_for_enc_dec_model( - self, batch_size: int, attn_metadata: AttentionMetadata, - attn_backend_name: str): - """ - Updates the attention metadata parameters for CUDA graph capture in an - encoder-decoder model. - - This method modifies attention-related tensors and metadata required - for CUDA graph capture in encoder-decoder models. Specifically, it - updates the cross-attention and encoder sequence tensors in the - AttentionMetadata object. - - Args: - batch_size (int): The size of the batch for which CUDA graph - capture is being performed. - attn_metadata (AttentionMetadata): The AttentionMetadata object to - be updated with encoder-decoder specific parameters. - """ - # During decode phase the cross_slot_mapping will be empty. Hence set - # an empty tensor for CUDA Graph capture. - attn_metadata.cross_slot_mapping = torch.tensor( - [], dtype=torch.int).cuda() - cross_block_tables = torch.from_numpy( - np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), - dtype=np.int32)).cuda() - attn_metadata.cross_block_tables = cross_block_tables[:batch_size] - attn_metadata.encoder_seq_lens = torch.full( - (batch_size, ), 1, - dtype=torch.int).cuda() - attn_metadata.encoder_seq_lens_tensor = torch.full( - (batch_size, ), 1, - dtype=torch.int).cuda() - attn_metadata.max_encoder_seq_len = self.max_seq_len_to_capture - @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -1543,7 +1519,7 @@ def execute_model( class CUDAGraphRunner: def __init__(self, model: nn.Module, backend_name: str, - attn_state: AttentionState): + attn_state: AttentionState, is_encoder_decoder_model: bool): self.model = model self.backend_name = backend_name self.attn_state = attn_state @@ -1552,6 +1528,7 @@ def __init__(self, model: nn.Module, backend_name: str, self.output_buffers: Dict[str, torch.Tensor] = {} self._graph: Optional[torch.cuda.CUDAGraph] = None + self._is_encoder_decoder_model = is_encoder_decoder_model @property def graph(self): @@ -1576,7 +1553,7 @@ def capture( # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). # Note one iteration is not enough for torch.jit.script - for i in range(_NUM_WARMUP_ITERS): + for _ in range(_NUM_WARMUP_ITERS): self.model( input_ids, positions, @@ -1616,18 +1593,19 @@ def capture( # Save the input and output buffers. 
self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - **self.attn_state.get_graph_input_buffers(attn_metadata), + "input_ids": + input_ids, + "positions": + positions, + "kv_caches": + kv_caches, + **self.attn_state.get_graph_input_buffers( + attn_metadata, self._is_encoder_decoder_model), **kwargs, } if intermediate_inputs is not None: self.input_buffers.update(intermediate_inputs.tensors) - if self._is_encoder_decoder_model: - self._save_extra_input_buffers_for_enc_dec_model(attn_metadata) - if get_pp_group().is_last_rank: self.output_buffers = { "hidden_states": hidden_or_intermediate_states @@ -1653,8 +1631,8 @@ def forward( self.input_buffers["positions"].copy_(positions, non_blocking=True) self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, non_blocking=True) - self.attn_state.prepare_graph_input_buffers(self.input_buffers, - attn_metadata) + self.attn_state.prepare_graph_input_buffers( + self.input_buffers, attn_metadata, self._is_encoder_decoder_model) if "seqlen_agnostic_capture_inputs" in self.input_buffers: self.model.copy_inputs_before_cuda_graphs(self.input_buffers, **kwargs) @@ -1664,8 +1642,11 @@ def forward( self.input_buffers[key].copy_(intermediate_tensors[key], non_blocking=True) if self._is_encoder_decoder_model: - self._populate_input_buffers_from_enc_dec_model_input( - attn_metadata, **kwargs) + self.input_buffers["encoder_input_ids"].copy_( + kwargs['encoder_input_ids'], non_blocking=True) + self.input_buffers["encoder_positions"].copy_( + kwargs['encoder_positions'], non_blocking=True) + # Run the graph. self.graph.replay() # Return the output tensor. @@ -1674,63 +1655,6 @@ def forward( return self.output_buffers - def _save_extra_input_buffers_for_enc_dec_model( - self, attn_metadata: AttentionMetadata): - """ - Saves additional input buffers specific to the encoder-decoder model - from the attention metadata. - - This method extracts and stores encoder-decoder related input buffers - from the `attn_metadata` into the `input_buffers` dictionary. The buffers include - encoder sequence lengths, cross-slot mappings, and cross-block tables, which are - essential for the encoder-decoder model during CUDA graph replay. - - Args: - attn_metadata (AttentionMetadata): The attention metadata object from which the - encoder-decoder-specific input buffers are extracted and saved. - """ - self.input_buffers["encoder_seq_lens_tensor"] = ( - attn_metadata.decode_metadata.encoder_seq_lens_tensor) - self.input_buffers["cross_slot_mapping"] = ( - attn_metadata.decode_metadata.cross_slot_mapping) - self.input_buffers["cross_block_tables"] = ( - attn_metadata.decode_metadata.cross_block_tables) - - def _populate_input_buffers_from_enc_dec_model_input( - self, attn_metadata: AttentionMetadata, **kwargs): - """ - Populates input buffers with data from the encoder-decoder model's - input and attention metadata. - - This method fills the input buffers with encoder-decoder specific - tensors. It copies data from the `attn_metadata` and keyword arguments - (`kwargs`) into corresponding buffers in the `input_buffers` dictionary. - The copied data includes attention-related metadata as well as input - IDs and positional information for the encoder. - - Args: - attn_metadata (AttentionMetadata): The attention metadata object - containing encoder-decoder-specific tensors (such as - `encoder_seq_lens_tensor`, `cross_slot_mapping`, and - `cross_block_tables`) that are copied into the input buffers. 
- - **kwargs: Additional keyword arguments containing encoder - specific input. - """ - self.input_buffers["encoder_seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.encoder_seq_lens_tensor, - non_blocking=True) - self.input_buffers["cross_slot_mapping"].copy_( - attn_metadata.decode_metadata.cross_slot_mapping, - non_blocking=True) - self.input_buffers["cross_block_tables"].copy_( - attn_metadata.decode_metadata.cross_block_tables, - non_blocking=True) - self.input_buffers["encoder_input_ids"].copy_( - kwargs['encoder_input_ids'], non_blocking=True) - self.input_buffers["encoder_positions"].copy_( - kwargs['encoder_positions'], non_blocking=True) - def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) From a9ca02f9996aa0b8e138013ce9ec96fe8d06eeec Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 01:09:29 +0000 Subject: [PATCH 07/38] Fix import --- tests/worker/test_encoder_decoder_model_runner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 94d74df31faf0..14ccf28bae203 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -3,13 +3,11 @@ import pytest import torch -import itertools from vllm.engine.arg_utils import EngineArgs from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, SequenceData, SequenceGroupMetadata) from vllm.utils import is_cpu -from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner ENFORCE_EAGER = [True] From 6b09ee86fa81b3c68d018c225b322a825171c99f Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 02:37:17 +0000 Subject: [PATCH 08/38] Fix formatting --- vllm/model_executor/models/bart.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 719f6fd21fd9b..46ae8933ccbb5 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -363,12 +363,14 @@ def forward( qkv_enc, _ = self.qkv_proj(encoder_hidden_states) _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata, attn_type=AttentionType.ENCODER_DECODER) + output, _ = self.out_proj(attn_output) return output @@ -526,6 +528,7 @@ def forward( Decoder layer output torch.Tensor """ residual = decoder_hidden_states + # Self Attention hidden_states = self.self_attn(hidden_states=decoder_hidden_states, kv_cache=kv_cache, @@ -709,6 +712,7 @@ def forward(self, decoder_input_ids: torch.Tensor, Returns: Decoder output torch.Tensor """ + inputs_embeds = self.embed_tokens(decoder_input_ids) # embed positions @@ -730,6 +734,7 @@ def forward(self, decoder_input_ids: torch.Tensor, attn_metadata=attn_metadata, encoder_hidden_states=encoder_hidden_states, ) + return hidden_states From c199b504572b0fb476730e13f9ec2a2702e6f1cf Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 02:39:48 +0000 Subject: [PATCH 09/38] Fix test documentation --- tests/models/test_bart.py | 66 ++------------------------------------- 1 file changed, 3 insertions(+), 63 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 8f3db94f8d8b4..9921329cf24cb 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -36,7 +36,7 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model", MODELS) 
@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) - @pytest.mark.parametrize("max_tokens", [2048]) + @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) def test_models( @@ -187,68 +187,8 @@ def test_model_with_cuda_graph( ''' Test the vLLM BART model for a variety of encoder/decoder input prompts, by validating it against HuggingFace (HF) BART. - - Arguments: - - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - * model: the HF ID of the specific BART variant under test - * dtype: the tensor datatype to employ - * max_tokens - * num_logprobs - * decoder_prompt_type: key into the example_encoder_decoder_prompts - dictionary; selects specific encoder/decoder - prompt scenarios to test - - A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - - The HF GenerationMixin's default behavior is to force the first - decoded token to be if the prompt does not already contain - (this is accomplished using a logit - processor setting.) - - So when we use HF BART as our baseline for comparison, note that - when the user provides a request with a None decoder prompt - (i.e. a singleton encoder prompt, or else an explicit encoder/ - decoder prompt with the decoder sub-prompt set to None), HF and - vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, - (2) append to the beginning, yielding - [], (3) pass this token list to the model, and - then (4) after computing logits during prefill, override the model - logits & force to be the first generated token. - - * vLLM will (1) tokenize the None prompt as [], (2) append decoder- - start-token to the beginning, yielding [], - (3) pass these tokens to the model & proceed with generation. - - The net effect is that compared to vLLM, the list of HF *decoded* tokens - will contain one more initial than the vLLM generated tokens, - because vLLM's token is injected into the prompt rather than into - the generated output. This is in spite of the fact that overall, the - complete sequences (prompt + decoded tokens) produced by vLLM will match - HF. - - So when we use HF decoded token output to validate vLLM's decoded token - output, the testing process must account for the difference in decoded - token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. - - One option is to disable the logit processor feature that forces the - token to be decoded (forced_bos_token_id = None), eliminating - the problem entirely. However this is not "normal" BART usage. - - The other option is - only in the decoder-prompt-is-None case - to - discard the first decoded token from the HF output before comparing it - to vLLM. - - To that end, when testing the scenario where the decoder prompt is None - (and only in that one scenario), this test skips the first HF decoded - token during the process of validating the vLLM decoded output. + This test is same as test_models above except that for this test we enable + CUDA Graph capture and replay. 
''' test_case_prompts = example_encoder_decoder_prompts[ From 539b10e0d192bd03a23d6e90254d003e96ce6c14 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 03:38:17 +0000 Subject: [PATCH 10/38] Remove debug stmt --- tests/models/test_bart.py | 7 +++---- vllm/attention/backends/utils.py | 1 - vllm/worker/enc_dec_model_runner.py | 2 -- vllm/worker/model_runner.py | 2 ++ vllm/worker/utils.py | 4 ---- 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 9921329cf24cb..c4e79ffec99f2 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -187,8 +187,8 @@ def test_model_with_cuda_graph( ''' Test the vLLM BART model for a variety of encoder/decoder input prompts, by validating it against HuggingFace (HF) BART. - This test is same as test_models above except that for this test we enable - CUDA Graph capture and replay. + This test is same as test_models above except that for this test we + enable CUDA Graph capture and replay. ''' test_case_prompts = example_encoder_decoder_prompts[ @@ -216,8 +216,7 @@ def test_model_with_cuda_graph( **hf_kwargs, )) - with vllm_runner(model, dtype=dtype, - enforce_eager=False) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index b079725dd205d..089008967a244 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -324,7 +324,6 @@ def graph_capture_get_metadata_for_batch( use_cuda_graph=True, ) if is_encoder_decoder_model: - print('Here !!!') # The encoder decoder model works only with XFormers backend. # Assert the same. 
assert self.runner.attn_backend.get_name() == "xformers", \ diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index d09c1ad651a4b..b03c6f9c41697 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -182,13 +182,11 @@ def execute_model( if (model_input.attn_metadata is not None and model_input.attn_metadata.prefill_metadata is None and model_input.attn_metadata.decode_metadata.use_cuda_graph): - print('Executing as cuda graph') assert model_input.input_tokens is not None graph_batch_size = model_input.input_tokens.shape[0] model_executable = self.graph_runners[ model_input.virtual_engine][graph_batch_size] else: - print('Executing without cuda graph') model_executable = self.model seqlen_agnostic_kwargs = { diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6ebf91edfc5e3..0b151a589f257 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -993,6 +993,8 @@ def save_tensorized_model( def get_max_block_per_batch(self) -> int: block_size = self.block_size + print('block_size ' + str(block_size)) + print('max_seq_len_to_capture ' + str(self.max_seq_len_to_capture)) return (self.max_seq_len_to_capture + block_size - 1) // block_size def _prepare_model_input_tensors( diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index b63499210299a..3ff6d3c322dc1 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -47,10 +47,6 @@ def assert_enc_dec_mr_supported_scenario( raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) - #if not enc_dec_mr.model_config.enforce_eager: - # raise NotImplementedError( - # STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH']) - if enc_dec_mr.prompt_adapter_config is not None: raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ 'STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER']) From 727b3f2e96b083b4007a93e96ce789cc75db19a5 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 04:07:54 +0000 Subject: [PATCH 11/38] Add a new test --- .../test_encoder_decoder_model_runner.py | 152 +++++++++++++++++- vllm/worker/model_runner.py | 2 - 2 files changed, 151 insertions(+), 3 deletions(-) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 14ccf28bae203..da0340b82a23f 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,3 +1,4 @@ +import itertools from array import array from typing import List @@ -7,8 +8,9 @@ from vllm.engine.arg_utils import EngineArgs from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, SequenceData, SequenceGroupMetadata) -from vllm.utils import is_cpu +from vllm.utils import is_cpu, make_tensor_with_pad from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner +from vllm.worker.model_runner import _get_graph_batch_size ENFORCE_EAGER = [True] @@ -481,3 +483,151 @@ def test_prepare_decode( dtype=actual.dtype, ) assert torch.equal(actual, expected) + + +@pytest.mark.parametrize("batch_size", list(range(1, 257))) +def test_prepare_decode_cuda_graph(batch_size): + """ + Tests that for encoder-decoder models with CUDA Graph capture and replay + enabled, the tensors used during the decode phase are correctly padded + for varying input batch sizes. 
+ """ + model_runner = _create_model_runner( + "facebook/bart-base", + seed=0, + dtype="float16", + max_num_batched_tokens=100000, + max_num_seqs=100000, + enable_chunked_prefill=False, + enforce_eager=False, + ) + + seq_lens: List[int] = [] + encoder_seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + block_tables = {0: [1]} + cross_block_table = [2] + for i in range(batch_size): + # make sure all tokens fit into one block + seq_len = i % (model_runner.block_size - 1) + 1 + seq_lens.append(seq_len) + seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(seq_len)))) + encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 + encoder_seq_lens.append(encoder_seq_len) + encoder_seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(encoder_seq_len)))) + seq_group_metadata = SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=False, + seq_data={0: seq_data}, + sampling_params=SamplingParams(temperature=0), + block_tables=block_tables, + encoder_seq_data=encoder_seq_data, + cross_block_table=cross_block_table, + ) + assert seq_group_metadata.token_chunk_size == 1 + seq_group_metadata_list.append(seq_group_metadata) + + model_input = model_runner.prepare_model_input(seq_group_metadata_list) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + return_seq_lens = model_input.seq_lens + slot_mapping = attn_metadata.slot_mapping + encoder_input_tokens = model_input.encoder_input_tokens + encoder_input_positions = model_input.encoder_input_positions + cross_slot_mapping = attn_metadata.cross_slot_mapping + + # With CUDA Graph capture and replay enabled, the decoder and encoder + # input sequences will be padded. Create the expected padded tensors + # accordingly. + graph_batch_size = _get_graph_batch_size(batch_size) + cuda_graph_pad_size = graph_batch_size - batch_size + padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) + padded_encoder_seq_lens = encoder_seq_lens + list( + itertools.repeat(1, cuda_graph_pad_size)) + + assert return_seq_lens == padded_seq_lens + assert len(slot_mapping) == len(input_tokens) + assert len(cross_slot_mapping) == len(encoder_input_tokens) + + # Verify attention metadata + device = model_runner.device + assert attn_metadata.num_prefills == 0 + assert attn_metadata.num_decode_tokens > 0 + assert torch.equal( + attn_metadata.seq_lens_tensor, + torch.tensor(padded_seq_lens, device=device, dtype=torch.int)) + assert attn_metadata.seq_lens == padded_seq_lens + assert attn_metadata.max_prefill_seq_len == 0 + assert attn_metadata.max_decode_seq_len == max(seq_lens) + # - Encoder attention metadata + assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens + assert torch.equal( + attn_metadata.encoder_seq_lens_tensor, + torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int)) + assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens) + assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens) + + # Verify block tables are correct for prompts + # - Decoder self-attention. Pad the block tables as expected. + expected = [block_tables[0] for _ in range(batch_size)] + expected.extend([[] for _ in range(cuda_graph_pad_size)]) + expected = make_tensor_with_pad( + expected, + max_len=64, + pad=0, + dtype=torch.int32, + device=model_runner.device, + ) + assert torch.equal( + attn_metadata.block_tables, + expected, + ) + # - Encoder/decoder cross-attention. 
Pad the cross-attention block tables + # as expected. + expected = [cross_block_table for _ in range(len(seq_group_metadata_list))] + expected.extend([[] for _ in range(cuda_graph_pad_size)]) + expected = make_tensor_with_pad( + expected, + max_len=64, + pad=0, + dtype=torch.int32, + device=model_runner.device, + ) + assert torch.equal( + attn_metadata.cross_block_tables, + expected, + ) + + # Cuda graph should is currently not supported for encoder/decoer. + assert attn_metadata.use_cuda_graph is True + + # Verify the lengths of input tokens & positions + # - Decoder + assert len(input_tokens) == len(padded_seq_lens) + assert len(input_positions) == len(padded_seq_lens) + # -- An indirect check that model_input.input_tokens + # and model_input.input_positions are correct - + # by design of the test, the input tokens are + # equal to the input position values, so if + # the model_input data structure has the correct + # values then these two should be equal + assert torch.equal( + input_tokens, + input_positions, + ) + # - Encoder + assert len(encoder_input_tokens) == 0 + assert len(encoder_input_tokens) == 0 + # -- An indirect check that model_input.encoder_input_tokens + # and model_input.encoder_input_positions are correct - + # by design of the test, the input tokens are + # equal to the input position values, so if + # the model_input data structure has the correct + # values then these two should be equal + assert torch.equal( + encoder_input_tokens, + encoder_input_positions, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0b151a589f257..6ebf91edfc5e3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -993,8 +993,6 @@ def save_tensorized_model( def get_max_block_per_batch(self) -> int: block_size = self.block_size - print('block_size ' + str(block_size)) - print('max_seq_len_to_capture ' + str(self.max_seq_len_to_capture)) return (self.max_seq_len_to_capture + block_size - 1) // block_size def _prepare_model_input_tensors( From e2e16cf545f9d3bdd9acb0256368962536295ff6 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 04:09:13 +0000 Subject: [PATCH 12/38] Fix Batch Size --- tests/worker/test_encoder_decoder_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index da0340b82a23f..298d750297828 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -14,7 +14,7 @@ ENFORCE_EAGER = [True] -BATCH_SIZES = [1, 4, 6, 16, 64, 256] +BATCH_SIZES = [1, 4, 16, 64, 256] def _create_model_runner(model: str, *args, From 1fb7cc681757c4bae3633946d4b8f68e80518aa9 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 05:45:39 +0000 Subject: [PATCH 13/38] Fix comments --- vllm/config.py | 4 +++- vllm/engine/arg_utils.py | 21 ++++++++++++--------- vllm/entrypoints/llm.py | 4 +++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index bf4238260bef4..3b823a399f1e2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -99,7 +99,9 @@ class ModelConfig: to eager mode (DEPRECATED. Use max_seq_len_to_capture instead). max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back - to eager mode + to eager mode. 
Additionally for encoder-decoder models, if the + sequence length of the encoder input is larger than this, we fall + back to the eager mode. disable_sliding_window: Whether to disable sliding window. If True, we will disable the sliding window functionality of the model. If the model does not support sliding window, this argument is diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7f45c3d06375a..65076d0c30c33 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -425,20 +425,23 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='Always use eager-mode PyTorch. If False, ' 'will use eager mode and CUDA graph in hybrid ' 'for maximal performance and flexibility.') - parser.add_argument('--max-context-len-to-capture', - type=int, - default=EngineArgs.max_context_len_to_capture, - help='Maximum context length covered by CUDA ' - 'graphs. When a sequence has context length ' - 'larger than this, we fall back to eager mode. ' - '(DEPRECATED. Use --max-seq-len-to-capture instead' - ')') + parser.add_argument( + '--max-context-len-to-capture', + type=int, + default=EngineArgs.max_context_len_to_capture, + help='Maximum context length covered by CUDA ' + 'graphs. When a sequence has context length ' + 'larger than this, we fall back to eager mode. ' + '(DEPRECATED. Use --max-seq-len-to-capture instead)') parser.add_argument('--max-seq-len-to-capture', type=int, default=EngineArgs.max_seq_len_to_capture, help='Maximum sequence length covered by CUDA ' 'graphs. When a sequence has context length ' - 'larger than this, we fall back to eager mode.') + 'larger than this, we fall back to eager mode. ' + 'Additionally for encoder-decoder models, if the ' + 'sequence length of the encoder input is larger ' + 'than this, we fall back to the eager mode.') parser.add_argument('--disable-custom-all-reduce', action='store_true', default=EngineArgs.disable_custom_all_reduce, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 31175724c6c79..8b49dc1049d76 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -87,7 +87,9 @@ class LLM: to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead). max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back - to eager mode. + to eager mode. Additionally for encoder-decoder models, if the + sequence length of the encoder input is larger than this, we fall + back to the eager mode. disable_custom_all_reduce: See ParallelConfig **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) From 79e3928d631014e71db9dc83bd628e56ab0ce67d Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 05:56:55 +0000 Subject: [PATCH 14/38] Fix formatting --- vllm/engine/arg_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 65076d0c30c33..463df84f374f9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -425,14 +425,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='Always use eager-mode PyTorch. If False, ' 'will use eager mode and CUDA graph in hybrid ' 'for maximal performance and flexibility.') - parser.add_argument( - '--max-context-len-to-capture', - type=int, - default=EngineArgs.max_context_len_to_capture, - help='Maximum context length covered by CUDA ' - 'graphs. 
When a sequence has context length ' - 'larger than this, we fall back to eager mode. ' - '(DEPRECATED. Use --max-seq-len-to-capture instead)') + parser.add_argument('--max-context-len-to-capture', + type=int, + default=EngineArgs.max_context_len_to_capture, + help='Maximum context length covered by CUDA ' + 'graphs. When a sequence has context length ' + 'larger than this, we fall back to eager mode. ' + '(DEPRECATED. Use --max-seq-len-to-capture instead' + ')') parser.add_argument('--max-seq-len-to-capture', type=int, default=EngineArgs.max_seq_len_to_capture, From 09f974153aa825653a35324c8f64062b02c0a9ea Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 16:37:59 +0000 Subject: [PATCH 15/38] Fix test to run with CUDA Graph --- tests/models/test_bart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index c4e79ffec99f2..822946a635c21 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -216,7 +216,7 @@ def test_model_with_cuda_graph( **hf_kwargs, )) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) From eb52df89887529a0309d6dece0c154433eced82f Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 17:26:27 +0000 Subject: [PATCH 16/38] fix format --- tests/models/test_bart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 822946a635c21..0756be3c33636 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -216,7 +216,8 @@ def test_model_with_cuda_graph( **hf_kwargs, )) - with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: + with vllm_runner(model, dtype=dtype, + enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) From f27e84ae6db29bcc456981193f9fd1cb2420a133 Mon Sep 17 00:00:00 2001 From: sourashis Date: Fri, 23 Aug 2024 22:30:24 +0000 Subject: [PATCH 17/38] Dummy commit --- tests/models/test_bart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 0756be3c33636..b87d21db9a39c 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -215,7 +215,6 @@ def test_model_with_cuda_graph( num_logprobs, **hf_kwargs, )) - with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( From ada2d05836021316c2a2bc905024455762e25e26 Mon Sep 17 00:00:00 2001 From: sourashis Date: Sat, 24 Aug 2024 07:51:06 +0000 Subject: [PATCH 18/38] Dummy commit --- tests/models/test_bart.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index b87d21db9a39c..0756be3c33636 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -215,6 +215,7 @@ def test_model_with_cuda_graph( num_logprobs, **hf_kwargs, )) + with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( From 7b94a359692bde3de9e00eb76ce985a92fae71ed Mon Sep 17 00:00:00 2001 From: sourashis Date: Sun, 25 Aug 2024 07:13:16 +0000 Subject: [PATCH 19/38] Format --- tests/models/test_bart.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 0756be3c33636..b87d21db9a39c 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -215,7 +215,6 @@ def test_model_with_cuda_graph( num_logprobs, **hf_kwargs, )) - with vllm_runner(model, dtype=dtype, enforce_eager=False) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( From 4b72b1f891e7df8307ea167d8d0136603d21480f Mon Sep 17 00:00:00 2001 From: sourashis Date: Wed, 28 Aug 2024 06:39:47 +0000 Subject: [PATCH 20/38] Addressing comments --- tests/models/test_bart.py | 11 ------- .../test_encoder_decoder_model_runner.py | 29 +++++++------------ vllm/attention/backends/utils.py | 13 --------- vllm/config.py | 4 +-- vllm/entrypoints/llm.py | 4 +-- vllm/utils.py | 5 ---- vllm/worker/model_runner.py | 2 -- 7 files changed, 12 insertions(+), 56 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index b87d21db9a39c..44efa20576e99 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -140,17 +140,6 @@ def test_models( num_logprobs, **hf_kwargs, )) - - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( test_case_prompts, max_tokens, num_logprobs) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 298d750297828..b425613b303b8 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -12,8 +12,6 @@ from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import _get_graph_batch_size -ENFORCE_EAGER = [True] - BATCH_SIZES = [1, 4, 16, 64, 256] @@ -39,8 +37,7 @@ def _create_model_runner(model: str, *args, reason="CPU backend is currently " "unsupported for encoder/ " "decoder models") -@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER) -def test_empty_seq_group(enforce_eager, ): +def test_empty_seq_group(): """Verify prepare prompt and decode returns empty output for empty seq group list""" @@ -51,7 +48,7 @@ def test_empty_seq_group(enforce_eager, ): max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=False, - enforce_eager=enforce_eager, + enforce_eager=True, ) seq_group_metadata_list: List[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( @@ -84,11 +81,7 @@ def test_empty_seq_group(enforce_eager, ): "unsupported for encoder/ " "decoder models") @pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER) -def test_prepare_prompt( - batch_size, - enforce_eager, -): +def test_prepare_prompt(batch_size, ): ''' Test the ability of the encoder/decoder model runner subclass to produce prefill-phase model inputs & attention metadata. 
@@ -114,7 +107,7 @@ def test_prepare_prompt( max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=False, - enforce_eager=enforce_eager, + enforce_eager=True, ) seq_lens: List[int] = [] @@ -280,11 +273,7 @@ def test_prepare_prompt( "unsupported for encoder/ " "decoder models") @pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("enforce_eager", ENFORCE_EAGER) -def test_prepare_decode( - batch_size, - enforce_eager, -): +def test_prepare_decode(batch_size, ): ''' Test the ability of the encoder/decoder model runner subclass to produce decode-phase model inputs & attention metadata. @@ -310,7 +299,7 @@ def test_prepare_decode( max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=False, - enforce_eager=enforce_eager, + enforce_eager=True, ) seq_lens: List[int] = [] @@ -427,7 +416,8 @@ def test_prepare_decode( expected, ) - # Cuda graph should is currently not supported for encoder/decoer. + # Model runner's CUDAGraph setting should be propagated to attention + # metadata. assert attn_metadata.use_cuda_graph is False # Verify the lengths of input tokens & positions @@ -601,7 +591,8 @@ def test_prepare_decode_cuda_graph(batch_size): expected, ) - # Cuda graph should is currently not supported for encoder/decoer. + # Model runner's CUDAGraph setting should be propagated to attention + # metadata. assert attn_metadata.use_cuda_graph is True # Verify the lengths of input tokens & positions diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 089008967a244..306b84702abfd 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -326,9 +326,6 @@ def graph_capture_get_metadata_for_batch( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ - f" got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -344,11 +341,6 @@ def get_graph_input_buffers( "block_tables": attn_metadata.decode_metadata.block_tables, } if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ - f" got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -363,11 +355,6 @@ def prepare_graph_input_buffers( input_buffers["block_tables"].copy_( attn_metadata.decode_metadata.block_tables, non_blocking=True) if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ - f" got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) diff --git a/vllm/config.py b/vllm/config.py index 74ea2dc2a62a7..009f1a29196e5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -92,9 +92,7 @@ class ModelConfig: enforce_eager: Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. 
- If None, the user did not specify, so default to False - - except for encoder/decoder models, which currently require - eager mode. + If None, the user did not specify, so default to False. max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode (DEPRECATED. Use max_seq_len_to_capture instead). diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 72cd78de610ad..6b734c6838080 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -138,9 +138,7 @@ def __init__( LLM constructor. Note: if enforce_eager is unset (enforce_eager is None) - it defaults to False for decoder-only models and True - for encoder/decoder models, since encoder/decoder models - do not currently support CUDAGraph. + it defaults to False. ''' if "disable_log_stats" not in kwargs: diff --git a/vllm/utils.py b/vllm/utils.py index 0b7457a70b362..b4f49177c0c97 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -68,10 +68,6 @@ "currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_CUDAGRAPH = ("CUDAGraph is not " - "currently supported with encoder/" - "decoder models.") - STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend " "currently supported with encoder/" "decoder models.") @@ -92,7 +88,6 @@ "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP, "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, - "STR_NOT_IMPL_ENC_DEC_CUDA_GRAPH": STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, } diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6e92f6da3c178..a2ffe311fc68b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -855,7 +855,6 @@ def __init__( self.graph_block_tables = np.zeros( (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), dtype=np.int32) - num_attn_heads = self.model_config.get_num_attention_heads( self.parallel_config) self.attn_backend = get_attn_backend( @@ -1632,7 +1631,6 @@ def capture( } if intermediate_inputs is not None: self.input_buffers.update(intermediate_inputs.tensors) - if get_pp_group().is_last_rank: self.output_buffers = { "hidden_states": hidden_or_intermediate_states From 525c541e195f7fdd0ff318bdb8d08a6e4991cdba Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 20:06:58 +0000 Subject: [PATCH 21/38] Addressing comments --- .buildkite/test-pipeline.yaml | 7 ++ tests/conftest.py | 2 + tests/models/test_bart.py | 65 ------------------- .../test_encoder_decoder_model_runner.py | 4 +- vllm/worker/enc_dec_model_runner.py | 3 + 5 files changed, 14 insertions(+), 67 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 235db72eee4b9..5510e651d4857 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -270,6 +270,13 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: Encoder Decoder tests # 5min + source_file_dependencies: + - vllm/ + - tests/encoder_decoder + commands: + - pytest -v -s encoder_decoder + ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/tests/conftest.py b/tests/conftest.py index d8264f65b6149..667c251463a38 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -703,6 +703,8 @@ def generate_encoder_decoder_greedy_logprobs( 
use_beam_search=False, max_tokens=max_tokens, logprobs=num_logprobs) + print('max tokens 123 ' + str(max_tokens)) + print('greedy_logprobs_params ' + str(greedy_logprobs_params)) ''' Greedy logprobs generation for vLLM encoder/decoder models ''' diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 44efa20576e99..cbcabd0eaf14f 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -157,68 +157,3 @@ def test_models( name_1="vllm", num_outputs_0_skip_tokens=hf_skip_tokens, ) - - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) - @pytest.mark.parametrize("dtype", ["bfloat16"]) - @pytest.mark.parametrize("max_tokens", [2048]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) - def test_model_with_cuda_graph( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - decoder_prompt_type: DecoderPromptType, - ) -> None: - ''' - Test the vLLM BART model for a variety of encoder/decoder input prompts, - by validating it against HuggingFace (HF) BART. - This test is same as test_models above except that for this test we - enable CUDA Graph capture and replay. - ''' - - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=False) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index b425613b303b8..a00d46ddeb007 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -81,7 +81,7 @@ def test_empty_seq_group(): "unsupported for encoder/ " "decoder models") @pytest.mark.parametrize("batch_size", BATCH_SIZES) -def test_prepare_prompt(batch_size, ): +def test_prepare_prompt(batch_size): ''' Test the ability of the encoder/decoder model runner subclass to produce prefill-phase model inputs & attention metadata. @@ -273,7 +273,7 @@ def test_prepare_prompt(batch_size, ): "unsupported for encoder/ " "decoder models") @pytest.mark.parametrize("batch_size", BATCH_SIZES) -def test_prepare_decode(batch_size, ): +def test_prepare_decode(batch_size): ''' Test the ability of the encoder/decoder model runner subclass to produce decode-phase model inputs & attention metadata. 
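The `_use_captured_graph` hunk in PATCH 22 below gates CUDA-graph replay on the encoder input length as well as the decode sequence length. As a minimal sketch of that predicate (a standalone function for illustration only, not the actual `_use_captured_graph` method; the batch-size cap attribute is taken from the diff):

```python
# Illustrative sketch: a decode batch may be replayed through a captured
# CUDA graph only if every dimension baked into the graph still bounds it.
def can_use_captured_graph(decode_only: bool,
                           enforce_eager: bool,
                           batch_size: int,
                           max_decode_seq_len: int,
                           max_encoder_seq_len: int,
                           max_batch_size_to_capture: int,
                           max_seq_len_to_capture: int) -> bool:
    return (decode_only
            and not enforce_eager
            and batch_size <= max_batch_size_to_capture
            and max_decode_seq_len <= max_seq_len_to_capture
            # Added for encoder/decoder support: the encoder input must also
            # fit within the captured graph, otherwise fall back to eager.
            and max_encoder_seq_len <= max_seq_len_to_capture)
```

Any batch that fails this check simply falls back to eager execution, which is what the updated `--max-seq-len-to-capture` help text above describes.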
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index b03c6f9c41697..f70cf2a1c6c42 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -209,6 +209,9 @@ def execute_model( if not self.is_driver_worker: return [] + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. output: SamplerOutput = self.model.sample( logits=logits, From 785dfc50680daf14a43311af2d494506431f96c6 Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 20:21:34 +0000 Subject: [PATCH 22/38] Fix format --- vllm/worker/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e4cf54584ea03..95bfc4c7a1006 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -676,7 +676,7 @@ def _use_captured_graph(self, return (self.decode_only and not self.runner.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture - and max_encoder_seq_len <= self.runner.max_seq_len_to_capture) + and max_encoder_seq_len <= self.runner.max_seq_len_to_capture and batch_size <= self.runner.max_batchsize_to_capture) def build(self) -> ModelInputForGPU: From 758c8d2c1d28e2495409d81e25eb73b20291537b Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 20:27:11 +0000 Subject: [PATCH 23/38] Add tests --- tests/encoder_decoder/__init__.py | 0 tests/encoder_decoder/test_e2e_correctness.py | 101 ++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 tests/encoder_decoder/__init__.py create mode 100644 tests/encoder_decoder/test_e2e_correctness.py diff --git a/tests/encoder_decoder/__init__.py b/tests/encoder_decoder/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py new file mode 100644 index 0000000000000..a68b032a3a9f7 --- /dev/null +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -0,0 +1,101 @@ +"""Compare the outputs of HF and vLLM for BART models using greedy sampling. + +Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. +""" +from typing import List, Optional, Tuple + +from vllm.utils import is_cpu + +if not is_cpu(): + # CPU backend is not currently supported with encoder/decoder models + # skip test definitions entirely to avoid importing GPU kernel libs + # (xFormers, etc.) 
+ + import pytest + from transformers import AutoModelForSeq2SeqLM + + from vllm.sequence import SampleLogprobs + + from ..conftest import DecoderPromptType + from ..models.utils import check_logprobs_close + + + def vllm_to_hf_output( + vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, + ): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "" + if decoder_prompt_type == DecoderPromptType.NONE: + hf_output_str = "" + hf_output_str + + return output_ids, hf_output_str, out_logprobs + + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) + @pytest.mark.parametrize("dtype", ["bfloat16"]) + @pytest.mark.parametrize("max_tokens", [128]) + @pytest.mark.parametrize("num_logprobs", [5]) + @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) + @pytest.mark.parametrize("enforce_eager", [True, False]) + def test_encoder_decoder_e2e( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, + decoder_prompt_type: DecoderPromptType, + enforce_eager: bool, + ) -> None: + ''' + E2E test for encoder decoder framework. In this test we use the BART + model to test the encoder decoder framework e2e. We compare the output + of the huggingface implementation to that of the vLLM implementation + to verify the correctness of the encoder decoder framework. + ''' + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) + + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) From 6f61f976954b53fcb2e075996f9a90705465b46a Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 20:35:26 +0000 Subject: [PATCH 24/38] fix format --- tests/conftest.py | 2 -- tests/encoder_decoder/test_e2e_correctness.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 667c251463a38..d8264f65b6149 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -703,8 +703,6 @@ def generate_encoder_decoder_greedy_logprobs( use_beam_search=False, max_tokens=max_tokens, logprobs=num_logprobs) - print('max tokens 123 ' + str(max_tokens)) - print('greedy_logprobs_params ' + str(greedy_logprobs_params)) ''' Greedy logprobs generation for vLLM encoder/decoder models ''' diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index a68b032a3a9f7..d734fcd6a2792 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ 
b/tests/encoder_decoder/test_e2e_correctness.py @@ -19,7 +19,6 @@ from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close - def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, @@ -33,7 +32,6 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) From f93ac1c685f87e7105b01723e5c7a56b44780aec Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 20:59:26 +0000 Subject: [PATCH 25/38] Add comments --- tests/encoder_decoder/test_e2e_correctness.py | 10 +++++----- vllm/worker/model_runner.py | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index d734fcd6a2792..3ccf76ebfabf8 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -1,4 +1,4 @@ -"""Compare the outputs of HF and vLLM for BART models using greedy sampling. +"""E2E tests to verify the correctness of the encoder-decoder framework Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. """ @@ -50,10 +50,10 @@ def test_encoder_decoder_e2e( enforce_eager: bool, ) -> None: ''' - E2E test for encoder decoder framework. In this test we use the BART - model to test the encoder decoder framework e2e. We compare the output - of the huggingface implementation to that of the vLLM implementation - to verify the correctness of the encoder decoder framework. + End-to-End (E2E) test for the encoder-decoder framework. + This test evaluates the encoder-decoder functionality using the BART model. + We compare the outputs of the Hugging Face and vLLM implementations + to ensure that both implementations produce consistent and correct results. ''' test_case_prompts = example_encoder_decoder_prompts[ decoder_prompt_type] diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 95bfc4c7a1006..a728fca2a790d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1617,7 +1617,10 @@ def capture( intermediate_tensors=intermediate_inputs, **kwargs, ) + # Wait for the warm up operations to finish before proceeding with + # Graph Capture. torch.cuda.synchronize() + # Capture the graph. self._graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): output_hidden_or_intermediate_states = self.model( From e408a0009a62c3bad918fe2113d9f86878692194 Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 21:22:23 +0000 Subject: [PATCH 26/38] Dummy fix --- vllm/worker/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a728fca2a790d..1063da817ef4a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1618,7 +1618,7 @@ def capture( **kwargs, ) # Wait for the warm up operations to finish before proceeding with - # Graph Capture. + # Graph Capture torch.cuda.synchronize() # Capture the graph. 
self._graph = torch.cuda.CUDAGraph() From bf7b4fcf535f6f0712e723742480061c754ba174 Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 21:23:56 +0000 Subject: [PATCH 27/38] Dummy commit --- vllm/worker/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1063da817ef4a..a728fca2a790d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1618,7 +1618,7 @@ def capture( **kwargs, ) # Wait for the warm up operations to finish before proceeding with - # Graph Capture + # Graph Capture. torch.cuda.synchronize() # Capture the graph. self._graph = torch.cuda.CUDAGraph() From a814a0b26392aebf558b558e0cc6240ad8b191d3 Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 29 Aug 2024 21:26:11 +0000 Subject: [PATCH 28/38] Fix format --- tests/encoder_decoder/test_e2e_correctness.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 3ccf76ebfabf8..8105415a9194a 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -51,9 +51,10 @@ def test_encoder_decoder_e2e( ) -> None: ''' End-to-End (E2E) test for the encoder-decoder framework. - This test evaluates the encoder-decoder functionality using the BART model. - We compare the outputs of the Hugging Face and vLLM implementations - to ensure that both implementations produce consistent and correct results. + This test evaluates the encoder-decoder functionality using the BART + model. We compare the outputs of the Hugging Face and vLLM + implementations to ensure that both implementations produce consistent + and correct results. ''' test_case_prompts = example_encoder_decoder_prompts[ decoder_prompt_type] From 4617e39a966799345dba071b091c3b22a0a3f795 Mon Sep 17 00:00:00 2001 From: sourashis Date: Sun, 1 Sep 2024 04:12:37 +0000 Subject: [PATCH 29/38] Adding back the assertion --- vllm/attention/backends/utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 306b84702abfd..089008967a244 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -326,6 +326,9 @@ def graph_capture_get_metadata_for_batch( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -341,6 +344,11 @@ def get_graph_input_buffers( "block_tables": attn_metadata.decode_metadata.block_tables, } if is_encoder_decoder_model: + # The encoder decoder model works only with XFormers backend. + # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -355,6 +363,11 @@ def prepare_graph_input_buffers( input_buffers["block_tables"].copy_( attn_metadata.decode_metadata.block_tables, non_blocking=True) if is_encoder_decoder_model: + # The encoder decoder model works only with XFormers backend. 
+ # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ + f"Expected attn_backend name to be 'xformers', but "\ + f" got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) From 2ab85b8cab2c142eee4db9d3f8dc64c9f9b6a8ac Mon Sep 17 00:00:00 2001 From: sourashis Date: Tue, 3 Sep 2024 04:30:09 +0000 Subject: [PATCH 30/38] Dummy commit --- vllm/attention/backends/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 089008967a244..49359e88d05be 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -346,6 +346,7 @@ def get_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. + assert self.runner.attn_backend.get_name() == "xformers", \ f"Expected attn_backend name to be 'xformers', but "\ f" got '{self.runner.attn_backend.get_name()}'" From 0592dc0db169fdab50898abc54c4d10bbe6b93f0 Mon Sep 17 00:00:00 2001 From: sourashis Date: Tue, 3 Sep 2024 04:30:29 +0000 Subject: [PATCH 31/38] Dummy commit --- vllm/attention/backends/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 49359e88d05be..089008967a244 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -346,7 +346,6 @@ def get_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ f"Expected attn_backend name to be 'xformers', but "\ f" got '{self.runner.attn_backend.get_name()}'" From 12f831249acfe27d81035c7e0c316059360d4efa Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 5 Sep 2024 06:03:12 +0000 Subject: [PATCH 32/38] Dummy commit --- vllm/attention/backends/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 089008967a244..4facc310a35ee 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -343,6 +343,7 @@ def get_graph_input_buffers( "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, "block_tables": attn_metadata.decode_metadata.block_tables, } + if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. From 0ea44793605e2ba50f555fe48e07a9c7f4b9797f Mon Sep 17 00:00:00 2001 From: sourashis Date: Thu, 5 Sep 2024 06:06:36 +0000 Subject: [PATCH 33/38] Dummy commit --- vllm/attention/backends/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 4facc310a35ee..089008967a244 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -343,7 +343,6 @@ def get_graph_input_buffers( "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, "block_tables": attn_metadata.decode_metadata.block_tables, } - if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. 
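The dummy commits around this point (PATCHes 30 through 37) only toggle a blank line inside `get_graph_input_buffers`, the helper whose XFormers-only assertion PATCH 29 reinstates. As a readability aid, here is a condensed sketch of that helper with the assertion in place; it paraphrases the diff context above and is not the verbatim vLLM source:

```python
def get_graph_input_buffers(self, attn_metadata,
                            is_encoder_decoder_model: bool = False):
    # Buffers recorded at capture time; they are refilled with the live
    # decode metadata before every graph replay.
    input_buffers = {
        "slot_mapping": attn_metadata.slot_mapping,
        "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
        "block_tables": attn_metadata.decode_metadata.block_tables,
    }
    if is_encoder_decoder_model:
        # The encoder/decoder path is only wired up for the XFormers
        # backend, so fail loudly if any other backend slips through.
        assert self.runner.attn_backend.get_name() == "xformers", \
            f"Expected attn_backend name to be 'xformers', but " \
            f"got '{self.runner.attn_backend.get_name()}'"
        self._add_additonal_input_buffers_for_enc_dec_model(
            attn_metadata=attn_metadata, input_buffers=input_buffers)
    return input_buffers
```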
From 74177512a1f84cab9e8cd3ecd81268b464e30352 Mon Sep 17 00:00:00 2001
From: sourashis
Date: Mon, 9 Sep 2024 18:19:08 +0000
Subject: [PATCH 34/38] Dummy

---
 vllm/attention/backends/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 089008967a244..4facc310a35ee 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -343,6 +343,7 @@ def get_graph_input_buffers(
             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
+
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.

From 12c4af4f0b7b6fc6bdeefff8dbcfaddf57f88d20 Mon Sep 17 00:00:00 2001
From: sourashis
Date: Mon, 9 Sep 2024 18:19:30 +0000
Subject: [PATCH 35/38] Dummy

---
 vllm/attention/backends/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 4facc310a35ee..089008967a244 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -343,7 +343,6 @@ def get_graph_input_buffers(
             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
-
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.

From 7cd2b43bc5f4cb751e752ba79c9d606b82aaba7e Mon Sep 17 00:00:00 2001
From: sourashis
Date: Mon, 9 Sep 2024 19:01:20 +0000
Subject: [PATCH 36/38] Dummy Commit to rerun tests

---
 vllm/attention/backends/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 089008967a244..4facc310a35ee 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -343,6 +343,7 @@ def get_graph_input_buffers(
             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
+
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.

From e4338ce1d699a5c0569968e46fef945f621ebd80 Mon Sep 17 00:00:00 2001
From: sourashis
Date: Mon, 9 Sep 2024 19:01:31 +0000
Subject: [PATCH 37/38] Dummy Commit to rerun tests

---
 vllm/attention/backends/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 4facc310a35ee..089008967a244 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -343,7 +343,6 @@ def get_graph_input_buffers(
             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
-
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
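PATCH 38 below replaces the module-level `if not is_cpu():` guard in the e2e test with a `pytest.mark.skipif` marker, so the file can be collected on every platform and is skipped only on the CPU backend. A trimmed sketch of that pattern (the real test keeps the full parametrization over model, dtype, prompts and logprobs; the ellipsis stands in for its body):

```python
import pytest

from vllm.utils import is_cpu


@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
    is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(enforce_eager: bool) -> None:
    # enforce_eager=False exercises the newly enabled CUDA-graph decode
    # path; enforce_eager=True keeps the eager fallback covered.
    ...
```

With the CI step added in PATCH 21, this suite runs via `pytest -v -s encoder_decoder`.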
From 3fb360cc300061000b94fa638d32957d5108fb3a Mon Sep 17 00:00:00 2001 From: sourashis Date: Tue, 17 Sep 2024 06:32:40 +0000 Subject: [PATCH 38/38] Address comments --- tests/encoder_decoder/test_e2e_correctness.py | 162 +++++++++--------- vllm/worker/model_runner.py | 1 - 2 files changed, 80 insertions(+), 83 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 8105415a9194a..9324a737a779c 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -4,97 +4,95 @@ """ from typing import List, Optional, Tuple -from vllm.utils import is_cpu +import pytest +from transformers import AutoModelForSeq2SeqLM -if not is_cpu(): - # CPU backend is not currently supported with encoder/decoder models - # skip test definitions entirely to avoid importing GPU kernel libs - # (xFormers, etc.) +from vllm.sequence import SampleLogprobs +from vllm.utils import is_cpu - import pytest - from transformers import AutoModelForSeq2SeqLM +from ..conftest import DecoderPromptType +from ..models.utils import check_logprobs_close - from vllm.sequence import SampleLogprobs - from ..conftest import DecoderPromptType - from ..models.utils import check_logprobs_close +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output - def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], - decoder_prompt_type: DecoderPromptType, - ): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output + hf_output_str = output_str + "" + if decoder_prompt_type == DecoderPromptType.NONE: + hf_output_str = "" + hf_output_str - hf_output_str = output_str + "" - if decoder_prompt_type == DecoderPromptType.NONE: - hf_output_str = "" + hf_output_str + return output_ids, hf_output_str, out_logprobs - return output_ids, hf_output_str, out_logprobs - @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) - @pytest.mark.parametrize("dtype", ["bfloat16"]) - @pytest.mark.parametrize("max_tokens", [128]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) - @pytest.mark.parametrize("enforce_eager", [True, False]) - def test_encoder_decoder_e2e( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - decoder_prompt_type: DecoderPromptType, - enforce_eager: bool, - ) -> None: - ''' - End-to-End (E2E) test for the encoder-decoder framework. - This test evaluates the encoder-decoder functionality using the BART - model. We compare the outputs of the Hugging Face and vLLM - implementations to ensure that both implementations produce consistent - and correct results. 
- ''' - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] +@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +@pytest.mark.parametrize("enforce_eager", [True, False]) +@pytest.mark.skipif( + is_cpu(), + reason="CPU backend is not currently supported with encoder/decoder models" +) +def test_encoder_decoder_e2e( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, + decoder_prompt_type: DecoderPromptType, + enforce_eager: bool, +) -> None: + ''' + End-to-End (E2E) test for the encoder-decoder framework. + This test evaluates the encoder-decoder functionality using the BART + model. We compare the outputs of the Hugging Face and vLLM + implementations to ensure that both implementations produce consistent + and correct results. + ''' + test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) + hf_skip_tokens = (1 + if decoder_prompt_type == DecoderPromptType.NONE else 0) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6f90dd5ce767b..e8c472df8b5fc 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -800,7 +800,6 @@ def build(self) -> ModelInputForGPU: batch_size, max_decode_seq_len, max_encoder_seq_len=max_encoder_seq_len) - #use_captured_graph = True # If cuda graph can be used, pad tensors accordingly. 
# See `capture_model` API for more details.
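Across the series, the `capture()` hunks follow the usual PyTorch CUDA-graph recipe: warm the model up, synchronize, then record. A stripped-down sketch of that flow with a generic `model` callable — illustrative only, not vLLM's actual graph-runner class:

```python
import torch


def capture_decode_graph(model, static_inputs: dict, memory_pool=None):
    # Warm-up pass: run the model once so lazy allocations and autotuning
    # happen before anything is recorded into the graph.
    model(**static_inputs)
    # Wait for the warm up operations to finish before proceeding with
    # graph capture.
    torch.cuda.synchronize()

    # Capture the graph. On replay, only the *contents* of the static input
    # tensors may change; their shapes and addresses are frozen.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=memory_pool):
        static_outputs = model(**static_inputs)
    torch.cuda.synchronize()
    return graph, static_outputs
```

Replaying `graph` then amounts to copying fresh decode metadata into the static input buffers (as `prepare_graph_input_buffers` does above) and calling `graph.replay()`.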