 
 AudioTuple = Tuple[np.ndarray, int]
 
+VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
+HF_PLACEHOLDER = "<|audio|>"
+
 
 @pytest.fixture(scope="session")
-def audio_and_sample_rate():
+def audio_assets():
     from vllm.assets.audio import AudioAsset
-    return AudioAsset("mary_had_lamb").audio_and_sample_rate
+    return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
 
 
-@pytest.fixture
-def prompts_and_audios(audio_and_sample_rate):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
+def audio(request):
+    from vllm.assets.audio import AudioAsset
+    return AudioAsset(request.param)
 
-    vllm_placeholder = "<|reserved_special_token_0|>"
-    hf_placeholder = "<|audio|>"
 
-    question = "What's in the audio?"
-    vllm_prompt = tokenizer.apply_chat_template(
-        [{
-            'role': 'user',
-            'content': f"{vllm_placeholder}\n{question}"
-        }],
-        tokenize=False,
-        add_generation_prompt=True)
-    hf_prompt = tokenizer.apply_chat_template(
-        [{
-            'role': 'user',
-            'content': f"{hf_placeholder}\n{question}"
-        }],
-        tokenize=False,
-        add_generation_prompt=True)
+def _get_prompt(audio_count, question, placeholder):
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    placeholder = f"{placeholder}\n" * audio_count
 
-    return [(vllm_prompt, hf_prompt, audio_and_sample_rate)]
+    return tokenizer.apply_chat_template([{
+        'role': 'user',
+        'content': f"{placeholder}{question}"
+    }],
+                                         tokenize=False,
+                                         add_generation_prompt=True)
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -134,15 +129,71 @@ def process(hf_inputs: BatchEncoding):
     )
 
 
+def run_multi_audio_test(
+    vllm_runner: Type[VllmRunner],
+    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True,
+                     limit_mm_per_prompt={
+                         "audio":
+                         max((len(audio) for _, audio in prompts_and_audios))
+                     }) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            [prompt for prompt, _ in prompts_and_audios],
+            max_tokens,
+            num_logprobs=num_logprobs,
+            audios=[audios for _, audios in prompts_and_audios])
+
+    # The HuggingFace model doesn't support multiple audios yet, so
+    # just assert that some tokens were generated.
+    assert all(tokens for tokens, *_ in vllm_outputs)
+
+
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str,
-                max_tokens: int, num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
+
+    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
+    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
     run_test(
         hf_runner,
         vllm_runner,
-        prompts_and_audios,
+        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
+        MODEL_NAME,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
+                                     max_tokens: int,
+                                     num_logprobs: int) -> None:
+
+    vllm_prompt = _get_prompt(len(audio_assets),
+                              "Describe each of the audios above.",
+                              VLLM_PLACEHOLDER)
+    run_multi_audio_test(
+        vllm_runner,
+        [(vllm_prompt, [audio.audio_and_sample_rate
+                        for audio in audio_assets])],
         MODEL_NAME,
         dtype=dtype,
         max_tokens=max_tokens,
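
A minimal, self-contained sketch (hypothetical helper name, not part of the diff above) of the prompt construction that the new _get_prompt helper performs: one audio placeholder per clip, each on its own line, followed by the question, before the string is wrapped by the tokenizer's chat template via apply_chat_template(..., tokenize=False, add_generation_prompt=True).

# Hypothetical illustration of the placeholder repetition in _get_prompt above.
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"


def build_user_content(audio_count: int, question: str,
                       placeholder: str = VLLM_PLACEHOLDER) -> str:
    # One placeholder per audio clip, each on its own line, then the question.
    return f"{placeholder}\n" * audio_count + question


print(build_user_content(2, "Describe each of the audios above."))
# <|reserved_special_token_0|>
# <|reserved_special_token_0|>
# Describe each of the audios above.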