tests/conftest.py (1 addition, 15 deletions)

```diff
@@ -23,7 +23,7 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
+from vllm.config import TaskOption, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -1010,20 +1010,6 @@ def vllm_runner():
     return VllmRunner


-def get_tokenizer_pool_config(tokenizer_group_type):
-    if tokenizer_group_type is None:
-        return None
-    if tokenizer_group_type == "ray":
-        return TokenizerPoolConfig(pool_size=1,
-                                   pool_type="ray",
-                                   extra_config={})
-    if isinstance(tokenizer_group_type, type):
-        return TokenizerPoolConfig(pool_size=1,
-                                   pool_type=tokenizer_group_type,
-                                   extra_config={})
-    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
-
-
 @pytest.fixture()
 def temporary_enable_log_propagate():
     import logging
```
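The removed helper only existed to map a test parameter onto a `TokenizerPoolConfig` for the now-deleted pool backends. With the pool gone, tests build the group directly. A minimal sketch of the replacement pattern, using the constructor arguments that appear in these tests; the `encode` keywords are inferred from the test code and may differ in detail:

```python
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

# Direct construction replaces
# get_tokenizer_group(get_tokenizer_pool_config(...), ...).
tokenizer_group = TokenizerGroup(
    tokenizer_id="gpt2",     # any HF repo id or local path
    enable_lora=False,       # no per-adapter tokenizers needed here
    max_num_seqs=1,
    max_input_length=None,   # None disables prompt-length checking
)
token_ids = tokenizer_group.encode(prompt="Hello world", lora_request=None)
```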
tests/lora/test_tokenizer_group.py (3 additions, 7 deletions)

```diff
@@ -5,17 +5,14 @@

 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import get_lora_tokenizer
-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
-
-from ..conftest import get_tokenizer_pool_config
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup


 @pytest.mark.asyncio
 @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
 async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
     reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
-    tokenizer_group = get_tokenizer_group(
-        get_tokenizer_pool_config(tokenizer_group_type),
+    tokenizer_group = TokenizerGroup(
         tokenizer_id="gpt2",
         enable_lora=True,
         max_num_seqs=1,
@@ -60,8 +57,7 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path):
 @pytest.mark.parametrize("max_num_seqs", [1, 2])
 @pytest.mark.parametrize("max_loras", [1, 2])
 def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras):
-    tokenizer_group = get_tokenizer_group(
-        get_tokenizer_pool_config(None),
+    tokenizer_group = TokenizerGroup(
         tokenizer_id="gpt2",
         enable_lora=enable_lora,
         max_num_seqs=max_num_seqs,
```
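For reference, the per-adapter path these tests exercise: with `enable_lora=True` the group resolves a tokenizer per `LoRARequest` and falls back to the base tokenizer for `None`. A hedged sketch of that behavior; the adapter path below is a placeholder (the real tests use the `sql_lora_files` fixture), and the `LoRARequest` positional fields follow the public vLLM API:

```python
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

tokenizer_group = TokenizerGroup(
    tokenizer_id="gpt2",
    enable_lora=True,        # opt in to per-adapter tokenizers
    max_num_seqs=1,
    max_input_length=None,
)
# Hypothetical adapter location, for illustration only.
lora_request = LoRARequest("sql_lora", 1, "/path/to/sql_lora_adapter")
lora_tok = tokenizer_group.get_lora_tokenizer(lora_request)  # adapter tokenizer
base_tok = tokenizer_group.get_lora_tokenizer(None)          # base tokenizer
```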
tests/tokenization/test_detokenize.py (2 additions, 7 deletions)

```diff
@@ -10,7 +10,7 @@
 from vllm.inputs import token_inputs
 from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
 from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
@@ -212,7 +212,7 @@ def test_oov_decode(tokenizer, fast):

 @pytest.fixture
 def detokenizer(tokenizer_name: str) -> Detokenizer:
-    init_kwargs = dict(
+    tokenizer_group = TokenizerGroup(
         tokenizer_id=tokenizer_name,
         enable_lora=False,
         max_num_seqs=100,
@@ -222,11 +222,6 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
         revision=None,
     )

-    tokenizer_group = get_tokenizer_group(
-        None,
-        **init_kwargs,
-    )
-
     return Detokenizer(tokenizer_group)


```
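The fixture no longer stages its arguments in an `init_kwargs` dict; it builds the group inline and hands it straight to `Detokenizer`. A condensed sketch of that wiring, with "gpt2" standing in for the parametrized `tokenizer_name`, and assuming (as the fixture implies) that extra keywords such as `revision` are forwarded to the underlying tokenizer loader; the lines collapsed between the hunks pass further tokenizer options not shown here:

```python
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

tokenizer_group = TokenizerGroup(
    tokenizer_id="gpt2",    # stand-in for the parametrized tokenizer_name
    enable_lora=False,
    max_num_seqs=100,
    max_input_length=None,
    revision=None,          # forwarded to the tokenizer loader
)
detokenizer = Detokenizer(tokenizer_group)
```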
tests/tokenization/test_tokenizer_group.py (3 additions, 184 deletions)

```diff
@@ -1,40 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0

-import asyncio
-import os
-import sys
-from typing import Optional
-from unittest.mock import patch
-
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from vllm.transformers_utils.tokenizer_group import (TokenizerGroup,
-                                                     get_tokenizer_group)
-from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
-    RayTokenizerGroupPool)
-
-from ..conftest import get_tokenizer_pool_config
-
-
-class CustomTokenizerGroup(TokenizerGroup):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._i = 0
-
-    def encode(self, *args, **kwargs):
-        self._i += 1
-        return super().encode(*args, **kwargs)
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup


 @pytest.mark.asyncio
-@pytest.mark.parametrize("tokenizer_group_type",
-                         [None, "ray", CustomTokenizerGroup])
-async def test_tokenizer_group(tokenizer_group_type):
+async def test_tokenizer_group():
     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    tokenizer_group = get_tokenizer_group(
-        get_tokenizer_pool_config(tokenizer_group_type),
+    tokenizer_group = TokenizerGroup(
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
@@ -49,159 +24,3 @@ async def test_tokenizer_group(tokenizer_group_type):
                       PreTrainedTokenizerBase)
     assert tokenizer_group.get_lora_tokenizer(
         None) == await tokenizer_group.get_lora_tokenizer_async(None)
-    if tokenizer_group_type is CustomTokenizerGroup:
-        assert tokenizer_group._i > 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
-async def test_tokenizer_group_pool(tokenizer_group_type):
-    reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    tokenizer_group_pool = get_tokenizer_group(
-        get_tokenizer_pool_config(tokenizer_group_type),
-        tokenizer_id="gpt2",
-        enable_lora=False,
-        max_num_seqs=1,
-        max_input_length=None,
-    )
-    # Send multiple requests to the tokenizer group pool
-    # (more than the pool size)
-    # and check that all requests are processed correctly.
-    num_requests = tokenizer_group_pool.pool_size * 5
-    requests = [
-        tokenizer_group_pool.encode_async(prompt=f"prompt {i}",
-                                          lora_request=None)
-        for i in range(num_requests)
-    ]
-    results = await asyncio.gather(*requests)
-    expected_results = [
-        reference_tokenizer.encode(f"prompt {i}") for i in range(num_requests)
-    ]
-    assert results == expected_results
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
-async def test_tokenizer_group_ray_pool_env_var_propagation(
-        tokenizer_group_type):
-    """Test that env vars from caller process are propagated to
-    tokenizer Ray actors."""
-    env_var = "MY_ENV_VAR"
-
-    class EnvVarCheckerTokenizerGroup(TokenizerGroup):
-
-        def ping(self):
-            assert os.environ.get(env_var) == "1"
-            return super().ping()
-
-    class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
-        _worker_cls = EnvVarCheckerTokenizerGroup
-
-    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
-    tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
-        tokenizer_pool_config,
-        tokenizer_id="gpt2",
-        enable_lora=False,
-        max_num_seqs=1,
-        max_input_length=None)
-    with pytest.raises(AssertionError):
-        tokenizer_pool.ping()
-
-    with patch.dict(os.environ, {env_var: "1"}):
-        tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
-        tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
-            tokenizer_pool_config,
-            tokenizer_id="gpt2",
-            enable_lora=False,
-            max_num_seqs=1,
-            max_input_length=None)
-        tokenizer_pool.ping()
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
-async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
-    """Test that Ray tokenizer pool group can recover from failures and
-    if that's not possible, mark itself as unhealthy."""
-
-    class FailingTokenizerGroup(TokenizerGroup):
-
-        def __init__(self,
-                     *args,
-                     fail_at: Optional[list[int]] = None,
-                     **kwargs):
-            super().__init__(*args, **kwargs)
-            self.i = 0
-            self.fail_at = fail_at or []
-
-        def encode(self, *args, **kwargs):
-            self.i += 1
-            if self.i in self.fail_at:
-                sys.exit(1)
-            return super().encode(*args, **kwargs)
-
-    class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
-        _worker_cls = FailingTokenizerGroup
-
-    # Fail at first iteration
-    fail_at = [1]
-    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
-    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
-        tokenizer_pool_config,
-        tokenizer_id="gpt2",
-        enable_lora=False,
-        max_num_seqs=1,
-        max_input_length=None,
-        fail_at=fail_at)
-    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
-
-    # Modify fail at to not fail at all (will be re-read when actor is
-    # re-initialized).
-    fail_at[0] = 1000
-
-    # We should recover successfully.
-    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
-    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
-
-    # Check that we have a new actor
-    assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
-    assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors
-
-    # Fail at first iteration
-    fail_at = [1]
-    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
-        tokenizer_pool_config,
-        tokenizer_id="gpt2",
-        enable_lora=False,
-        max_num_seqs=1,
-        max_input_length=None,
-        fail_at=fail_at)
-
-    # We should fail after re-initialization.
-    with pytest.raises(RuntimeError):
-        await tokenizer_group_pool.encode_async(prompt="prompt",
-                                                lora_request=None)
-
-    # check_health should raise the same thing
-    with pytest.raises(RuntimeError):
-        tokenizer_group_pool.check_health()
-
-    # Ensure that non-ActorDiedErrors are still propagated correctly and do not
-    # cause a re-initialization.
-    fail_at = []
-    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
-        tokenizer_pool_config,
-        tokenizer_id="gpt2",
-        enable_lora=False,
-        max_num_seqs=1,
-        max_input_length=2,
-        fail_at=fail_at)
-    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
-
-    # Prompt too long error
-    with pytest.raises(ValueError):
-        await tokenizer_group_pool.encode_async(prompt="prompt" * 100,
-                                                lora_request=None)
-    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
-    # Actors should stay the same.
-    assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
```
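Only a single test of the plain group survives; the pool fan-out, env-var propagation, and fault-tolerance tests disappear along with the Ray backend they covered. A standalone sketch of the contract the remaining test checks, under the `encode`/`encode_async` signatures used in the test code (the `main` wrapper is added here for illustration):

```python
import asyncio

from transformers import AutoTokenizer, PreTrainedTokenizerBase

from vllm.transformers_utils.tokenizer_group import TokenizerGroup


async def main() -> None:
    reference = AutoTokenizer.from_pretrained("gpt2")
    group = TokenizerGroup(tokenizer_id="gpt2", enable_lora=False,
                           max_num_seqs=1, max_input_length=None)
    prompt = "prompt 0"
    # Sync and async encode paths should both agree with the HF tokenizer.
    assert group.encode(prompt=prompt, lora_request=None) == \
        reference.encode(prompt)
    assert await group.encode_async(prompt=prompt, lora_request=None) == \
        reference.encode(prompt)
    # Without LoRA, both lookups resolve to the same base tokenizer.
    assert isinstance(group.get_lora_tokenizer(None), PreTrainedTokenizerBase)
    assert group.get_lora_tokenizer(None) == \
        await group.get_lora_tokenizer_async(None)


asyncio.run(main())
```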
tests/v1/engine/conftest.py (1 addition, 1 deletion)

```diff
@@ -47,7 +47,7 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
         tokenizer=tokenizer,
         tokenizer_group=init_tokenizer_from_configs(
             vllm_config.model_config, vllm_config.scheduler_config,
-            vllm_config.parallel_config, vllm_config.lora_config),
+            vllm_config.lora_config),
         vllm_config=vllm_config,
         full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS],
         prompt_tokens=prompt_tokens,
```
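`init_tokenizer_from_configs` drops its `parallel_config` parameter here, which the tokenizer pool was presumably the only consumer of. A hedged sketch of the updated call, assuming the helper remains importable from `vllm.transformers_utils.tokenizer_group` and that the config object exposes the attributes used above:

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs

vllm_config = EngineArgs(model="gpt2").create_engine_config()
tokenizer_group = init_tokenizer_from_configs(
    vllm_config.model_config,
    vllm_config.scheduler_config,
    vllm_config.lora_config,  # parallel_config is no longer passed
)
```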
tests/v1/engine/utils.py (2 additions, 3 deletions)

```diff
@@ -8,8 +8,7 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

 from vllm.engine.arg_utils import EngineArgs
-from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
-    BaseTokenizerGroup)
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, FinishReason
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors

@@ -296,7 +295,7 @@ def generate_dummy_prompt_logprobs_tensors(
 class DummyOutputProcessorTestVectors:
     """Dummy test vectors for output processor tests"""
     tokenizer: GeneralTokenizerType
-    tokenizer_group: BaseTokenizerGroup
+    tokenizer_group: TokenizerGroup
     vllm_config: EngineArgs
     full_tokens: list[list[int]]  # Prompt + generated tokens
     prompt_tokens: list[list[int]]
```