Merged
31 commits
6c9799a  Move (WoosukKwon, May 6, 2023)
bacf49c  http_frontend -> frontend (WoosukKwon, May 6, 2023)
a16c1f3  Move (WoosukKwon, May 6, 2023)
9b868d1  Move controller (WoosukKwon, May 6, 2023)
b2ff569  Minor (WoosukKwon, May 7, 2023)
38b946b  Fix import errors (WoosukKwon, May 7, 2023)
62c7175  Move controller back to worker (WoosukKwon, May 7, 2023)
26aafdc  Rename (WoosukKwon, May 7, 2023)
8419df6  mv (WoosukKwon, May 7, 2023)
d9520b4  Add __init__.py (WoosukKwon, May 7, 2023)
61bf05f  Minor (WoosukKwon, May 7, 2023)
799ce53  Move set_random_seeds (WoosukKwon, May 7, 2023)
b6f6d4c  Fix imports (WoosukKwon, May 7, 2023)
a25b37d  Extract out initialize_dummy_weights (WoosukKwon, May 7, 2023)
e2ea5cc  Minor (WoosukKwon, May 7, 2023)
de47b95  Minor (WoosukKwon, May 7, 2023)
724dc90  Fix import errors on parallel utils (WoosukKwon, May 7, 2023)
fd2647f  Add __init__.py (WoosukKwon, May 7, 2023)
7755a7a  Fix parallel_utils (WoosukKwon, May 7, 2023)
a95bd42  Minor (WoosukKwon, May 7, 2023)
4dc8e9e  Fix weight loading (WoosukKwon, May 7, 2023)
e6ffa80  Annotate types (WoosukKwon, May 7, 2023)
da591aa  Fix type (WoosukKwon, May 7, 2023)
0ae70da  sample -> sampler (WoosukKwon, May 7, 2023)
f1d2700  Minor (WoosukKwon, May 7, 2023)
338b2f4  Merge branch 'main' into refactor-arch (WoosukKwon, May 7, 2023)
3842987  Do not use fast llama tokenizer (WoosukKwon, May 8, 2023)
acb2855  Merge branch 'main' into tokenizer (WoosukKwon, May 9, 2023)
be3f6c7  Fix merge errors (WoosukKwon, May 9, 2023)
729e14b  Add a tracking issue in comment (WoosukKwon, May 9, 2023)
9fe9fbf  Minor refactoring (WoosukKwon, May 9, 2023)
4 changes: 2 additions & 2 deletions cacheflow/frontend/fastapi_frontend.py
@@ -7,12 +7,12 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 import ray
-from transformers import AutoTokenizer
 import uvicorn
 
 from cacheflow.core.server import (Server, add_server_arguments,
                                    process_server_arguments,
                                    initialize_cluster)
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
@@ -44,7 +44,7 @@ def __init__(
     ):
         self.block_size = block_size
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.tokenizer = get_tokenizer(model)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
         if server_use_ray:
5 changes: 2 additions & 3 deletions cacheflow/frontend/simple_frontend.py
@@ -1,8 +1,7 @@
 import time
 from typing import List, Optional, Tuple
 
-from transformers import AutoTokenizer
-
+from cacheflow.frontend.utils import get_tokenizer
 from cacheflow.logger import init_logger
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
@@ -21,7 +20,7 @@ def __init__(
     ) -> None:
         self.block_size = block_size
 
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer = get_tokenizer(model_name)
        self.seq_group_counter = Counter()
        self.seq_counter = Counter()
        self.inputs: List[Tuple[SequenceGroup, SamplingParams]] = []
22 changes: 22 additions & 0 deletions cacheflow/frontend/utils.py
@@ -0,0 +1,22 @@
+from typing import Union
+
+from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+
+
+_MODEL_TYPES_WITH_SLOW_TOKENIZER = [
+    # LLaMA fast tokenizer has a bug related to protobuf.
+    # See https://github.com/WoosukKwon/cacheflow/issues/80#issue-1698550554
+    "llama",
+]
+
+
+def get_tokenizer(
+    model_name: str,
+    *args,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    config = AutoConfig.from_pretrained(model_name)
+    if config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
+        kwargs["use_fast"] = False
+    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
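
For illustration, a minimal usage sketch of the new helper (not part of the diff); the model names below are placeholders for whatever checkpoints are available:

from cacheflow.frontend.utils import get_tokenizer

# Illustrative sketch, not part of this PR; model names are placeholders.
# Non-LLaMA models keep AutoTokenizer's default behavior, which usually
# returns the fast (Rust-based) tokenizer when one is available.
opt_tokenizer = get_tokenizer("facebook/opt-125m")
print(type(opt_tokenizer).__name__)

# For model_type == "llama", use_fast is forced to False, so the slow
# tokenizer is returned and the protobuf-related bug in the fast LLaMA
# tokenizer (issue #80) is avoided.
llama_tokenizer = get_tokenizer("huggyllama/llama-7b")
print(type(llama_tokenizer).__name__)
print(llama_tokenizer.encode("Hello, world!"))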