Skip to content

Commit

Permalink
[Bugfix][Hardware][AMD][Frontend] add quantization param to embedding checking method (vllm-project#7513)
Browse files Browse the repository at this point in the history
  • Loading branch information
gongdao123 authored and kylesayrs committed Aug 17, 2024
1 parent ee42d08 commit dcc1df3
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,13 @@
_running_tasks: Set[asyncio.Task] = set()


def model_is_embedding(model_name: str, trust_remote_code: bool) -> bool:
def model_is_embedding(model_name: str, trust_remote_code: bool,
quantization: str) -> bool:
return ModelConfig(model=model_name,
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=trust_remote_code,
quantization=quantization,
seed=0,
dtype="auto").embedding_mode

Expand Down Expand Up @@ -97,7 +99,8 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:

# If manually triggered or embedding model, use AsyncLLMEngine in process.
# TODO: support embedding model via RPC.
if (model_is_embedding(args.model, args.trust_remote_code)
if (model_is_embedding(args.model, args.trust_remote_code,
args.quantization)
or args.disable_frontend_multiprocessing):
async_engine_client = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
Expand Down

0 comments on commit dcc1df3

Please sign in to comment.