2 changes: 1 addition & 1 deletion docs/features/README.md
@@ -52,7 +52,7 @@ th:not(:first-child) {
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
-| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
+| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | | | ❌ | | | ✅ |

\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
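The prompt-embeds feature tracked by this row is exercised end to end by the tests below. For orientation, a minimal client-side sketch (not part of this diff; it assumes a server started with `--enable-prompt-embeds` and follows the base64-over-`extra_body` encoding described in vLLM's prompt-embeds docs):

```python
import base64
import io

import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer

# Build input embeddings for a prompt with the base model's embedding layer.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
token_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
prompt_embeds = model.get_input_embeddings()(token_ids).squeeze(0)

# Serialize the tensor and base64-encode it for transport in the JSON body.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="",  # ignored when prompt_embeds is supplied
    max_tokens=5,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```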
2 changes: 1 addition & 1 deletion docs/models/supported_models.md
@@ -404,7 +404,7 @@ th {
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
-| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
+| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
8 changes: 8 additions & 0 deletions tests/entrypoints/conftest.py
@@ -208,3 +208,11 @@ def zephyr_lora_files():
"""Download zephyr LoRA files once per test session."""
from huggingface_hub import snapshot_download
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")


@pytest.fixture(scope="session")
def opt125_lora_files() -> str:
"""Download opt-125m LoRA files once per test session."""
from huggingface_hub import snapshot_download
return snapshot_download(
repo_id="peft-internal-testing/opt-125m-dummy-lora")
34 changes: 28 additions & 6 deletions tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -3,6 +3,7 @@

import base64
import io
+import json

import openai # use the official client for correctness check
import pytest
@@ -16,13 +17,15 @@

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
+LORA_SERVING_MODEL_NAME = "opt125m-lora"

CONFIG = AutoConfig.from_pretrained(MODEL_NAME)


-@pytest.fixture(scope="module")
-def default_server_args() -> list[str]:
-    return [
+@pytest.fixture(scope="module", params=["use-lora"])
+def default_server_args(request: pytest.FixtureRequest,
+                        opt125_lora_files: str) -> list[str]:
+    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
@@ -35,6 +38,25 @@ def default_server_args() -> list[str]:
        "--enable-prompt-embeds",
    ]

+    if request.param == "use-lora":
+        lora_module_1 = {
+            "name": LORA_SERVING_MODEL_NAME,
+            "path": opt125_lora_files,
+            "base_model_name": MODEL_NAME
+        }
+
+        args.extend([
+            "--enable-lora",
+            "--lora-module",
+            json.dumps(lora_module_1),
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+        ])
+
+    return args


EXAMPLE_PROMPTS = [
"Hello, my name is",
@@ -74,7 +96,7 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_prompt_embeds(
example_prompt_embeds,
client_with_prompt_embeds: openai.AsyncOpenAI,
@@ -179,7 +201,7 @@ async def test_completions_with_prompt_embeds(


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_errors_with_prompt_embeds(
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
# Test error case: invalid prompt_embeds
@@ -194,7 +216,7 @@ async def test_completions_errors_with_prompt_embeds(

@pytest.mark.asyncio
@pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds(
example_prompt_embeds,
client_with_prompt_embeds: openai.AsyncOpenAI,
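Net effect of the parametrization changes above: every prompt-embeds test now runs once against the base model and once against the served LoRA adapter, with an identical request shape in both cases; only the model name changes. Condensed (a hedged sketch of what the tests exercise, reusing `encoded_embeds` from the README example earlier):

```python
# Inside one of the parametrized async tests above, with model_name set to
# either MODEL_NAME or LORA_SERVING_MODEL_NAME ("opt125m-lora"):
completion = await client_with_prompt_embeds.completions.create(
    model=model_name,
    prompt="",  # ignored in favor of the embeddings
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_embeds": encoded_embeds},
)
assert len(completion.choices[0].text) >= 1
```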
5 changes: 2 additions & 3 deletions vllm/model_executor/models/opt.py
@@ -43,7 +43,7 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
@@ -352,10 +352,9 @@ def load_weights(self, weights: Iterable[tuple[str,
        return loaded_params


-class OPTForCausalLM(nn.Module, SupportsPP):
+class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"]
    }

    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
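`SupportsLoRA` is the interface marker that lets the engine accept adapters for this architecture, and it is what flips the LoRA column for `OPTForCausalLM` in `supported_models.md` above; the pre-existing `packed_modules_mapping` already describes how per-projection weights fold into the fused `qkv_proj` module, which the LoRA layers reuse. A quick sanity check of the markers (a minimal sketch, assuming the `supports_lora`/`supports_pp` helpers in `interfaces.py` are importable as shown):

```python
from vllm.model_executor.models.interfaces import supports_lora, supports_pp
from vllm.model_executor.models.opt import OPTForCausalLM

# Both interface checks should now pass for OPT.
assert supports_lora(OPTForCausalLM)
assert supports_pp(OPTForCausalLM)
```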