10 changes: 6 additions & 4 deletions .buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -63,10 +63,12 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
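For readability, a hedged reconstruction of the updated benchmark entry after this change, written as an equivalent Python dict; only the fields visible in the hunk above are included, so the real entry may contain more.

```python
# Reconstruction from the visible diff context; not an authoritative copy
# of the full serving-tests.json entry.
server_parameters = {
    "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "disable_log_requests": "",
    "tensor_parallel_size": 4,
    "swap_space": 16,
    "speculative_config": {
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1,
    },
}
```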
2 changes: 1 addition & 1 deletion docs/source/features/spec_decode.md
@@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model
```

:::{warning}
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model with `--speculative_model` and passing related parameters (e.g., `--num_speculative_tokens`) separately is now deprecated.
:::
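As a minimal sketch (not part of this diff), the consolidated flag takes a single JSON object; the draft model and token count below are placeholders borrowed from the benchmark entry above, and `shlex.quote` is used only to show how the value could be passed safely on a shell command line.

```python
import json
import shlex

# Hedged sketch: build the JSON value expected by --speculative_config.
spec_config = {
    "model": "turboderp/Qwama-0.5B-Instruct",
    "num_speculative_tokens": 4,
}
print("--speculative_config " + shlex.quote(json.dumps(spec_config)))
# --speculative_config '{"model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4}'
```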

Then use a client:
10 changes: 6 additions & 4 deletions examples/offline_inference/eagle.py
@@ -69,10 +69,12 @@
max_model_len=max_model_len,
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_model=eagle_dir,
num_speculative_tokens=args.num_spec_tokens,
speculative_draft_tensor_parallel_size=args.draft_tp,
speculative_max_model_len=max_model_len,
speculative_config={
"model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens,
"draft_tensor_parallel_size": args.draft_tp,
"max_model_len": max_model_len,
},
disable_log_stats=False,
)

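To complement the EAGLE example above, a minimal offline sketch of the consolidated API under stated assumptions: the target model name is a placeholder, `"./eagle-head"` stands in for a locally available EAGLE checkpoint directory, and no claim is made that this exact pairing is supported.

```python
from vllm import LLM, SamplingParams

# Hedged sketch of the nested speculative_config kwarg; values are placeholders.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder target model
    speculative_config={
        "model": "./eagle-head",          # placeholder draft/EAGLE checkpoint
        "num_speculative_tokens": 2,
        "draft_tensor_parallel_size": 1,
        "max_model_len": 2048,
    },
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```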
12 changes: 8 additions & 4 deletions tests/metrics/test_metrics.py
@@ -248,8 +248,10 @@ def test_metric_spec_decode(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
) as vllm_model:

# Force log interval to be 0 to catch all metrics.
@@ -300,8 +302,10 @@ def test_metric_spec_decode_interval(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
enforce_eager=True,
)

6 changes: 4 additions & 2 deletions tests/models/test_initialization.py
@@ -54,8 +54,10 @@ def _initalize_kv_caches_v1(self, vllm_config):
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model,
num_speculative_tokens=1 if model_info.speculative_model else None,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
load_format="dummy",
hf_overrides=hf_overrides,
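The test above gates the entire dict (rather than individual keys) on whether the registry entry declares a speculative model, so models without one still receive `speculative_config=None`. A small sketch of the same pattern with a hypothetical helper name:

```python
from typing import Optional

def build_spec_config(draft_model: Optional[str]) -> Optional[dict]:
    # Hypothetical helper, not part of the diff: return None when no draft
    # model is registered, mirroring the inline conditional in the test.
    if draft_model is None:
        return None
    return {"model": draft_model, "num_speculative_tokens": 1}

assert build_spec_config(None) is None
assert build_spec_config("JackFram/llama-68m") == {
    "model": "JackFram/llama-68m",
    "num_speculative_tokens": 1,
}
```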
80 changes: 70 additions & 10 deletions tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -3,6 +3,7 @@
tensor parallelism.
"""

import json
from typing import Optional

import pytest
@@ -28,14 +29,14 @@
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
],
[
"--speculative_config",
str({
json.dumps({
"model": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
@@ -88,15 +89,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative_config",
str({
json.dumps({
"model": "ibm-granite/granite-3b-code-instruct",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
@@ -147,20 +148,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
@@ -171,9 +172,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
if logprobs:
test_llm_kwargs.extend(
["--disable_logprobs_during_spec_decoding", "False"])
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",

# precision
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
@pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
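A brief note on why these tests switch from `str({...})` to `json.dumps({...})` when building the `--speculative_config` argument (my reading, not stated in the diff): `str()` on a Python dict yields single-quoted keys and values, which is not valid JSON for the flag's parser, while `json.dumps()` round-trips cleanly.

```python
import json

cfg = {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

print(str(cfg))         # {'model': 'JackFram/llama-68m', 'num_speculative_tokens': 3}
print(json.dumps(cfg))  # {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

json.loads(json.dumps(cfg))   # parses fine
# json.loads(str(cfg))        # would raise json.JSONDecodeError
```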
6 changes: 4 additions & 2 deletions tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -3,6 +3,8 @@
tensor parallelism.
"""

import json

import openai
import pytest
import torch
@@ -33,7 +35,7 @@
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
@@ -80,7 +82,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32 tokens.
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,
4 changes: 3 additions & 1 deletion tests/v1/test_oracle.py
@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
speculative_model=MODEL,
speculative_config={
"model": MODEL,
},
).create_engine_config()

with pytest.raises(NotImplementedError):
19 changes: 9 additions & 10 deletions vllm/config.py
@@ -2028,14 +2028,13 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:

def __post_init__(self):

# Note: After next release, the method parameter will be used to
# specify the speculative method, which helps to extend the
# configuration of non-model-based proposers, and the model parameter
# will be used when the draft model or head is needed.
# If users do not specify the method, the speculative method will
# be detected automatically if possible. If the speculative method can
# not be detected, it will be considered as the draft-model-based
# method by default.
# Note: "method" is a new parameter that helps to extend the
# configuration of non-model-based proposers, and the "model" parameter
# will be used to set the draft model, eagle head, or additional weight
# when needed. If users do not specify "method", the speculative method
# will be detected automatically if possible. If the speculative method
# can not be detected, it will be considered as the "draft_model" by
# default.

if self.model is None and self.num_speculative_tokens is not None:
# TODO(Shangming): Refactor mtp configuration logic when supporting
@@ -2050,8 +2049,8 @@ def __post_init__(self):
raise ValueError("num_speculative_tokens was provided without "
"speculative model.")

# Automatically configure the ngram method during configuration
# refactoring to ensure a smooth transition.
# Automatically configure the method for ngram when "model" is used
# instead of "method"
if self.method is None and (self.model is not None
and self.model in ("ngram", "[ngram]")):
self.method = "ngram"
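To make the comment in `__post_init__` concrete, a hedged sketch (not taken from the diff) of the two configuration styles it distinguishes, using keys that appear in the tests earlier in this PR:

```python
# Ngram proposer written the "old" way: only "model" is set, and the
# __post_init__ logic above rewrites it into method="ngram".
ngram_spec = {
    "model": "ngram",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 3,
}

# Draft-model-based proposer: "model" points at real draft weights; with no
# "method" given and nothing else detected, it is treated as "draft_model".
draft_spec = {
    "model": "JackFram/llama-68m",
    "num_speculative_tokens": 3,
    "draft_tensor_parallel_size": 1,
}
```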