vllm/v1/spec_decode/eagle.py (19 changes: 13 additions & 6 deletions)
```diff
@@ -3,7 +3,7 @@
 import ast
 from dataclasses import replace
 from importlib.util import find_spec
-from typing import Optional
+from typing import Optional, Protocol
 
 import numpy as np
 import torch
```
```diff
@@ -33,6 +33,17 @@
 PADDING_SLOT_ID = -1
 
 
+class EagleAttentionMetadata(Protocol):
+    # Required attributes
+    num_actual_tokens: int
+    max_query_len: int
+    query_start_loc: torch.Tensor
+    max_seq_len: int
+    seq_lens: torch.Tensor
+    block_table: torch.Tensor
+    slot_mapping: torch.Tensor
+
+
 class EagleProposer:
 
     def __init__(
```
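The new `EagleAttentionMetadata` Protocol types attention metadata structurally: any backend metadata class that exposes these attributes satisfies the protocol, with no shared base class required. A minimal sketch of the idea, using hypothetical stand-in names (`MetadataLike` and `SomeBackendMetadata` are not vLLM classes):

```python
from typing import Protocol

import torch


class MetadataLike(Protocol):
    # Structural requirements, mirroring the shape of EagleAttentionMetadata.
    num_actual_tokens: int
    slot_mapping: torch.Tensor


class SomeBackendMetadata:
    # No inheritance from MetadataLike; matching attributes are enough.
    def __init__(self, num_actual_tokens: int, slot_mapping: torch.Tensor) -> None:
        self.num_actual_tokens = num_actual_tokens
        self.slot_mapping = slot_mapping


def count_tokens(meta: MetadataLike) -> int:
    # Type-checks for any object that provides the required attributes.
    return meta.num_actual_tokens


meta = SomeBackendMetadata(4, torch.zeros(4, dtype=torch.long))
print(count_tokens(meta))  # 4
```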
```diff
@@ -97,7 +108,7 @@ def __init__(
            device=device)
 
         # Determine allowed attention backends once during initialization.
-        self.allowed_attn_types: tuple[type, ...] = ()
+        self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...]
         if current_platform.is_rocm():
             rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
             # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend
```
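With this annotation, each platform branch can assign a tuple of whichever concrete metadata classes that platform supports, and mypy checks the elements against the protocol rather than the bare `type`. A sketch of why a concrete class is accepted where `type[EagleAttentionMetadata]` is expected (hypothetical names, not vLLM code):

```python
from typing import Protocol


class MetaProto(Protocol):
    num_actual_tokens: int


class ConcreteMeta:
    # Satisfies MetaProto structurally via a matching attribute.
    num_actual_tokens: int = 0


# mypy accepts type[ConcreteMeta] where type[MetaProto] is expected because
# ConcreteMeta's instances implement the protocol; the tuple can then mix
# several such backend classes.
allowed: tuple[type[MetaProto], ...] = (ConcreteMeta,)
```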
```diff
@@ -240,10 +251,6 @@ def propose(
         # there's a multi-layer MTP module.
         assert isinstance(attn_metadata, self.allowed_attn_types)
 
-        # The mypy errors are caused because mypy cannot infer the type of
-        # attn_metadata. We add this assert to help mypy.
-        assert isinstance(attn_metadata, FlashAttentionMetadata)
-
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
 
```
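The second assert, which pinned `attn_metadata` to `FlashAttentionMetadata`, is dropped: the remaining runtime check already accepts any of the allowed backend metadata types, and narrowing to a single class would wrongly reject other supported backends. A small sketch of the runtime behavior (stand-in classes, not vLLM code):

```python
class FlashMeta:
    pass


class TritonMeta:
    pass


allowed = (FlashMeta, TritonMeta)  # tuple of concrete classes

meta = TritonMeta()
assert isinstance(meta, allowed)        # passes for any allowed backend
assert not isinstance(meta, FlashMeta)  # pinning one backend would reject this
```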