Merged
17 changes: 16 additions & 1 deletion fastdeploy/engine/request.py
@@ -308,6 +308,7 @@ class CompletionOutput:
decode_type: int = 0
logprob: Optional[float] = None
top_logprobs: Optional[LogprobsLists] = None
draft_top_logprobs: Optional[LogprobsLists] = None
logprobs: Optional[SampleLogprobs] = None
draft_token_ids: list[int] = None
text: Optional[str] = None
@@ -322,9 +323,9 @@ def to_dict(self):
"index": self.index,
"send_idx": self.send_idx,
"token_ids": self.token_ids,
"decode_type": self.decode_type,
"logprob": self.logprob,
"top_logprobs": self.top_logprobs,
"draft_top_logprobs": self.draft_top_logprobs,
"logprobs": self.logprobs,
"draft_token_ids": self.draft_token_ids,
"text": self.text,
@@ -350,6 +351,8 @@ def __repr__(self) -> str:
f"draft_token_ids={self.draft_token_ids}, "
f"reasoning_content={self.reasoning_content!r}, "
f"logprobs={self.logprobs}, "
f"top_logprobs={self.top_logprobs}, "
f"draft_top_logprobs={self.draft_top_logprobs}, "
)


@@ -434,6 +437,7 @@ def __init__(
request_id: str,
prompt: Optional[str] = None,
prompt_token_ids: Optional[list[int]] = None,
output_type: Optional[int] = 3,
outputs: CompletionOutput = None,
finished: bool = False,
metrics: Optional[RequestMetrics] = None,
@@ -444,6 +448,7 @@ def __init__(
self.request_id = request_id
self.prompt = prompt
self.prompt_token_ids = prompt_token_ids
self.output_type = output_type
self.outputs = outputs
self.finished = finished
self.metrics = metrics
@@ -472,12 +477,21 @@ def add(self, next_output: RequestOutput) -> None:
self.outputs.top_logprobs.logprob_token_ids.extend(next_output.outputs.top_logprobs.logprob_token_ids)
self.outputs.top_logprobs.logprobs.extend(next_output.outputs.top_logprobs.logprobs)
self.outputs.top_logprobs.sampled_token_ranks.extend(next_output.outputs.top_logprobs.sampled_token_ranks)
if next_output.outputs.draft_top_logprobs is not None:
self.outputs.draft_top_logprobs.logprob_token_ids.extend(
next_output.outputs.draft_top_logprobs.logprob_token_ids
)
self.outputs.draft_top_logprobs.logprobs.extend(next_output.outputs.draft_top_logprobs.logprobs)
self.outputs.draft_top_logprobs.sampled_token_ranks.extend(
next_output.outputs.draft_top_logprobs.sampled_token_ranks
)

def __repr__(self) -> str:
return (
f"RequestOutput(request_id={self.request_id}, "
f"prompt={self.prompt!r}, "
f"prompt_token_ids={self.prompt_token_ids}, "
f"output_type={self.output_type}, "
f"outputs={self.outputs}, "
f"finished={self.finished}, "
f"num_cached_tokens={self.num_cached_tokens}, "
@@ -498,6 +512,7 @@ def to_dict(self):
"request_id": self.request_id,
"prompt": self.prompt,
"prompt_token_ids": self.prompt_token_ids,
"output_type": self.output_type,
"outputs": None if self.outputs is None else self.outputs.to_dict(),
"metrics": None if self.metrics is None else self.metrics.to_dict(),
"finished": self.finished,
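The new draft_top_logprobs handling in RequestOutput.add above extends three parallel lists chunk by chunk, which is also why the serving layer later in this PR accumulates them as [[], [], []] per choice. Below is a self-contained sketch of that shape and merge pattern; LogprobsLists is modeled only from the fields used in this diff, not imported from fastdeploy, and the real class may carry more attributes.

# Sketch: LogprobsLists as three parallel lists, one entry per decoded step.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class LogprobsListsSketch:
    logprob_token_ids: list = field(default_factory=list)   # top-k token ids per step
    logprobs: list = field(default_factory=list)             # top-k logprobs per step
    sampled_token_ranks: list = field(default_factory=list)  # rank of the sampled token per step

def merge(acc: LogprobsListsSketch, nxt: Optional[LogprobsListsSketch]) -> None:
    # Same pattern as the draft_top_logprobs branch in RequestOutput.add:
    # concatenate each of the three parallel lists, chunk by chunk.
    if nxt is None:
        return
    acc.logprob_token_ids.extend(nxt.logprob_token_ids)
    acc.logprobs.extend(nxt.logprobs)
    acc.sampled_token_ranks.extend(nxt.sampled_token_ranks)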
6 changes: 6 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
@@ -205,6 +205,7 @@ class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
logprobs: Optional[LogProbs] = None
draft_logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]


@@ -265,6 +266,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
logprobs: Optional[LogProbs] = None
draft_logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
arrival_time: Optional[float] = None

@@ -295,6 +297,7 @@ class CompletionResponseChoice(BaseModel):
completion_tokens: Optional[str] = None
arrival_time: Optional[float] = None
logprobs: Optional[CompletionLogprobs] = None
draft_logprobs: Optional[CompletionLogprobs] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
@@ -333,6 +336,7 @@ class CompletionResponseStreamChoice(BaseModel):
text: str
arrival_time: float = None
logprobs: Optional[CompletionLogprobs] = None
draft_logprobs: Optional[CompletionLogprobs] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
prompt_tokens: Optional[str] = None
@@ -420,6 +424,7 @@ class CompletionRequest(BaseModel):
echo: Optional[bool] = False
frequency_penalty: Optional[float] = Field(default=None, ge=-2, le=2)
logprobs: Optional[int] = None
include_draft_logprobs: Optional[bool] = False
# For logits and logprobs post processing
temp_scaled_logprobs: bool = False
top_p_normalized_logprobs: bool = False
@@ -555,6 +560,7 @@ class ChatCompletionRequest(BaseModel):
frequency_penalty: Optional[float] = Field(None, le=2, ge=-2)
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
include_draft_logprobs: Optional[bool] = False

# For logits and logprobs post processing
temp_scaled_logprobs: bool = False
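With the new include_draft_logprobs request field and the draft_logprobs response fields in place, a client could opt in roughly as sketched below. This is a hypothetical sketch: the URL, port, and model name are placeholders, and only the field names (logprobs, top_logprobs, include_draft_logprobs, draft_logprobs) come from this PR.

# Hypothetical call against a locally running FastDeploy OpenAI-compatible server.
import requests

payload = {
    "model": "my-speculative-model",                      # placeholder model name
    "messages": [{"role": "user", "content": "Hello"}],
    "logprobs": True,
    "top_logprobs": 5,
    "include_draft_logprobs": True,                       # new request field from this PR
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
choice = resp.json()["choices"][0]
print(choice.get("logprobs"))        # target-model logprobs
print(choice.get("draft_logprobs"))  # draft-model logprobs (new response field)

Draft logprobs are presumably only populated when the deployment runs speculative decoding with a draft model, which is the scenario these fields appear to target.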
20 changes: 20 additions & 0 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -316,12 +316,18 @@ async def chat_completion_stream_generator(

output = res["outputs"]
output_top_logprobs = output["top_logprobs"]
output_draft_top_logprobs = output["draft_top_logprobs"]
previous_num_tokens[idx] += len(output["token_ids"])
logprobs_res: Optional[LogProbs] = None
draft_logprobs_res: Optional[LogProbs] = None
if request.logprobs and output_top_logprobs is not None:
logprobs_res = self._create_chat_logprobs(
output_top_logprobs, request.logprobs, request.top_logprobs
)
if request.include_draft_logprobs and output_draft_top_logprobs is not None:
draft_logprobs_res = self._create_chat_logprobs(
output_draft_top_logprobs, request.logprobs, request.top_logprobs
)

delta_message = DeltaMessage(
reasoning_content="",
@@ -348,6 +354,7 @@
index=idx,
delta=delta_message,
logprobs=logprobs_res,
draft_logprobs=draft_logprobs_res,
arrival_time=arrival_time,
)
if res["finished"]:
@@ -444,7 +451,9 @@ async def chat_completion_full_generator(
dealer.write([b"", rid.encode("utf-8")])
previous_num_tokens = [0] * num_choices
current_waiting_time = 0

logprob_contents = [[] for _ in range(num_choices)]
draft_logprob_contents = [[] for _ in range(num_choices)]
completion_token_ids = [[] for _ in range(num_choices)]
num_cached_tokens = [0] * num_choices
response_processor = ChatResponseProcessor(
@@ -492,12 +501,23 @@ async def chat_completion_full_generator(
# The logprob for handling the response
output = data["outputs"]
output_top_logprobs = output["top_logprobs"]
output_draft_top_logprobs = output["draft_top_logprobs"]
if output_top_logprobs is not None:
# logprobs
logprobs_res = self._create_chat_logprobs(
output_top_logprobs, request.logprobs, request.top_logprobs
)
if logprobs_res and logprobs_res.content is not None:
logprob_contents[idx].extend(logprobs_res.content)

# draft_logprobs
if request.include_draft_logprobs and output_draft_top_logprobs is not None:
draft_logprobs_res = self._create_chat_logprobs(
output_draft_top_logprobs, request.logprobs, request.top_logprobs
)
if draft_logprobs_res and draft_logprobs_res.content is not None:
draft_logprob_contents[idx].extend(draft_logprobs_res.content)

if data["finished"]:
num_choices -= 1
choice = await self._create_chat_completion_choice(
30 changes: 28 additions & 2 deletions fastdeploy/entrypoints/openai/serving_completion.py
@@ -234,6 +234,7 @@ async def completion_full_generator(
valid_results = [dict()] * num_choices
output_tokens = [0] * num_choices
aggregated_top_logprobs = [[[], [], []] for _ in range(num_choices)]
aggregated_draft_top_logprobs = [[[], [], []] for _ in range(num_choices)]
aggregated_token_ids = [[] for _ in range(num_choices)]
completion_batched_token_ids = [[] for _ in range(num_choices)]
current_waiting_time = 0
@@ -266,12 +267,19 @@ async def completion_full_generator(
raise ValueError("{}".format(data["error_msg"]))

output = data["outputs"]
output_top_logprobs = output["top_logprobs"]
output_top_logprobs = output.get("top_logprobs") or None
output_draft_top_logprobs = output.get("draft_top_logprobs") or None
if output_top_logprobs is not None:
aggregated_top_logprobs[rid][0].extend(output_top_logprobs[0])
aggregated_top_logprobs[rid][1].extend(output_top_logprobs[1])
aggregated_top_logprobs[rid][2].extend(output_top_logprobs[2])

# draft logprobs
if request.include_draft_logprobs and output_draft_top_logprobs is not None:
aggregated_draft_top_logprobs[rid][0].extend(output_draft_top_logprobs[0])
aggregated_draft_top_logprobs[rid][1].extend(output_draft_top_logprobs[1])
aggregated_draft_top_logprobs[rid][2].extend(output_draft_top_logprobs[2])

aggregated_token_ids[rid].extend(data["outputs"]["token_ids"])

self.engine_client.data_processor.process_response_dict(
@@ -282,6 +290,7 @@
if data.get("finished", False):
data["output_token_ids"] = output_tokens[rid]
data["outputs"]["top_logprobs"] = aggregated_top_logprobs[rid]
data["outputs"]["draft_top_logprobs"] = aggregated_draft_top_logprobs[rid]
data["outputs"]["token_ids"] = aggregated_token_ids[rid]
valid_results[rid] = data
num_choices -= 1
@@ -437,10 +446,17 @@ async def completion_stream_generator(
await self._process_echo_logic(request, idx, res["outputs"])
output = res["outputs"]
output_top_logprobs = output["top_logprobs"]
output_draft_top_logprobs = output["draft_top_logprobs"]
logprobs_res: Optional[CompletionLogprobs] = None
draft_logprobs_res: Optional[CompletionLogprobs] = None
if request.logprobs and output_top_logprobs is not None:
logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)

# draft logprobs
if request.include_draft_logprobs and output_draft_top_logprobs is not None:
draft_logprobs_res = self._create_completion_logprobs(
output_draft_top_logprobs, request.logprobs, 0
)
output_tokens[idx] += 1
delta_message = CompletionResponseStreamChoice(
index=idx,
@@ -452,6 +468,7 @@
reasoning_content="",
arrival_time=arrival_time,
logprobs=logprobs_res,
draft_logprobs=draft_logprobs_res,
)
if not res["finished"] and "delta_message" in output:
delta_message_output = output["delta_message"]
@@ -541,15 +558,23 @@ def request_output_to_completion_response(
final_res = final_res_batch[idx]
prompt_token_ids = prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
assert prompt_token_ids is not None
prompt_text = request.prompt
Review comment (Copilot AI, Oct 17, 2025): Variable prompt_text is assigned here but immediately reassigned at line 568 if request.echo is true. Consider moving this assignment inside the else block of the echo condition to avoid unnecessary assignment.
completion_token_ids = completion_batched_token_ids[idx]

output = final_res["outputs"]
output_top_logprobs = output["top_logprobs"]
output_top_logprobs = output.get("top_logprobs") or None
output_draft_top_logprobs = output.get("draft_top_logprobs") or None

aggregated_logprobs: Optional[CompletionLogprobs] = None
if output_top_logprobs is not None:
aggregated_logprobs = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)

aggregated_draft_logprobs: Optional[CompletionLogprobs] = None
if output_draft_top_logprobs is not None:
aggregated_draft_logprobs = self._create_completion_logprobs(
output_draft_top_logprobs, request.logprobs, 0
)

if request.echo:
prompt_text = self._echo_back_prompt(request, idx // (1 if request.n is None else request.n))
token_ids = [*prompt_token_ids, *output["token_ids"]]
@@ -574,6 +599,7 @@
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call"),
logprobs=aggregated_logprobs,
draft_logprobs=aggregated_draft_logprobs,
finish_reason=finish_reason,
)
choices.append(choice_data)
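For the streaming path, each CompletionResponseStreamChoice now carries an optional draft_logprobs alongside logprobs. A rough client-side sketch of consuming them follows, assuming the usual OpenAI-style SSE framing (data: ... lines terminated by data: [DONE]); the URL, port, and model name are placeholders, not taken from this PR.

# Hypothetical streaming read; SSE framing, URL, and model name are assumptions.
import json
import requests

payload = {
    "model": "my-speculative-model",
    "prompt": "The capital of France is",
    "max_tokens": 16,
    "stream": True,
    "logprobs": 5,
    "include_draft_logprobs": True,   # new request field from this PR
}
with requests.post("http://localhost:8000/v1/completions", json=payload, stream=True, timeout=60) as resp:
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        body = raw[len(b"data: "):]
        if body == b"[DONE]":
            break
        chunk = json.loads(body)
        for choice in chunk.get("choices", []):
            if choice.get("draft_logprobs") is not None:
                # Draft-model logprobs arrive per chunk, mirroring choice["logprobs"].
                print(choice.get("text", ""), choice["draft_logprobs"])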