
Commit 87e6e05

fix: Interactive inputs actually stops, does not ignore stop token (#3057)
Signed-off-by: Graham King <[email protected]>
1 parent bc29b59 commit 87e6e05

File tree

12 files changed: +68 -192 lines changed

docs/guides/dynamo_run.md

Lines changed: 5 additions & 22 deletions
````diff
@@ -318,36 +318,19 @@ dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/Tin
 dynamo-run in=http out=auto --router-mode kv
 ```
 
-### echo_full
+### echo
 
-The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
+The `echo` engine echoes the prompt back as the response.
 
 ```
-dynamo-run in=http out=echo_full --model-name my_model
+dynamo-run in=http out=echo --model-name my_model
 ```
 
-### echo_core
-
-The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response includes the full prompt template.
-
-```
-dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
-```
-
-Note that to use it with `in=http` you need to tell the post processor to ignore stop tokens from the template by adding `nvext.ignore_eos` like this:
-```
-curl -N -d '{"nvext": {"ignore_eos": true}, "stream": true, "model": "Qwen2.5-3B-Instruct", "max_completion_tokens": 4096, "messages":[{"role":"user", "content": "Tell me a story" }]}' ...
-```
-
-The default `in=text` sets that for you.
-
-### Echo Configuration
-
-Both echo engines use a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
+The echo engine uses a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
 
 ```
 # Set token echo delay to 1ms (1000 tokens per second)
-DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo_full
+DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo
 ```
 
 The default delay is 10ms, which produces approximately 100 tokens per second.
````
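The guide describes the delay knob in prose; the implementation lives in `lib/llm/src/engines.rs`, whose diff below shows only the tail of the `TOKEN_ECHO_DELAY` initializer. As a rough sketch of how such an env-configured delay can be read once into a `LazyLock` (the variable name and the 10 ms default come from the docs above; the parsing code here is an illustration, not the crate's):

```rust
use std::sync::LazyLock;
use std::time::Duration;

// Illustrative sketch only: read DYN_TOKEN_ECHO_DELAY_MS once, falling
// back to the documented 10 ms default when unset or unparsable.
static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    let delay_ms = std::env::var("DYN_TOKEN_ECHO_DELAY_MS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .unwrap_or(10);
    Duration::from_millis(delay_ms)
});

fn main() {
    // Roughly 100 tokens per second at the default 10 ms.
    println!("token echo delay: {:?}", *TOKEN_ECHO_DELAY);
}
```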

launch/dynamo-run/src/flags.rs

Lines changed: 1 addition & 8 deletions
```diff
@@ -204,14 +204,7 @@ impl Flags {
                     );
                 }
             }
-            Output::EchoFull => {}
-            Output::EchoCore => {
-                if !local_model.card().has_tokenizer() {
-                    anyhow::bail!(
-                        "out=echo_core need to find the tokenizer. Pass flag --model-path <path>"
-                    );
-                };
-            }
+            Output::Echo => {}
             #[cfg(feature = "mistralrs")]
             Output::MistralRs => {}
             #[cfg(feature = "llamacpp")]
```

launch/dynamo-run/src/lib.rs

Lines changed: 4 additions & 9 deletions
```diff
@@ -109,14 +109,9 @@ async fn engine_for(
             // A single static backend, no etcd
             Ok(EngineConfig::StaticRemote(Box::new(local_model)))
         }
-        Output::EchoFull => Ok(EngineConfig::StaticFull {
-            model: Box::new(local_model),
-            engine: dynamo_llm::engines::make_engine_full(),
-            is_static: flags.static_worker,
-        }),
-        Output::EchoCore => Ok(EngineConfig::StaticCore {
-            engine: dynamo_llm::engines::make_engine_core(),
+        Output::Echo => Ok(EngineConfig::StaticFull {
             model: Box::new(local_model),
+            engine: dynamo_llm::engines::make_echo_engine(),
             is_static: flags.static_worker,
         }),
         #[cfg(feature = "mistralrs")]
@@ -213,7 +208,7 @@ fn gguf_default() -> Output {
 
     #[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
     {
-        Output::EchoFull
+        Output::Echo
     }
 }
 
@@ -225,6 +220,6 @@ fn safetensors_default() -> Output {
 
     #[cfg(not(feature = "mistralrs"))]
     {
-        Output::EchoFull
+        Output::Echo
     }
 }
```
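Both `gguf_default()` and `safetensors_default()` now fall back to `Output::Echo` when no inference-engine feature is compiled in. A self-contained sketch of that cfg-gated fallback pattern; the enabled-feature branch body is an assumption, since only the fallback arm appears in the hunks above:

```rust
// Stand-in for the launcher's Output enum, reduced to what this sketch needs.
enum Output {
    Echo,
    #[allow(dead_code)]
    MistralRs,
}

// Sketch of the feature-gated fallback in safetensors_default(): the echo
// engine is the Output of last resort when no real backend is compiled in.
fn safetensors_default() -> Output {
    #[cfg(feature = "mistralrs")]
    {
        return Output::MistralRs;
    }
    #[cfg(not(feature = "mistralrs"))]
    {
        Output::Echo
    }
}

fn main() {
    // Without the mistralrs feature this resolves to the echo engine.
    assert!(matches!(safetensors_default(), Output::Echo));
}
```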

launch/dynamo-run/src/opt.rs

Lines changed: 5 additions & 14 deletions
```diff
@@ -5,11 +5,8 @@ use dynamo_runtime::protocols::ENDPOINT_SCHEME;
 use std::fmt;
 
 pub enum Output {
-    /// Accept un-preprocessed requests, echo the prompt back as the response
-    EchoFull,
-
-    /// Accept preprocessed requests, echo the tokens back as the response
-    EchoCore,
+    /// Echos the prompt back as the response
+    Echo,
 
     /// Listen for models on nats/etcd, add/remove dynamically
     Auto,
@@ -44,8 +41,7 @@ impl TryFrom<&str> for Output {
             "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
 
             "mocker" => Ok(Output::Mocker),
-            "echo_full" => Ok(Output::EchoFull),
-            "echo_core" => Ok(Output::EchoCore),
+            "echo" | "echo_full" => Ok(Output::Echo),
 
             "dyn" | "auto" => Ok(Output::Auto),
 
@@ -69,8 +65,7 @@ impl fmt::Display for Output {
             Output::LlamaCpp => "llamacpp",
 
             Output::Mocker => "mocker",
-            Output::EchoFull => "echo_full",
-            Output::EchoCore => "echo_core",
+            Output::Echo => "echo",
 
             Output::Auto => "auto",
             Output::Static(endpoint) => &format!("{ENDPOINT_SCHEME}{endpoint}"),
@@ -82,11 +77,7 @@ impl fmt::Display for Output {
 impl Output {
     #[allow(unused_mut)]
     pub fn available_engines() -> Vec<String> {
-        let mut out = vec![
-            "echo_core".to_string(),
-            "echo_full".to_string(),
-            Output::Mocker.to_string(),
-        ];
+        let mut out = vec!["echo".to_string(), Output::Mocker.to_string()];
         #[cfg(feature = "mistralrs")]
         {
             out.push(Output::MistralRs.to_string());
```
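The parser keeps `echo_full` as a legacy alias while `Display` and `available_engines()` only advertise the new `echo` name, so existing `out=echo_full` invocations keep working. A self-contained sketch of that round-trip (stand-in types and a `parse` helper, not the crate's own API):

```rust
// Reduced stand-in for the launcher's Output enum.
#[derive(Debug, PartialEq)]
enum Output {
    Echo,
}

// Mirrors the alias handling in the TryFrom<&str> impl above.
fn parse(s: &str) -> Result<Output, String> {
    match s {
        // New canonical name plus the legacy spelling.
        "echo" | "echo_full" => Ok(Output::Echo),
        other => Err(format!("unknown engine: {other}")),
    }
}

fn main() {
    assert_eq!(parse("echo"), Ok(Output::Echo));
    assert_eq!(parse("echo_full"), Ok(Output::Echo));
    // echo_core is removed entirely; it no longer parses.
    assert!(parse("echo_core").is_err());
}
```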

lib/bindings/python/Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default.

lib/bindings/python/rust/llm/entrypoint.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -219,7 +219,7 @@ async fn select_engine(
         // There is no validation for the echo engine
         RsEngineConfig::StaticFull {
             model: Box::new(local_model),
-            engine: dynamo_llm::engines::make_engine_full(),
+            engine: dynamo_llm::engines::make_echo_engine(),
             is_static: false,
         }
     }
```

lib/llm/src/engines.rs

Lines changed: 9 additions & 54 deletions
```diff
@@ -13,9 +13,6 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
 use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
 use dynamo_runtime::protocols::annotated::Annotated;
 
-use crate::backend::ExecutionContext;
-use crate::preprocessor::PreprocessedRequest;
-use crate::protocols::common::llm_backend::LLMEngineOutput;
 use crate::protocols::openai::{
     chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
     completions::{NvCreateCompletionRequest, NvCreateCompletionResponse, prompt_to_string},
@@ -65,53 +62,9 @@ pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
     Duration::from_millis(delay_ms)
 });
 
-/// Engine that accepts pre-processed requests and echos the tokens back as the response
-/// The response will include the full prompt template.
-/// Useful for testing pre-processing.
-struct EchoEngineCore {}
-pub fn make_engine_core() -> ExecutionContext {
-    Arc::new(EchoEngineCore {})
-}
-
-#[async_trait]
-impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
-    for EchoEngineCore
-{
-    async fn generate(
-        &self,
-        incoming_request: SingleIn<PreprocessedRequest>,
-    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
-        let (request, context) = incoming_request.into_parts();
-        let ctx = context.context();
-
-        let output = stream! {
-            for tok in request.token_ids {
-                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
-                yield delta_core(tok);
-            }
-            yield Annotated::from_data(LLMEngineOutput::stop());
-        };
-        Ok(ResponseStream::new(Box::pin(output), ctx))
-    }
-}
-
-fn delta_core(tok: u32) -> Annotated<LLMEngineOutput> {
-    let delta = LLMEngineOutput {
-        token_ids: vec![tok],
-        tokens: None,
-        text: None,
-        cum_log_probs: None,
-        log_probs: None,
-        top_logprobs: None,
-        finish_reason: None,
-        index: None,
-    };
-    Annotated::from_data(delta)
-}
-
 /// Engine that accepts un-preprocessed requests and echos the prompt back as the response
 /// Useful for testing ingress such as service-http.
-struct EchoEngineFull {}
+struct EchoEngine {}
 
 /// Validate Engine that verifies request data
 pub struct ValidateEngine<E> {
@@ -164,8 +117,8 @@ pub trait EmbeddingEngine: Send + Sync {
     ) -> Result<ManyOut<Annotated<NvCreateEmbeddingResponse>>, Error>;
 }
 
-pub fn make_engine_full() -> Arc<dyn StreamingEngine> {
-    let engine = EchoEngineFull {};
+pub fn make_echo_engine() -> Arc<dyn StreamingEngine> {
+    let engine = EchoEngine {};
     let data = EngineDispatcher::new(engine);
     Arc::new(data)
 }
@@ -176,7 +129,7 @@ impl
         SingleIn<NvCreateChatCompletionRequest>,
         ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
@@ -185,7 +138,9 @@ impl
         let (request, context) = incoming_request.transfer(());
         let ctx = context.context();
         let mut deltas = request.response_generator(ctx.id().to_string());
-        let req = request.inner.messages.into_iter().next_back().unwrap();
+        let Some(req) = request.inner.messages.into_iter().next_back() else {
+            anyhow::bail!("Empty chat messages in request");
+        };
 
         let prompt = match req {
             dynamo_async_openai::types::ChatCompletionRequestMessage::User(user_msg) => {
@@ -223,7 +178,7 @@ impl
         SingleIn<NvCreateCompletionRequest>,
         ManyOut<Annotated<NvCreateCompletionResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
@@ -256,7 +211,7 @@ impl
         SingleIn<NvCreateEmbeddingRequest>,
         ManyOut<Annotated<NvCreateEmbeddingResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
```
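Beyond the rename, the one behavioural change in this file is the `let ... else` guard replacing `.unwrap()` on the last chat message, so an empty `messages` array yields an error response instead of a panic. A minimal sketch of the pattern, using the `anyhow` crate as the file itself does (plain `String`s stand in for the request's message type):

```rust
// Sketch of the let-else guard introduced above: take the final message,
// return an error instead of panicking when there is none.
fn last_message(messages: Vec<String>) -> anyhow::Result<String> {
    let Some(msg) = messages.into_iter().next_back() else {
        anyhow::bail!("Empty chat messages in request");
    };
    Ok(msg)
}

fn main() {
    assert!(last_message(vec![]).is_err()); // empty input: error, not panic
    assert_eq!(last_message(vec!["hi".into()]).unwrap(), "hi");
}
```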

lib/llm/src/entrypoint/input/common.rs

Lines changed: 0 additions & 53 deletions
```diff
@@ -310,56 +310,3 @@
         .link(frontend)?;
     Ok(engine)
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::types::openai::{
-        chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
-        completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
-    };
-
-    const HF_PATH: &str = concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/tests/data/sample-models/mock-llama-3.1-8b-instruct"
-    );
-
-    #[tokio::test]
-    async fn test_build_chat_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
-        // Create test model card
-        let card = ModelDeploymentCard::load(HF_PATH, None)?;
-        let engine = crate::engines::make_engine_core();
-
-        // Build pipeline for chat completions
-        let pipeline = build_pipeline::<
-            NvCreateChatCompletionRequest,
-            NvCreateChatCompletionStreamResponse,
-        >(&card, engine, card.tokenizer_hf()?)
-        .await?;
-
-        // Verify pipeline was created
-        assert!(Arc::strong_count(&pipeline) >= 1);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_build_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
-        // Create test model card
-        let card = ModelDeploymentCard::load(HF_PATH, None)?;
-        let engine = crate::engines::make_engine_core();
-
-        // Build pipeline for completions
-        let pipeline = build_pipeline::<NvCreateCompletionRequest, NvCreateCompletionResponse>(
-            &card,
-            engine,
-            card.tokenizer_hf()?,
-        )
-        .await?;
-
-        // Verify pipeline was created
-        assert!(Arc::strong_count(&pipeline) >= 1);
-
-        Ok(())
-    }
-}
```

lib/llm/src/entrypoint/input/endpoint.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -156,8 +156,8 @@ mod integration_tests {
             .await
             .map_err(|e| anyhow::anyhow!("Failed to create distributed runtime: {}", e))?;
 
-        let engine_config = EngineConfig::StaticCore {
-            engine: crate::engines::make_engine_core(),
+        let engine_config = EngineConfig::StaticFull {
+            engine: crate::engines::make_echo_engine(),
             model: Box::new(
                 crate::local_model::LocalModelBuilder::default()
                     .model_name(Some("test-model".to_string()))
```
