
Commit 87e6e05

fix: Interactive inputs actually stops, does not ignore stop token (#3057)
Signed-off-by: Graham King <[email protected]>
1 parent bc29b59 commit 87e6e05

File tree

12 files changed: +68 -192 lines changed

docs/guides/dynamo_run.md

Lines changed: 5 additions & 22 deletions
````diff
@@ -318,36 +318,19 @@ dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/Tin
 dynamo-run in=http out=auto --router-mode kv
 ```
 
-### echo_full
+### echo
 
-The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
+The `echo` engine echoes the prompt back as the response.
 
 ```
-dynamo-run in=http out=echo_full --model-name my_model
+dynamo-run in=http out=echo --model-name my_model
 ```
 
-### echo_core
-
-The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response includes the full prompt template.
-
-```
-dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
-```
-
-Note that to use it with `in=http` you need to tell the post processor to ignore stop tokens from the template by adding `nvext.ignore_eos` like this:
-```
-curl -N -d '{"nvext": {"ignore_eos": true}, "stream": true, "model": "Qwen2.5-3B-Instruct", "max_completion_tokens": 4096, "messages":[{"role":"user", "content": "Tell me a story" }]}' ...
-```
-
-The default `in=text` sets that for you.
-
-### Echo Configuration
-
-Both echo engines use a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
+The echo engine uses a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
 
 ```
 # Set token echo delay to 1ms (1000 tokens per second)
-DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo_full
+DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo
 ```
 
 The default delay is 10ms, which produces approximately 100 tokens per second.
````
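The guide describes the delay knob in prose; the implementation lives in `lib/llm/src/engines.rs`, whose diff below shows only the tail of the `TOKEN_ECHO_DELAY` initializer. As a rough sketch of how such an env-configured delay can be read once into a `LazyLock` (the variable name and the 10 ms default come from the docs above; the parsing code here is an illustration, not the crate's):

```rust
use std::sync::LazyLock;
use std::time::Duration;

// Illustrative sketch only: read DYN_TOKEN_ECHO_DELAY_MS once, falling
// back to the documented 10 ms default when unset or unparsable.
static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    let delay_ms = std::env::var("DYN_TOKEN_ECHO_DELAY_MS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .unwrap_or(10);
    Duration::from_millis(delay_ms)
});

fn main() {
    // Roughly 100 tokens per second at the default 10 ms.
    println!("token echo delay: {:?}", *TOKEN_ECHO_DELAY);
}
```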

launch/dynamo-run/src/flags.rs

Lines changed: 1 addition & 8 deletions
```diff
@@ -204,14 +204,7 @@ impl Flags {
                     );
                 }
             }
-            Output::EchoFull => {}
-            Output::EchoCore => {
-                if !local_model.card().has_tokenizer() {
-                    anyhow::bail!(
-                        "out=echo_core need to find the tokenizer. Pass flag --model-path <path>"
-                    );
-                };
-            }
+            Output::Echo => {}
             #[cfg(feature = "mistralrs")]
             Output::MistralRs => {}
             #[cfg(feature = "llamacpp")]
```

launch/dynamo-run/src/lib.rs

Lines changed: 4 additions & 9 deletions
```diff
@@ -109,14 +109,9 @@ async fn engine_for(
             // A single static backend, no etcd
             Ok(EngineConfig::StaticRemote(Box::new(local_model)))
         }
-        Output::EchoFull => Ok(EngineConfig::StaticFull {
-            model: Box::new(local_model),
-            engine: dynamo_llm::engines::make_engine_full(),
-            is_static: flags.static_worker,
-        }),
-        Output::EchoCore => Ok(EngineConfig::StaticCore {
-            engine: dynamo_llm::engines::make_engine_core(),
+        Output::Echo => Ok(EngineConfig::StaticFull {
             model: Box::new(local_model),
+            engine: dynamo_llm::engines::make_echo_engine(),
             is_static: flags.static_worker,
         }),
         #[cfg(feature = "mistralrs")]
@@ -213,7 +208,7 @@ fn gguf_default() -> Output {
 
     #[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
     {
-        Output::EchoFull
+        Output::Echo
     }
 }
 
@@ -225,6 +220,6 @@ fn safetensors_default() -> Output {
 
     #[cfg(not(feature = "mistralrs"))]
     {
-        Output::EchoFull
+        Output::Echo
     }
 }
```
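Both `gguf_default()` and `safetensors_default()` now fall back to `Output::Echo` when no inference-engine feature is compiled in. A self-contained sketch of that cfg-gated fallback pattern; the enabled-feature branch body is an assumption, since only the fallback arm appears in the hunks above:

```rust
// Stand-in for the launcher's Output enum, reduced to what this sketch needs.
enum Output {
    Echo,
    #[allow(dead_code)]
    MistralRs,
}

// Sketch of the feature-gated fallback in safetensors_default(): the echo
// engine is the Output of last resort when no real backend is compiled in.
fn safetensors_default() -> Output {
    #[cfg(feature = "mistralrs")]
    {
        return Output::MistralRs;
    }
    #[cfg(not(feature = "mistralrs"))]
    {
        Output::Echo
    }
}

fn main() {
    // Without the mistralrs feature this resolves to the echo engine.
    assert!(matches!(safetensors_default(), Output::Echo));
}
```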

launch/dynamo-run/src/opt.rs

Lines changed: 5 additions & 14 deletions
```diff
@@ -5,11 +5,8 @@ use dynamo_runtime::protocols::ENDPOINT_SCHEME;
 use std::fmt;
 
 pub enum Output {
-    /// Accept un-preprocessed requests, echo the prompt back as the response
-    EchoFull,
-
-    /// Accept preprocessed requests, echo the tokens back as the response
-    EchoCore,
+    /// Echos the prompt back as the response
+    Echo,
 
     /// Listen for models on nats/etcd, add/remove dynamically
     Auto,
@@ -44,8 +41,7 @@ impl TryFrom<&str> for Output {
             "llamacpp" | "llama_cpp" => Ok(Output::LlamaCpp),
 
             "mocker" => Ok(Output::Mocker),
-            "echo_full" => Ok(Output::EchoFull),
-            "echo_core" => Ok(Output::EchoCore),
+            "echo" | "echo_full" => Ok(Output::Echo),
 
             "dyn" | "auto" => Ok(Output::Auto),
 
@@ -69,8 +65,7 @@ impl fmt::Display for Output {
             Output::LlamaCpp => "llamacpp",
 
             Output::Mocker => "mocker",
-            Output::EchoFull => "echo_full",
-            Output::EchoCore => "echo_core",
+            Output::Echo => "echo",
 
             Output::Auto => "auto",
             Output::Static(endpoint) => &format!("{ENDPOINT_SCHEME}{endpoint}"),
@@ -82,11 +77,7 @@ impl fmt::Display for Output {
 impl Output {
     #[allow(unused_mut)]
     pub fn available_engines() -> Vec<String> {
-        let mut out = vec![
-            "echo_core".to_string(),
-            "echo_full".to_string(),
-            Output::Mocker.to_string(),
-        ];
+        let mut out = vec!["echo".to_string(), Output::Mocker.to_string()];
         #[cfg(feature = "mistralrs")]
         {
             out.push(Output::MistralRs.to_string());
```
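The parser keeps `echo_full` as a legacy alias while `Display` and `available_engines()` only advertise the new `echo` name, so existing `out=echo_full` invocations keep working. A self-contained sketch of that round-trip (stand-in types and a `parse` helper, not the crate's own API):

```rust
// Reduced stand-in for the launcher's Output enum.
#[derive(Debug, PartialEq)]
enum Output {
    Echo,
}

// Mirrors the alias handling in the TryFrom<&str> impl above.
fn parse(s: &str) -> Result<Output, String> {
    match s {
        // New canonical name plus the legacy spelling.
        "echo" | "echo_full" => Ok(Output::Echo),
        other => Err(format!("unknown engine: {other}")),
    }
}

fn main() {
    assert_eq!(parse("echo"), Ok(Output::Echo));
    assert_eq!(parse("echo_full"), Ok(Output::Echo));
    // echo_core is removed entirely; it no longer parses.
    assert!(parse("echo_core").is_err());
}
```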

lib/bindings/python/Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default.

lib/bindings/python/rust/llm/entrypoint.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -219,7 +219,7 @@ async fn select_engine(
         // There is no validation for the echo engine
         RsEngineConfig::StaticFull {
             model: Box::new(local_model),
-            engine: dynamo_llm::engines::make_engine_full(),
+            engine: dynamo_llm::engines::make_echo_engine(),
             is_static: false,
         }
     }
```

lib/llm/src/engines.rs

Lines changed: 9 additions & 54 deletions
```diff
@@ -13,9 +13,6 @@ use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseSt
 use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
 use dynamo_runtime::protocols::annotated::Annotated;
 
-use crate::backend::ExecutionContext;
-use crate::preprocessor::PreprocessedRequest;
-use crate::protocols::common::llm_backend::LLMEngineOutput;
 use crate::protocols::openai::{
     chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
     completions::{NvCreateCompletionRequest, NvCreateCompletionResponse, prompt_to_string},
@@ -65,53 +62,9 @@ pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
     Duration::from_millis(delay_ms)
 });
 
-/// Engine that accepts pre-processed requests and echos the tokens back as the response
-/// The response will include the full prompt template.
-/// Useful for testing pre-processing.
-struct EchoEngineCore {}
-pub fn make_engine_core() -> ExecutionContext {
-    Arc::new(EchoEngineCore {})
-}
-
-#[async_trait]
-impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
-    for EchoEngineCore
-{
-    async fn generate(
-        &self,
-        incoming_request: SingleIn<PreprocessedRequest>,
-    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
-        let (request, context) = incoming_request.into_parts();
-        let ctx = context.context();
-
-        let output = stream! {
-            for tok in request.token_ids {
-                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
-                yield delta_core(tok);
-            }
-            yield Annotated::from_data(LLMEngineOutput::stop());
-        };
-        Ok(ResponseStream::new(Box::pin(output), ctx))
-    }
-}
-
-fn delta_core(tok: u32) -> Annotated<LLMEngineOutput> {
-    let delta = LLMEngineOutput {
-        token_ids: vec![tok],
-        tokens: None,
-        text: None,
-        cum_log_probs: None,
-        log_probs: None,
-        top_logprobs: None,
-        finish_reason: None,
-        index: None,
-    };
-    Annotated::from_data(delta)
-}
-
 /// Engine that accepts un-preprocessed requests and echos the prompt back as the response
 /// Useful for testing ingress such as service-http.
-struct EchoEngineFull {}
+struct EchoEngine {}
 
 /// Validate Engine that verifies request data
 pub struct ValidateEngine<E> {
@@ -164,8 +117,8 @@ pub trait EmbeddingEngine: Send + Sync {
     ) -> Result<ManyOut<Annotated<NvCreateEmbeddingResponse>>, Error>;
 }
 
-pub fn make_engine_full() -> Arc<dyn StreamingEngine> {
-    let engine = EchoEngineFull {};
+pub fn make_echo_engine() -> Arc<dyn StreamingEngine> {
+    let engine = EchoEngine {};
     let data = EngineDispatcher::new(engine);
     Arc::new(data)
 }
@@ -176,7 +129,7 @@ impl
         SingleIn<NvCreateChatCompletionRequest>,
         ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
@@ -185,7 +138,9 @@ impl
         let (request, context) = incoming_request.transfer(());
         let ctx = context.context();
         let mut deltas = request.response_generator(ctx.id().to_string());
-        let req = request.inner.messages.into_iter().next_back().unwrap();
+        let Some(req) = request.inner.messages.into_iter().next_back() else {
+            anyhow::bail!("Empty chat messages in request");
+        };
 
         let prompt = match req {
             dynamo_async_openai::types::ChatCompletionRequestMessage::User(user_msg) => {
@@ -223,7 +178,7 @@ impl
         SingleIn<NvCreateCompletionRequest>,
         ManyOut<Annotated<NvCreateCompletionResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
@@ -256,7 +211,7 @@ impl
         SingleIn<NvCreateEmbeddingRequest>,
         ManyOut<Annotated<NvCreateEmbeddingResponse>>,
         Error,
-    > for EchoEngineFull
+    > for EchoEngine
 {
     async fn generate(
         &self,
```
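Beyond the rename, the one behavioural change in this file is the `let ... else` guard replacing `.unwrap()` on the last chat message, so an empty `messages` array yields an error response instead of a panic. A minimal sketch of the pattern, using the `anyhow` crate as the file itself does (plain `String`s stand in for the request's message type):

```rust
// Sketch of the let-else guard introduced above: take the final message,
// return an error instead of panicking when there is none.
fn last_message(messages: Vec<String>) -> anyhow::Result<String> {
    let Some(msg) = messages.into_iter().next_back() else {
        anyhow::bail!("Empty chat messages in request");
    };
    Ok(msg)
}

fn main() {
    assert!(last_message(vec![]).is_err()); // empty input: error, not panic
    assert_eq!(last_message(vec!["hi".into()]).unwrap(), "hi");
}
```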

lib/llm/src/entrypoint/input/common.rs

Lines changed: 0 additions & 53 deletions
```diff
@@ -310,56 +310,3 @@
         .link(frontend)?;
     Ok(engine)
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::types::openai::{
-        chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
-        completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
-    };
-
-    const HF_PATH: &str = concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/tests/data/sample-models/mock-llama-3.1-8b-instruct"
-    );
-
-    #[tokio::test]
-    async fn test_build_chat_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
-        // Create test model card
-        let card = ModelDeploymentCard::load(HF_PATH, None)?;
-        let engine = crate::engines::make_engine_core();
-
-        // Build pipeline for chat completions
-        let pipeline = build_pipeline::<
-            NvCreateChatCompletionRequest,
-            NvCreateChatCompletionStreamResponse,
-        >(&card, engine, card.tokenizer_hf()?)
-        .await?;
-
-        // Verify pipeline was created
-        assert!(Arc::strong_count(&pipeline) >= 1);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_build_completions_pipeline_core_engine_succeeds() -> anyhow::Result<()> {
-        // Create test model card
-        let card = ModelDeploymentCard::load(HF_PATH, None)?;
-        let engine = crate::engines::make_engine_core();
-
-        // Build pipeline for completions
-        let pipeline = build_pipeline::<NvCreateCompletionRequest, NvCreateCompletionResponse>(
-            &card,
-            engine,
-            card.tokenizer_hf()?,
-        )
-        .await?;
-
-        // Verify pipeline was created
-        assert!(Arc::strong_count(&pipeline) >= 1);
-
-        Ok(())
-    }
-}
```

lib/llm/src/entrypoint/input/endpoint.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -156,8 +156,8 @@ mod integration_tests {
             .await
             .map_err(|e| anyhow::anyhow!("Failed to create distributed runtime: {}", e))?;
 
-        let engine_config = EngineConfig::StaticCore {
-            engine: crate::engines::make_engine_core(),
+        let engine_config = EngineConfig::StaticFull {
+            engine: crate::engines::make_echo_engine(),
             model: Box::new(
                 crate::local_model::LocalModelBuilder::default()
                     .model_name(Some("test-model".to_string()))
```
