components/backends/vllm/src/dynamo/vllm/args.py (2 changes: 1 addition & 1 deletion)
@@ -117,7 +117,7 @@ def parse_args() -> Config:
         "--dyn-reasoning-parser",
         type=str,
         default=None,
-        help="Reasoning parser name for the model.",
+        help="Reasoning parser name for the model. Available options: 'basic', 'deepseek_r1', 'gpt_oss'.",
     )

     parser = AsyncEngineArgs.add_cli_args(parser)
lib/bindings/python/Cargo.lock (1 change: 0 additions & 1 deletion)

Some generated files are not rendered by default.

lib/llm/src/engines.rs (3 changes: 2 additions & 1 deletion)
@@ -14,6 +14,7 @@ use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
 use dynamo_runtime::protocols::annotated::Annotated;

 use crate::backend::ExecutionContext;
+use crate::local_model::runtime_config;
 use crate::preprocessor::PreprocessedRequest;
 use crate::protocols::common::llm_backend::LLMEngineOutput;
 use crate::protocols::openai::{
@@ -183,7 +184,7 @@ impl
         incoming_request: SingleIn<NvCreateChatCompletionRequest>,
     ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
-        let mut deltas = request.response_generator();
+        let mut deltas = request.response_generator(runtime_config::ModelRuntimeConfig::default());
         let ctx = context.context();
         let req = request.inner.messages.into_iter().next_back().unwrap();
lib/llm/src/local_model.rs (2 changes: 2 additions & 0 deletions)
@@ -202,6 +202,7 @@ impl LocalModelBuilder {
         );
         card.migration_limit = self.migration_limit;
         card.user_data = self.user_data.take();
+        card.runtime_config = self.runtime_config.clone();

         return Ok(LocalModel {
             card,
@@ -276,6 +277,7 @@

         card.migration_limit = self.migration_limit;
         card.user_data = self.user_data.take();
+        card.runtime_config = self.runtime_config.clone();

         Ok(LocalModel {
             card,
lib/llm/src/model_card.rs (6 changes: 6 additions & 0 deletions)
@@ -19,6 +19,7 @@ use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::Duration;

+use crate::local_model::runtime_config::ModelRuntimeConfig;
 use anyhow::{Context, Result};
 use derive_builder::Builder;
 use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports::nats};
@@ -137,6 +138,9 @@ pub struct ModelDeploymentCard {
     /// User-defined metadata for custom worker behavior
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub user_data: Option<serde_json::Value>,
+
+    #[serde(default)]
+    pub runtime_config: ModelRuntimeConfig,
 }

 impl ModelDeploymentCard {
@@ -441,6 +445,7 @@
             kv_cache_block_size: 0,
             migration_limit: 0,
             user_data: None,
+            runtime_config: ModelRuntimeConfig::default(),
         })
     }

@@ -482,6 +487,7 @@
             kv_cache_block_size: 0, // set later
             migration_limit: 0,
             user_data: None,
+            runtime_config: ModelRuntimeConfig::default(),
         })
     }
 }
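The `ModelRuntimeConfig` type itself is defined in `lib/llm/src/local_model/runtime_config.rs` and is not part of this diff, so the following is only a sketch of the minimal shape the changes above rely on: `Clone` and `Default` for the call sites, serde derives so the `#[serde(default)]` card field deserializes, and an optional parser name (implied by `reasoning_parser.as_deref()` in `delta.rs`). The new `--dyn-reasoning-parser` flag presumably lands in this field via `LocalModelBuilder`.

```rust
use serde::{Deserialize, Serialize};

// Hypothetical sketch, not the repo's actual definition; the real struct
// may carry additional runtime options.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct ModelRuntimeConfig {
    /// Parser name resolved by `get_reasoning_parser_from_name`:
    /// "basic", "deepseek_r1", or "gpt_oss". `None` falls back to "basic".
    pub reasoning_parser: Option<String>,
}
```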
lib/llm/src/preprocessor.rs (7 changes: 6 additions & 1 deletion)
@@ -22,6 +22,7 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::{collections::HashMap, sync::Arc};
 use tracing;

+use crate::local_model::runtime_config::ModelRuntimeConfig;
 use crate::model_card::{ModelDeploymentCard, ModelInfo, TokenizerKind};
 use crate::preprocessor::prompt::OAIChatLikeRequest;
 use crate::tokenizers::Encoding;
@@ -94,6 +95,7 @@ pub struct OpenAIPreprocessor {
     formatter: Arc<dyn OAIPromptFormatter>,
     tokenizer: Arc<dyn Tokenizer>,
     model_info: Arc<dyn ModelInfo>,
+    runtime_config: ModelRuntimeConfig,
 }

 impl OpenAIPreprocessor {
@@ -121,11 +123,14 @@ impl OpenAIPreprocessor {
         };
         let model_info = model_info.get_model_info().await?;

+        let runtime_config = mdc.runtime_config.clone();
+
         Ok(Arc::new(Self {
             formatter,
             tokenizer,
             model_info,
             mdcsum,
+            runtime_config,
         }))
     }

@@ -494,7 +499,7 @@ impl
         let (request, context) = request.into_parts();

         // create a response generator
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(self.runtime_config.clone());
         let mut response_generator = Box::new(response_generator);

         // convert the chat completion request to a common completion request
lib/llm/src/protocols/openai/chat_completions/delta.rs (19 changes: 15 additions & 4 deletions)
@@ -5,6 +5,7 @@ use dynamo_parsers::{ParserResult, ReasoningParser, ReasoningParserType, Reasoni

 use super::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse};
 use crate::{
+    local_model::runtime_config,
     protocols::common::{self},
     types::TokenIdType,
 };
@@ -15,11 +16,15 @@ impl NvCreateChatCompletionRequest {
     ///
     /// # Returns
     /// * [`DeltaGenerator`] configured with model name and response options.
-    pub fn response_generator(&self) -> DeltaGenerator {
+    pub fn response_generator(
+        &self,
+        runtime_config: runtime_config::ModelRuntimeConfig,
+    ) -> DeltaGenerator {
         let options = DeltaGeneratorOptions {
             enable_usage: true,
             enable_logprobs: self.inner.logprobs.unwrap_or(false)
                 || self.inner.top_logprobs.unwrap_or(0) > 0,
+            runtime_config,
         };

         DeltaGenerator::new(self.inner.model.clone(), options)
@@ -33,6 +38,8 @@ pub struct DeltaGeneratorOptions {
     pub enable_usage: bool,
     /// Determines whether log probabilities should be included in the response.
     pub enable_logprobs: bool,
+
+    pub runtime_config: runtime_config::ModelRuntimeConfig,
 }

 /// Generates incremental chat completion responses in a streaming fashion.
@@ -92,10 +99,14 @@ impl DeltaGenerator {
         // This is hardcoded for now, but can be made configurable later.
         // TODO: Make parser type configurable once front-end integration is determined
         // Change to GptOss to test GptOSS parser
-        let reasoning_parser_type = ReasoningParserType::Basic;
-
         // Reasoning parser wrapper
-        let reasoning_parser = reasoning_parser_type.get_reasoning_parser();
+        let reasoning_parser = ReasoningParserType::get_reasoning_parser_from_name(
+            options
+                .runtime_config
+                .reasoning_parser
+                .as_deref()
+                .unwrap_or("basic"),
+        );

         Self {
             id: format!("chatcmpl-{}", uuid::Uuid::new_v4()),
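A hedged sketch of the new call shape, with module paths taken from this PR's own imports; `start_stream` is a hypothetical caller, not repo code. Call sites with no deployment card to draw from, such as the echo engine in `engines.rs` above and the `http-service.rs` test below, pass `ModelRuntimeConfig::default()`, which leaves `reasoning_parser` unset and therefore selects the Basic parser.

```rust
use dynamo_llm::{
    local_model::runtime_config::ModelRuntimeConfig,
    protocols::openai::chat_completions::NvCreateChatCompletionRequest,
};

// Hypothetical caller: thread the card's runtime config into the stream's
// delta generator so the configured reasoning parser is used.
fn start_stream(request: NvCreateChatCompletionRequest, cfg: ModelRuntimeConfig) {
    // Before this PR the generator hardcoded ReasoningParserType::Basic;
    // now the parser is resolved from cfg.reasoning_parser at construction.
    let _deltas = request.response_generator(cfg);
}
```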
lib/llm/tests/http-service.rs (29 changes: 17 additions & 12 deletions)
@@ -16,17 +16,6 @@
 use anyhow::Error;
 use async_stream::stream;
 use dynamo_async_openai::config::OpenAIConfig;
-use dynamo_llm::http::{
-    client::{
-        GenericBYOTClient, HttpClientConfig, HttpRequestContext, NvCustomClient, PureOpenAIClient,
-    },
-    service::{
-        Metrics,
-        error::HttpError,
-        metrics::{Endpoint, FRONTEND_METRIC_PREFIX, RequestType, Status},
-        service_v2::HttpService,
-    },
-};
 use dynamo_llm::protocols::{
     Annotated,
     codec::SseLineCodec,
@@ -36,6 +25,21 @@ use dynamo_llm::protocols::{
         completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
     },
 };
+use dynamo_llm::{
+    http::{
+        client::{
+            GenericBYOTClient, HttpClientConfig, HttpRequestContext, NvCustomClient,
+            PureOpenAIClient,
+        },
+        service::{
+            Metrics,
+            error::HttpError,
+            metrics::{Endpoint, FRONTEND_METRIC_PREFIX, RequestType, Status},
+            service_v2::HttpService,
+        },
+    },
+    local_model::runtime_config,
+};
 use dynamo_runtime::{
     CancellationToken,
     engine::AsyncEngineContext,
@@ -95,7 +99,8 @@ impl
         let max_tokens = request.inner.max_tokens.unwrap_or(0) as u64;

         // let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
-        let mut generator = request.response_generator();
+        let mut generator =
+            request.response_generator(runtime_config::ModelRuntimeConfig::default());

         let stream = stream! {
             tokio::time::sleep(std::time::Duration::from_millis(max_tokens)).await;
lib/parsers/src/reasoning/mod.rs (16 changes: 16 additions & 0 deletions)
@@ -115,4 +115,20 @@ impl ReasoningParserType {
             },
         }
     }
+
+    pub fn get_reasoning_parser_from_name(name: &str) -> ReasoningParserWrapper {
+        tracing::debug!("Selected reasoning parser: {}", name);
+        match name.to_lowercase().as_str() {
+            "deepseek_r1" => Self::DeepseekR1.get_reasoning_parser(),
+            "basic" => Self::Basic.get_reasoning_parser(),
+            "gpt_oss" => Self::GptOss.get_reasoning_parser(),
+            _ => {
+                tracing::warn!(
+                    "Unknown reasoning parser type '{}', falling back to Basic Reasoning Parser",
+                    name
+                );
+                Self::Basic.get_reasoning_parser()
+            }
+        }
+    }
 }
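A minimal usage sketch of the new lookup, assuming `ReasoningParserWrapper` is exported from `dynamo_parsers` alongside `ReasoningParserType` (the truncated import in `delta.rs` suggests it is). Matching is case-insensitive via `to_lowercase`, and unknown names warn and degrade to Basic instead of erroring, which is why the `unwrap_or("basic")` call site in `delta.rs` needs no error handling.

```rust
use dynamo_parsers::{ReasoningParserType, ReasoningParserWrapper};

fn main() {
    // Known names resolve to their dedicated parsers (case-insensitive).
    let _r1: ReasoningParserWrapper =
        ReasoningParserType::get_reasoning_parser_from_name("DeepSeek_R1");
    // Unknown names log a warning and fall back to the Basic parser.
    let _fallback = ReasoningParserType::get_reasoning_parser_from_name("not-a-parser");
}
```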