
Commit 7f2839b (1 parent: 4f657c3)

WIP/POC commit - may want to split apart into separate PR

File tree: 4 files changed, +75 −35 lines

launch/dynamo-run/src/input/http.rs

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ pub async fn run(
         .port(flags.http_port)
         .enable_chat_endpoints(true)
         .enable_cmpl_endpoints(true)
+        .enable_embeddings_endpoints(true)
         .with_request_template(template)
         .build()?;
     match engine_config {
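Since this flag enables the HTTP service's OpenAI-style embeddings routes, a quick smoke test is possible once a server is up. A minimal sketch, assuming the server listens on localhost at the configured --http-port (8080 is a placeholder) and serves the standard /v1/embeddings path; the model name is hypothetical:

# Smoke test for the newly enabled embeddings endpoint.
# Host, port, and model name are placeholders, not values from this commit.
import requests

resp = requests.post(
    "http://localhost:8080/v1/embeddings",
    json={"model": "my-embedding-model", "input": "Hello, world"},
    timeout=30,
)
resp.raise_for_status()
body = resp.json()
# Expect the OpenAI-style list shape assembled by EmbeddingRequestHandler below:
# {"object": "list", "data": [{"index": 0, "object": "embedding", ...}], "usage": {...}}
print(len(body["data"][0]["embedding"]), "dims;", body["usage"]["prompt_tokens"], "prompt tokens")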

launch/dynamo-run/src/subprocess/sglang_inc.py

Lines changed: 42 additions & 6 deletions

@@ -77,6 +77,38 @@ async def generate(self, request):
             num_output_tokens_so_far = next_total_toks
 
 
+class EmbeddingRequestHandler(RequestHandler):
+    """
+    Request handler for the embedding endpoint
+    """
+
+    async def generate(self, request):
+        gen = await self.engine_client.async_encode(prompt=request["input"])
+        tokens = 0
+        embeddings = []
+        for idx, res in enumerate(gen):
+            embeddings.append(
+                {
+                    "index": idx,
+                    "object": "embedding",
+                    "embedding": res["embedding"],
+                }
+            )
+            tokens += res["meta_info"]["prompt_tokens"]
+
+        out = {
+            "object": "list",
+            "model": "TODO",
+            "data": embeddings,
+            "usage": {
+                "prompt_tokens": tokens,
+                "total_tokens": tokens,
+            },
+        }
+
+        yield out
+
+
 @dynamo_worker(static=False)
 async def worker(runtime: DistributedRuntime):
     await init(runtime, cmd_line_args())
@@ -94,8 +126,8 @@ async def init(runtime: DistributedRuntime, config: Config):
         "base_gpu_id": config.base_gpu_id,
     }
 
-    if config.kv_block_size:
-        arg_map["page_size"] = config.kv_block_size
+    # if config.kv_block_size:
+    #     arg_map["page_size"] = config.kv_block_size
 
     if config.context_length:
         arg_map["context_length"] = config.context_length
@@ -129,13 +161,18 @@ async def init(runtime: DistributedRuntime, config: Config):
     await component.create_service()
 
     endpoint = component.endpoint(config.endpoint)
-    await register_llm(
-        ModelType.Backend, endpoint, config.model_path, config.model_name
+    model_type = (
+        ModelType.Backend if not engine_args.is_embedding else ModelType.Embedding
     )
+    await register_llm(model_type, endpoint, config.model_path, config.model_name)
 
     # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
     # after the lease is revoked
-    await endpoint.serve_endpoint(RequestHandler(engine_client).generate)
+    await endpoint.serve_endpoint(
+        RequestHandler(engine_client).generate
+        if not engine_args.is_embedding
+        else EmbeddingRequestHandler(engine_client).generate
+    )
 
 
 def cmd_line_args():
@@ -230,7 +267,6 @@ def cmd_line_args():
     config.node_rank = args.node_rank
     config.dist_init_addr = args.dist_init_addr
    config.extra_engine_args = args.extra_engine_args
-
     return config
 
 
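The new handler can also be exercised without a live SGLang engine by stubbing the one method generate() touches. A rough harness, assuming EmbeddingRequestHandler (from the diff above) is in scope and that the RequestHandler base class simply stores engine_client as self.engine_client; FakeEngine and its return shape are assumptions modeled on how generate() reads res, not part of the commit:

# Hypothetical harness; only EmbeddingRequestHandler comes from the diff above.
import asyncio

class FakeEngine:
    async def async_encode(self, prompt):
        # Mimics the assumed async_encode result: one dict per input, each
        # carrying "embedding" and "meta_info.prompt_tokens".
        return [
            {"embedding": [0.1, 0.2, 0.3], "meta_info": {"prompt_tokens": 4}},
            {"embedding": [0.4, 0.5, 0.6], "meta_info": {"prompt_tokens": 3}},
        ]

async def main():
    handler = EmbeddingRequestHandler(FakeEngine())
    async for out in handler.generate({"input": ["first text", "second text"]}):
        # usage.prompt_tokens should sum to 7 across the two inputs
        print(out["usage"], "entries:", len(out["data"]))

asyncio.run(main())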

lib/llm/src/http/service/service_v2.rs

Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ pub struct HttpServiceConfig {
     #[builder(default = "true")]
     enable_cmpl_endpoints: bool,
 
-    #[builder(default = "false")]
+    #[builder(default = "true")]
     enable_embeddings_endpoints: bool,
 
     #[builder(default = "None")]
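Flipping this builder default to true means every HttpService now serves the embeddings routes unless a caller explicitly opts out, which also makes the .enable_embeddings_endpoints(true) call added in http.rs above redundant; given the WIP/POC commit message, presumably one of the two changes gets dropped when this is split into a proper PR.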

lib/llm/src/preprocessor/prompt/template/formatters.rs

Lines changed: 31 additions & 28 deletions

@@ -42,9 +42,9 @@ impl HfTokenizerConfigJsonFormatter {
     pub fn new(config: ChatTemplate, mixins: ContextMixins) -> anyhow::Result<Self> {
         let mut env = JinjaEnvironment::default().env();
 
-        let chat_template = config.chat_template.as_ref().ok_or(anyhow::anyhow!(
-            "chat_template field is required in the tokenizer_config.json file"
-        ))?;
+        // let chat_template = config.chat_template.as_ref().ok_or(anyhow::anyhow!(
+        //     "chat_template field is required in the tokenizer_config.json file"
+        // ))?;
 
         // add pycompat
         // todo: should we use this: minijinja_contrib::add_to_environment(&mut env);
@@ -57,40 +57,43 @@ impl HfTokenizerConfigJsonFormatter {
 
         let mut supports_add_generation_prompt = None;
 
-        match &chat_template.0 {
-            Either::Left(x) => {
-                if x.contains("add_generation_prompt") {
-                    tracing::debug!("Chat template contains `add_generation_prompt` key. This model supports add_generation_prompt.");
-                    supports_add_generation_prompt = Some(true);
+        if let Some(chat_template) = config.chat_template.as_ref() {
+            match &chat_template.0 {
+                Either::Left(x) => {
+                    if x.contains("add_generation_prompt") {
+                        tracing::debug!("Chat template contains `add_generation_prompt` key. This model supports add_generation_prompt.");
+                        supports_add_generation_prompt = Some(true);
+                    }
+                    env.add_template_owned("default", x.to_string())?;
+                    env.add_template_owned("tool_use", x.to_string())?;
                 }
-                env.add_template_owned("default", x.to_string())?;
-                env.add_template_owned("tool_use", x.to_string())?;
-            }
-            Either::Right(map) => {
-                for t in map {
-                    for (k, v) in t.iter() {
-                        if v.contains("add_generation_prompt") {
-                            match supports_add_generation_prompt {
-                                Some(true) | None => {
-                                    tracing::debug!("Chat template contains `add_generation_prompt` key. This model supports add_generation_prompt.");
-                                    supports_add_generation_prompt = Some(true);
-                                }
-                                Some(false) => {
-                                    tracing::warn!("Not all templates contain `add_generation_prompt` key. This model does not support add_generation_prompt.");
+                Either::Right(map) => {
+                    for t in map {
+                        for (k, v) in t.iter() {
+                            if v.contains("add_generation_prompt") {
+                                match supports_add_generation_prompt {
+                                    Some(true) | None => {
+                                        tracing::debug!("Chat template contains `add_generation_prompt` key. This model supports add_generation_prompt.");
+                                        supports_add_generation_prompt = Some(true);
+                                    }
+                                    Some(false) => {
+                                        tracing::warn!("Not all templates contain `add_generation_prompt` key. This model does not support add_generation_prompt.");
+                                    }
                                 }
+                            } else {
+                                supports_add_generation_prompt = Some(false);
                             }
-                        } else {
-                            supports_add_generation_prompt = Some(false);
+                            env.add_template_owned(k.to_string(), v.to_string())?;
                         }
-                        env.add_template_owned(k.to_string(), v.to_string())?;
                     }
-                }
-                if env.templates().count() == 0 {
-                    anyhow::bail!("Chat template does not contain a `tool_use` or `default` key. Please ensure it contains at least a `default` key, although `tool_use` should be specified for using tools.");
+                    if env.templates().count() == 0 {
+                        anyhow::bail!("Chat template does not contain a `tool_use` or `default` key. Please ensure it contains at least a `default` key, although `tool_use` should be specified for using tools.");
+                    }
                 }
             }
         }
 
+
         Ok(HfTokenizerConfigJsonFormatter {
             env,
             config,
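Net effect of this change: a tokenizer_config.json without a chat_template field (typical for embedding models) no longer fails hard; the whole template-registration block is skipped and the minijinja environment is left empty, so any later chat-prompt rendering presumably has to tolerate a formatter with no templates.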
