From 29b81602cb338b0ffcbf8a2b1b7941b0a429fd13 Mon Sep 17 00:00:00 2001
From: Graham King <grahamk@nvidia.com>
Date: Wed, 12 Mar 2025 14:55:02 -0400
Subject: [PATCH 1/2] feat(pystr): Pass command line arguments

Command line arguments are passed to the python engine like this:
```
dynamo-run out=pystr:my_python_engine.py -- -n 42 --custom-arg Orange --yes
```

The python engine has two options for receiving those command line arguments (both of which are optional).

In both cases the argument list will include some standard ones as well as anything after the `--`.

This input:
```
dynamo-run out=pystr:my_engine.py /opt/models/Llama-3.2-3B-Instruct/ --model-name llama_3.2 --tensor-parallel-size 4 -- -n 1
```

1. In `sys.argv`:

```
async def generate(request):
    .. as before ..

if __name__ == "__main__":
    print(f"MAIN: {sys.argv}")
```

Produces this output:
```
MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
```

This form allows quick iteration on the engine setup.

2. In an `initialize` function:

```
async def generate(request):
    .. as before ..

def initialize(args: list[str]) -> None:
    print(f"initialize: {args}")
```

Produces this output:
```
initialize: ['--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
```

Note how in both cases the `-n` `1` is included.

Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`.
---
 launch/dynamo-run/README.md    | 52 +++++++++++++++++++++++++++++++
 launch/dynamo-run/src/flags.rs | 50 +++++++++++++++++++++++++++---
 launch/dynamo-run/src/lib.rs   | 10 ++++--
 lib/llm/src/engines/python.rs  | 56 +++++++++++++++++++++++++---------
 4 files changed, 146 insertions(+), 22 deletions(-)

diff --git a/launch/dynamo-run/README.md b/launch/dynamo-run/README.md
index 5503832807..efbdf996d8 100644
--- a/launch/dynamo-run/README.md
+++ b/launch/dynamo-run/README.md
@@ -211,6 +211,56 @@ async def generate(request):
     yield {"id":"1","choices":[{"index":0,"delta":{"content":"","role":"assistant"},"finish_reason":"stop"}],"created":1841762283,"model":"Llama-3.2-1B-Instruct","system_fingerprint":"local","object":"chat.completion.chunk"}
 ```
 
+Command line arguments are passed to the python engine like this:
+```
+dynamo-run out=pystr:my_python_engine.py -- -n 42 --custom-arg Orange --yes
+```
+
+The python engine has two options for receiving those command line arguments (both of which are optional).
+
+In both cases the argument list will include some standard ones as well as anything after the `--`.
+
+This input:
+```
+dynamo-run out=pystr:my_engine.py /opt/models/Llama-3.2-3B-Instruct/ --model-name llama_3.2 --tensor-parallel-size 4 -- -n 1
+```
+
+1. In `sys.argv`:
+
+```
+async def generate(request):
+    .. as before ..
+
+if __name__ == "__main__":
+    print(f"MAIN: {sys.argv}")
+```
+
+Produces this output:
+```
+MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
+```
+
+This form allows quick iteration on the engine setup.
+
+2. In an `initialize` function:
+
+```
+async def generate(request):
+    .. as before ..
+
+def initialize(args: list[str]) -> None:
+    print(f"initialize: {args}")
+```
+
+Produces this output:
+```
+initialize: ['--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
+```
+
+Note how in both cases the `-n` `1` is included.
+
+Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`.
+
 ### Dynamo does the pre-processing
 
 If the Python engine wants to receive and return tokens - the prompt templating and tokenization is already done - run it like this:
@@ -250,6 +300,8 @@ async def generate(request):
     yield {"token_ids":[13]}
 ```
 
+`pytok` supports the same ways of passing command line arguments as `pystr` - `initialize` or `main` with `sys.argv`.
+
 ## trtllm
 
 TensorRT-LLM. Requires `clang` and `libclang-dev`.
diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs
index dcf7863d8b..6a8fd18b7e 100644
--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -93,8 +93,7 @@ pub struct Flags {
 
     /// Internal use only.
     // Start the python vllm engine sub-process.
-    #[arg(long)]
-    #[clap(hide = true, default_value = "false")]
+    #[arg(long, hide = true, default_value = "false")]
     pub internal_vllm_process: bool,
 
     /// Internal use only.
@@ -104,9 +103,52 @@ pub struct Flags {
     /// - the node rank (0 for first host, 1 for second host, etc)
     /// - the workers' rank (globally unique)
     /// - the GPU to use (locally unique)
-    #[arg(long)]
-    #[clap(hide = true, value_parser = parse_sglang_flags)]
+    #[arg(long, hide = true, value_parser = parse_sglang_flags)]
     pub internal_sglang_process: Option<SgLangFlags>,
+
+    /// Everything after a `--`.
+    /// These are the command line arguments to the python engine when using `pystr` or `pytok`.
+    #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
+    pub last: Vec<String>,
+}
+
+impl Flags {
+    /// Convert the flags back to a command line. Including only the non-null values, but
+    /// include the defaults. Includes the canonicalized model path and normalized model name.
+    ///
+    /// Used to pass arguments to python engines via `pystr` and `pytok`.
+    pub fn as_vec(&self, path: &str, name: &str) -> Vec<String> {
+        let mut out = vec![
+            "--model-path".to_string(),
+            path.to_string(),
+            "--model-name".to_string(),
+            name.to_string(),
+            "--http-port".to_string(),
+            self.http_port.to_string(),
+            // Default 1
+            "--tensor-parallel-size".to_string(),
+            self.tensor_parallel_size.to_string(),
+            // Default 0
+            "--base-gpu-id".to_string(),
+            self.base_gpu_id.to_string(),
+            // Default 1
+            "--num-nodes".to_string(),
+            self.num_nodes.to_string(),
+            // Default 0
+            "--node-rank".to_string(),
+            self.node_rank.to_string(),
+        ];
+        if let Some(model_config_path) = self.model_config.as_ref() {
+            out.push("--model-config".to_string());
+            out.push(model_config_path.display().to_string());
+        }
+        if let Some(leader) = self.leader_addr.as_ref() {
+            out.push("--leader-addr".to_string());
+            out.push(leader.to_string());
+        }
+        out.extend(self.last.clone());
+        out
+    }
 }
 
 #[derive(Debug, Clone, Copy)]
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 4270c3092b..9e7218ceb1 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -82,7 +82,8 @@ pub async fn run(
     // Turn relative paths into absolute paths
     let model_path = flags
         .model_path_pos
-        .or(flags.model_path_flag)
+        .clone()
+        .or(flags.model_path_flag.clone())
         .and_then(|p| {
             if p.exists() {
                 p.canonicalize().ok()
@@ -93,6 +94,7 @@ pub async fn run(
     // Serve the model under the name provided, or the name of the GGUF file or HF repo.
     let model_name = flags
         .model_name
+        .clone()
         .or_else(|| {
             model_path
                 .as_ref()
@@ -338,8 +340,9 @@ pub async fn run(
             let Some(model_name) = model_name else {
                 anyhow::bail!("Provide model service name as `--model-name <this>`");
             };
+            let py_args = flags.as_vec(&path_str, &model_name);
             let p = std::path::PathBuf::from(path_str);
-            let engine = python::make_string_engine(cancel_token.clone(), &p).await?;
+            let engine = python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
             EngineConfig::StaticFull {
                 service_name: model_name,
                 engine,
@@ -354,8 +357,9 @@ pub async fn run(
             let Some(model_name) = model_name else {
                 unreachable!("If we have a card we must have a model name");
             };
+            let py_args = flags.as_vec(&path_str, &model_name);
             let p = std::path::PathBuf::from(path_str);
-            let engine = python::make_token_engine(cancel_token.clone(), &p).await?;
+            let engine = python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
             EngineConfig::StaticCore {
                 service_name: model_name.clone(),
                 engine,
diff --git a/lib/llm/src/engines/python.rs b/lib/llm/src/engines/python.rs
index acc35647e5..16f6b61642 100644
--- a/lib/llm/src/engines/python.rs
+++ b/lib/llm/src/engines/python.rs
@@ -16,6 +16,7 @@
 use std::ffi::CStr;
 use std::{path::Path, sync::Arc};
 
+use anyhow::Context;
 use dynamo_runtime::pipeline::error as pipeline_error;
 pub use dynamo_runtime::{
     error,
@@ -43,12 +44,11 @@ const PY_IMPORT: &CStr = cr#"
 import importlib.util
 import sys
 
-module_name = file_path.split("/")[-1].replace(".py", "")
-spec = importlib.util.spec_from_file_location(module_name, file_path)
-
+spec = importlib.util.spec_from_file_location("__main__", file_path)
 module = importlib.util.module_from_spec(spec)
 
-sys.modules[module_name] = module
+sys.argv = sys_argv
+sys.modules["__main__"] = module
 spec.loader.exec_module(module)
 "#;
 
@@ -56,10 +56,11 @@ spec.loader.exec_module(module)
 pub async fn make_string_engine(
     cancel_token: CancellationToken,
     py_file: &Path,
+    py_args: Vec<String>,
 ) -> pipeline_error::Result<OpenAIChatCompletionsStreamingEngine> {
     pyo3::prepare_freethreaded_python();
 
-    let engine = new_engine(cancel_token, py_file).await?;
+    let engine = new_engine(cancel_token, py_file, py_args).await?;
     let engine: OpenAIChatCompletionsStreamingEngine = Arc::new(engine);
     Ok(engine)
 }
@@ -68,10 +69,11 @@ pub async fn make_string_engine(
 pub async fn make_token_engine(
     cancel_token: CancellationToken,
     py_file: &Path,
+    py_args: Vec<String>,
 ) -> pipeline_error::Result<ExecutionContext> {
     pyo3::prepare_freethreaded_python();
 
-    let engine = new_engine(cancel_token, py_file).await?;
+    let engine = new_engine(cancel_token, py_file, py_args).await?;
     let engine: ExecutionContext = Arc::new(engine);
     Ok(engine)
 }
@@ -86,13 +88,28 @@ pub struct PythonServerStreamingEngine {
 async fn new_engine(
     cancel_token: CancellationToken,
     py_file: &Path,
+    py_args: Vec<String>,
 ) -> anyhow::Result<PythonServerStreamingEngine> {
     let (tx, rx) = tokio::sync::oneshot::channel();
     tokio::task::spawn_blocking(move || run_asyncio(tx));
     let event_loop = rx.await?;
 
-    let user_module = python_file_to_module(py_file)?;
-    let generator = Python::with_gil(|py| user_module.getattr(py, "generate").unwrap());
+    let user_module = python_file_to_module(py_file, py_args.clone())
+        .with_context(|| py_file.display().to_string())?;
+    let generator = Python::with_gil(|py| {
+        if let Ok(initialize) = user_module.getattr(py, "initialize") {
+            initialize
+                .call1(py, (py_args,))
+                .inspect_err(|err| {
+                    println!();
+                    err.display(py);
+                })
+                .with_context(|| "Failed calling python engine's initialize(args)")?;
+        };
+        user_module
+            .getattr(py, "generate")
+            .with_context(|| "generate")
+    })?;
     Ok(PythonServerStreamingEngine::new(
         cancel_token,
         Arc::new(generator),
@@ -127,16 +144,25 @@ fn run_asyncio(tx: Sender<Arc<PyObject>>) {
     });
 }
 
-fn python_file_to_module(p: &Path) -> Result<PyObject> {
+fn python_file_to_module(p: &Path, mut py_args: Vec<String>) -> Result<PyObject> {
+    if let Some(filename) = p.file_name() {
+        py_args.insert(0, filename.to_string_lossy().to_string());
+    };
     let module: PyObject = Python::with_gil(|py| {
-        let globals = [("file_path", p.display().to_string())]
+        let py_file_path: PyObject = p.display().to_string().into_pyobject(py).unwrap().into();
+        let py_sys_argv: PyObject = py_args.into_pyobject(py).unwrap().into();
+        let globals = [("file_path", py_file_path), ("sys_argv", py_sys_argv)]
             .into_py_dict(py)
-            .unwrap();
+            .context("into_py_dict")?;
         let locals = PyDict::new(py);
-        py.run(PY_IMPORT, Some(&globals), Some(&locals)).unwrap();
-        let module = locals.get_item("module").unwrap().unwrap();
-        module.extract().unwrap()
-    });
+        py.run(PY_IMPORT, Some(&globals), Some(&locals))
+            .context("PY_IMPORT")?;
+        let module = locals
+            .get_item("module")
+            .unwrap()
+            .context("get module after import")?;
+        module.extract().context("extract")
+    })?;
     Ok(module)
 }
 

From 22235c0dab291b347452930a5c3c6451e78436b8 Mon Sep 17 00:00:00 2001
From: Graham King <grahamk@nvidia.com>
Date: Wed, 12 Mar 2025 15:28:34 -0400
Subject: [PATCH 2/2] Remove the `initialize` method.

Less is more.
---
 launch/dynamo-run/README.md   | 30 ++++--------------------------
 lib/llm/src/engines/python.rs |  6 ++++--
 2 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/launch/dynamo-run/README.md b/launch/dynamo-run/README.md
index efbdf996d8..4db701039b 100644
--- a/launch/dynamo-run/README.md
+++ b/launch/dynamo-run/README.md
@@ -216,17 +216,14 @@ Command line arguments are passed to the python engine like this:
 dynamo-run out=pystr:my_python_engine.py -- -n 42 --custom-arg Orange --yes
 ```
 
-The python engine has two options for receiving those command line arguments (both of which are optional).
-
-In both cases the argument list will include some standard ones as well as anything after the `--`.
+The python engine receives the arguments in `sys.argv`. The argument list will include some standard ones as well as anything after the `--`.
 
 This input:
 ```
 dynamo-run out=pystr:my_engine.py /opt/models/Llama-3.2-3B-Instruct/ --model-name llama_3.2 --tensor-parallel-size 4 -- -n 1
 ```
 
-1. In `sys.argv`:
-
+is read like this:
 ```
 async def generate(request):
     .. as before ..
@@ -235,31 +232,12 @@ if __name__ == "__main__":
     print(f"MAIN: {sys.argv}")
 ```
 
-Produces this output:
+and produces this output:
 ```
 MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
 ```
 
-This form allows quick iteration on the engine setup.
-
-2. In an `initialize` function:
-
-```
-async def generate(request):
-    .. as before ..
-
-def initialize(args: list[str]) -> None:
-    print(f"initialize: {args}")
-```
-
-Produces this output:
-```
-initialize: ['--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--model-name', 'llama3.2', '--http-port', '8080', '--tensor-parallel-size', '4', '--base-gpu-id', '0', '--num-nodes', '1', '--node-rank', '0', '-n', '1']
-```
-
-Note how in both cases the `-n` `1` is included.
-
-Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`.
+This allows quick iteration on the engine setup. Note how the `-n` `1` is included. Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`.
 
 ### Dynamo does the pre-processing
 
diff --git a/lib/llm/src/engines/python.rs b/lib/llm/src/engines/python.rs
index 16f6b61642..4d7df7d966 100644
--- a/lib/llm/src/engines/python.rs
+++ b/lib/llm/src/engines/python.rs
@@ -94,9 +94,10 @@ async fn new_engine(
     tokio::task::spawn_blocking(move || run_asyncio(tx));
     let event_loop = rx.await?;
 
-    let user_module = python_file_to_module(py_file, py_args.clone())
-        .with_context(|| py_file.display().to_string())?;
+    let user_module =
+        python_file_to_module(py_file, py_args).with_context(|| py_file.display().to_string())?;
     let generator = Python::with_gil(|py| {
+        /* Leave commented, `initialize` may be needed to match Triton
         if let Ok(initialize) = user_module.getattr(py, "initialize") {
             initialize
                 .call1(py, (py_args,))
@@ -106,6 +107,7 @@ async fn new_engine(
                 })
                 .with_context(|| "Failed calling python engine's initialize(args)")?;
         };
+        */
         user_module
             .getattr(py, "generate")
             .with_context(|| "generate")