Merged (changes from 1 commit)
3 changes: 3 additions & 0 deletions docs/ramalama-run.1.md
@@ -64,6 +64,9 @@ Show this help message and exit
 #### **--keepalive**
 duration to keep a model loaded (e.g. 5m)
 
+#### **--max-model-len**
+Maximum model context length for vLLM, in tokens (default: 2048)
+
 #### **--name**, **-n**
 name of the container to run the Model in
 
3 changes: 3 additions & 0 deletions docs/ramalama-serve.1.md
@@ -87,6 +87,9 @@ show this help message and exit
 #### **--host**="0.0.0.0"
 IP address for llama.cpp to listen on.
 
+#### **--max-model-len**
+Maximum model context length for vLLM, in tokens (default: 2048)
+
 #### **--model-draft**
 
 
7 changes: 7 additions & 0 deletions ramalama/cli.py
@@ -857,6 +857,13 @@ def runtime_options(parser, command):
     parser.add_argument(
         "--rag", help="RAG vector database or OCI Image to be served with the model", completer=local_models
     )
+    parser.add_argument(
+        "--max-model-len",
+        dest="vllm_max_model_len",
+        type=int,
+        help="Maximum model length for vLLM",
+        completer=suppressCompleter,
+    )
     if command in ["perplexity", "run", "serve"]:
         parser.add_argument(
             "--runtime-args",
58 changes: 54 additions & 4 deletions ramalama/model.py
@@ -290,7 +290,28 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         return True
 
     def setup_mounts(self, model_path, args):
Contributor:
issue (code-quality): Low code quality found in Model.setup_mounts - 16% (low-code-quality)


Explanation: The quality score for this function is below the quality threshold of 25%. This score is a combination of the method length, cognitive complexity and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable.

  • Reduce the function length by extracting pieces of functionality out into their own functions. This is the most important thing you can do - ideally a function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts sits together within the function rather than being scattered.
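
One possible shape for the refactor this comment suggests: pull the vLLM mount resolution out into a small helper and use early exits. This is only an illustrative sketch against the new code shown below; the helper name vllm_mount_source is invented here, and store, model_tag, and MNT_DIR stand in for the attributes and constant used by ramalama's Model class.

import os

def vllm_mount_source(store, model_tag, model_path):
    """Return the host directory to bind-mount for vLLM, or an empty string if unknown."""
    if store is not None:
        ref_file = store.get_ref_file(model_tag)
        if ref_file is not None and getattr(ref_file, 'hash', None):
            return store.model_base_directory
    # Fall back to file:// paths handed directly to vLLM.
    if model_path and os.path.isfile(model_path):
        return os.path.dirname(model_path)
    if model_path and os.path.isdir(model_path):
        return model_path
    return ""

# The vLLM branch of setup_mounts could then shrink to roughly:
#
#     model_base = vllm_mount_source(self.store, self.model_tag, model_path)
#     if not model_base:
#         raise ValueError(...)
#     self.engine.add([f"--mount=type=bind,src={model_base},destination={MNT_DIR},ro"])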

-        if model_path and os.path.exists(model_path):
+        if args.runtime == "vllm":
+            model_base = ""
+            if self.store and hasattr(self, 'model_tag'):
+                ref_file = self.store.get_ref_file(self.model_tag)
+                if ref_file and hasattr(ref_file, 'hash'):
+                    model_base = self.store.model_base_directory
+            if not model_base:
+                # Might be needed for file:// paths directly used with vLLM.
+                if model_path and os.path.exists(model_path):
+                    if os.path.isfile(model_path):
+                        model_base = os.path.dirname(model_path)
+                    elif os.path.isdir(model_path):
+                        model_base = model_path
+            if model_base:
+                self.engine.add([f"--mount=type=bind,src={model_base},destination={MNT_DIR},ro"])
+            else:
+                raise ValueError(
+                    f'Could not determine a valid host directory to mount for model {self.model}. '
+                    + 'Ensure the model path is correct or the model store is properly configured.'
+                )
+
+        elif model_path and os.path.exists(model_path):
             if hasattr(self, 'split_model'):
                 self.engine.add([f"--mount=type=bind,src={model_path},destination={MNT_DIR}/{self.mnt_path},ro"])
 
@@ -531,9 +552,38 @@ def build_exec_args_serve(self, args, exec_model_path, chat_template_path="", mm
     def handle_runtime(self, args, exec_args, exec_model_path):
         set_accel_env_vars()
         if args.runtime == "vllm":
-            exec_model_path = os.path.dirname(exec_model_path)
-            # Left out "vllm", "serve" the image entrypoint already starts it
-            exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
+            container_model_path = ""
Contributor:
issue (code-quality): We've found these issues:

+            ref_file = None
+            if self.store:
+                ref_file = self.store.get_ref_file(self.model_tag)
+
+            if ref_file and ref_file.hash:
+                snapshot_dir_name = ref_file.hash
+                container_model_path = os.path.join(MNT_DIR, "snapshots", snapshot_dir_name)
+            else:
+                current_model_host_path = self.get_model_path(args)
+                if os.path.isdir(current_model_host_path):
+                    container_model_path = MNT_DIR
+                else:
+                    container_model_path = os.path.join(MNT_DIR, os.path.basename(current_model_host_path))
+
+            vllm_max_model_len = 2048
+            if args.vllm_max_model_len:
+                vllm_max_model_len = args.vllm_max_model_len
+
+            exec_args = [
+                "--port",
+                str(args.port),
+                "--model",
+                str(container_model_path),
+                "--max_model_len",
+                str(vllm_max_model_len),
+                "--served-model-name",
+                self.model_name,
+            ]
+
+            if hasattr(args, 'runtime_args') and args.runtime_args:
+                exec_args.extend(args.runtime_args)
         else:
             gpu_args = self.gpu_args(args=args)
             if gpu_args is not None:
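
As a rough illustration of the behavior above, with example values that are not part of the change: the container entrypoint ends up receiving an argument list like the one below, and 2048 is used whenever --max-model-len is not supplied.

# Standalone sketch mirroring the exec_args assembly in handle_runtime.
def build_vllm_args(port, container_model_path, served_model_name, vllm_max_model_len=None):
    max_len = vllm_max_model_len if vllm_max_model_len else 2048  # default matches the docs
    return [
        "--port", str(port),
        "--model", str(container_model_path),
        "--max_model_len", str(max_len),
        "--served-model-name", served_model_name,
    ]

# e.g. ramalama --runtime=vllm serve --port 8080 --max-model-len 4096 <model>
print(build_vllm_args(8080, "/mnt/models/snapshots/<hash>", "tinyllama", 4096))
# ['--port', '8080', '--model', '/mnt/models/snapshots/<hash>',
#  '--max_model_len', '4096', '--served-model-name', 'tinyllama']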
10 changes: 6 additions & 4 deletions test/system/040-serve.bats
@@ -250,6 +250,11 @@ verify_begin=".*run --rm"
 rm $name.image
 fi
 
+run_ramalama rm oci://${ociimage}
+done
+stop_registry
+skip "vLLM can't serve GGUFs, needs tiny safetensor"
+
 run_ramalama --runtime=vllm serve --authfile=$authfile --tls-verify=false --name=${name} --port 1234 --generate=kube oci://${ociimage}
 is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"
 
@@ -265,10 +270,7 @@ verify_begin=".*run --rm"
 is "$output" ".*reference: ${ociimage}" "AI image should be created"
 is "$output" ".*pullPolicy: IfNotPresent" "pullPolicy should exist"
 
-run_ramalama rm oci://${ociimage}
-rm $name.yaml
-done
-stop_registry
+rm $name.yaml
 }

