Commit 48fa9ed

Add support for llama-stack
Add a new option, --api, which allows users to specify the API server: either llama-stack or none. With none, we simply generate a service with the serve command. With `--api llama-stack`, RamaLama will generate a Llama Stack API server listening on port 8321 and an OpenAI-compatible server listening on port 8080.

Signed-off-by: Daniel J Walsh <[email protected]>
1 parent c87726d commit 48fa9ed
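
The two servers described above can be exercised with any OpenAI-compatible client. Below is a minimal sketch, not part of the commit: the port numbers come from the commit message, the /v1/models route matches what the bundled ramalama-client-core polls, and the JSON shape of the response is an assumption based on typical OpenAI-style servers.

    # Minimal sketch (not from this commit): query the OpenAI-compatible server
    # that `ramalama serve --api llama-stack` exposes on port 8080; the Llama
    # Stack API itself listens on port 8321, per the commit message.
    import json
    import urllib.request

    OPENAI_BASE = "http://localhost:8080/v1"  # assumed default from the commit message

    def list_models(base: str = OPENAI_BASE) -> list:
        # Same /v1/models route that the bundled ramalama-client-core polls.
        with urllib.request.urlopen(f"{base}/models") as response:
            payload = json.load(response)
        # The {"data": [{"id": ...}]} shape is an assumption (OpenAI-style listing).
        return [entry.get("id", "") for entry in payload.get("data", [])]

    if __name__ == "__main__":
        print(list_models())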

18 files changed, +352 -60 lines

docs/ramalama-run.1.md

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ URL support means if a model is on a web site or even on your local system, you
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry (default: none).
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries

docs/ramalama-serve.1.md

Lines changed: 4 additions & 0 deletions
@@ -35,6 +35,10 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry (default: none).
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries

docs/ramalama.conf

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,11 @@
 
 [ramalama]
 
+# Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+# Options: llama-stack, none
+#
+# api = "none"
+
 # OCI model car image
 # Image to use when building and pushing --type=car models
 #

docs/ramalama.conf.5.md

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ The ramalama table contains settings to configure and manage the OCI runtime.
 
 `[[ramalama]]`
 
+**api**="none"
+
+Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+Options: llama-stack, none
+
 **carimage**="registry.access.redhat.com/ubi9-micro:latest"
 
 OCI model car image

libexec/ramalama/ramalama-client-core

Lines changed: 10 additions & 7 deletions
@@ -11,8 +11,6 @@ import time
 import urllib.error
 import urllib.request
 
-from ramalama.common import perror
-
 
 def should_colorize():
     t = os.getenv("TERM")
@@ -59,9 +57,14 @@ class RamaLamaShell(cmd.Cmd):
 
         self.url = f"{parsed_args.host}/v1/chat/completions"
         self.models_url = f"{parsed_args.host}/v1/models"
-        self.models = self.get_models()
-
-    def get_models(self):
+        self.models = []
+
+    def model(self):
+        if len(self.models) == 0:
+            self.models = self._models()
+        return self.models[0]
+
+    def _models(self):
         request = urllib.request.Request(self.models_url, method="GET")
         response = urllib.request.urlopen(request)
         for line in response:
@@ -91,7 +94,7 @@ class RamaLamaShell(cmd.Cmd):
         data = {
             "stream": True,
             "messages": self.conversation_history,
-            "model": self.models[0],
+            "model": self.model(),
         }
 
         return data
@@ -126,7 +129,7 @@ class RamaLamaShell(cmd.Cmd):
         if response:
             return res(response, self.parsed_args.color)
 
-        perror(f"\rError: could not connect to: {self.url}")
+        print(f"\rError: could not connect to: {self.url}", file=sys.stderr)
         self.kills(self.parsed_args)
 
         return None
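
The client change above replaces the eager get_models() call with a lazy, cached lookup (model()/_models()), so the model list is fetched only when first needed. A generic sketch of that pattern, using hypothetical names rather than the client's own:

    # Hypothetical sketch of the lazy-caching pattern used by model()/_models():
    # the expensive fetch runs only on first use and its result is reused after.
    class LazyModels:
        def __init__(self, fetch):
            self._fetch = fetch  # callable that actually hits /v1/models
            self._models = []

        def first(self):
            if len(self._models) == 0:
                self._models = self._fetch()
            return self._models[0]

    models = LazyModels(lambda: ["tinyllama"])
    print(models.first())  # fetched once, cached for later calls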

ramalama/cli.py

Lines changed: 16 additions & 5 deletions
@@ -26,9 +26,10 @@
 from ramalama.config import CONFIG
 from ramalama.migrate import ModelStoreImport
 from ramalama.model import MODEL_TYPES
-from ramalama.model_factory import ModelFactory
+from ramalama.model_factory import ModelFactory, New
 from ramalama.model_store import GlobalModelStore
 from ramalama.shortnames import Shortnames
+from ramalama.stack import Stack
 from ramalama.version import print_version, version
 
 shortnames = Shortnames()
@@ -732,6 +733,13 @@ def push_cli(args):
 
 
 def runtime_options(parser, command):
+    if command in ["run", "serve"]:
+        parser.add_argument(
+            "--api",
+            default=CONFIG["api"],
+            choices=["llama-stack", "none"],
+            help="unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.",
+        )
     parser.add_argument("--authfile", help="path of the authentication file")
     if command in ["run", "perplexity", "serve"]:
         parser.add_argument(
@@ -937,6 +945,13 @@ def serve_cli(args):
     if args.rag:
         _get_rag(args)
 
+    if args.api == "llama-stack":
+        if not args.container:
+            raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.")
+
+        stack = Stack(args)
+        return stack.serve()
+
     try:
         model = New(args.MODEL, args)
         model.serve(args)
@@ -1086,10 +1101,6 @@ def rm_cli(args):
     return _rm_model([model for model in models.keys()], args)
 
 
-def New(model, args, transport=CONFIG["transport"]):
-    return ModelFactory(model, args, transport=transport).create()
-
-
 def client_cli(args):
     """Handle client command execution"""
     client_args = ["ramalama-client-core", "-c", "2048", "--temp", "0.8", args.HOST] + args.ARGS

ramalama/config.py

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ def load_config_defaults(config: Dict[str, Any]):
             "MUSA_VISIBLE_DEVICES": "quay.io/ramalama/musa",
         },
     )
+    config.setdefault('api', 'none')
    config.setdefault('keep_groups', False)
    config.setdefault('ngl', -1)
    config.setdefault('threads', -1)
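
For illustration, a minimal sketch (hypothetical helper name, not project code) of how the setdefault-based default added above behaves: a value parsed from ramalama.conf is kept, and "none" is used only when the key is absent.

    # Hypothetical helper: same setdefault behavior as the line added above.
    def apply_api_default(config: dict) -> dict:
        config.setdefault('api', 'none')
        return config

    print(apply_api_default({}))                      # {'api': 'none'}
    print(apply_api_default({'api': 'llama-stack'}))  # {'api': 'llama-stack'}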

ramalama/engine.py

Lines changed: 79 additions & 25 deletions
@@ -17,10 +17,11 @@ def __init__(self, args):
             "run",
             "--rm",
         ]
-        self.use_docker = os.path.basename(args.engine) == "docker"
-        self.use_podman = os.path.basename(args.engine) == "podman"
+        base = os.path.basename(args.engine)
+        self.use_docker = base == "docker"
+        self.use_podman = base == "podman"
         self.args = args
-        self.add_container_labels()
+        self.add_labels()
         self.add_device_options()
         self.add_env_option()
         self.add_network()
@@ -38,19 +39,11 @@ def __init__(self, args):
     def add_label(self, label):
         self.add(["--label", label])
 
-    def add_container_labels(self):
-        label_map = {
-            "MODEL": "ai.ramalama.model",
-            "engine": "ai.ramalama.engine",
-            "runtime": "ai.ramalama.runtime",
-            "port": "ai.ramalama.port",
-            "subcommand": "ai.ramalama.command",
-        }
-        for arg, label_prefix in label_map.items():
-            if hasattr(self.args, arg):
-                value = getattr(self.args, arg)
-                if value:
-                    self.add_label(f"{label_prefix}={value}")
+    def add_name(self, name):
+        self.add(["--name", name])
+
+    def add_labels(self):
+        add_labels(self.args, self.add_label)
 
     def add_pull_newer(self):
         if not self.args.dryrun and self.use_docker and self.args.pull == "newer":
@@ -90,6 +83,9 @@ def add_privileged_options(self):
                 "--security-opt=no-new-privileges",
             ]
 
+    def cap_add(self, cap):
+        self.exec_args += ["--cap-add", cap]
+
     def add_subcommand_env(self):
         if EMOJI and hasattr(self.args, "subcommand") and self.args.subcommand == "run":
             if os.path.basename(self.args.engine) == "podman":
@@ -111,7 +107,12 @@ def add_detach_option(self):
             self.exec_args += ["-d"]
 
     def add_port_option(self):
-        if hasattr(self.args, "port"):
+        if not hasattr(self.args, "port") or not self.args.port or self.args.port == "":
+            return
+
+        if self.args.port.count(":") > 0:
+            self.exec_args += ["-p", self.args.port]
+        else:
             self.exec_args += ["-p", f"{self.args.port}:{self.args.port}"]
 
     def add_device_options(self):
@@ -243,26 +244,79 @@ def info(args):
         return str(e)
 
 
-def stop_container(args, name):
+def inspect(args, name, format=None, ignore_stderr=False):
     if not name:
         raise ValueError("must specify a container name")
     conman = args.engine
     if conman == "":
         raise ValueError("no container manager (Podman, Docker) found")
 
-    conman_args = [conman, "stop", "-t=0"]
-    ignore_stderr = False
-    if args.ignore:
-        if conman == "podman":
-            conman_args += ["--ignore", str(args.ignore)]
-        else:
-            ignore_stderr = True
+    conman_args = [conman, "inspect"]
+    if format:
+        conman_args += ["--format", format]
 
     conman_args += [name]
+    return run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug).stdout.decode("utf-8").strip()
+
+
+def stop_container(args, name):
+    if not name:
+        raise ValueError("must specify a container name")
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    ignore_stderr = False
+    pod = ""
+    try:
+        pod = inspect(args, name, format="{{ .Pod }}", ignore_stderr=True)
+    except Exception:  # Ignore errors, the stop command will handle it.
+        pass
+
+    if pod != "":
+        conman_args = [conman, "pod", "rm", "-t=0", "--ignore", "--force", pod]
+    else:
+        conman_args = [conman, "stop", "-t=0"]
+        if args.ignore:
+            if conman == "podman":
+                conman_args += ["--ignore", str(args.ignore)]
+            else:
+                ignore_stderr = True
+
+    conman_args += [name]
     try:
         run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug)
     except subprocess.CalledProcessError:
         if args.ignore and conman == "docker":
             return
         else:
             raise
+
+
+def container_connection(args, name, port):
+    if not name:
+        raise ValueError("must specify a container name")
+    if not port:
+        raise ValueError("must specify a port to check")
+
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    conman_args = [conman, "port", name, port]
+    output = run_cmd(conman_args, debug=args.debug).stdout.decode("utf-8").strip()
+    return "" if output == "" else output.split(">")[-1].strip()
+
+
+def add_labels(args, add_label):
+    label_map = {
+        "MODEL": "ai.ramalama.model",
+        "engine": "ai.ramalama.engine",
+        "runtime": "ai.ramalama.runtime",
+        "port": "ai.ramalama.port",
+        "subcommand": "ai.ramalama.command",
+    }
+    for arg, label_prefix in label_map.items():
+        if hasattr(args, arg):
+            if value := getattr(args, arg):
+                add_label(f"{label_prefix}={value}")
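
The new container_connection() helper shells out to `<engine> port <name> <port>` and keeps only the host-side address. A standalone sketch of just that parsing step follows; the example output string ("8080/tcp -> 0.0.0.0:8080") is an assumption about typical podman/docker port output.

    # Standalone sketch of the parsing at the end of container_connection():
    # keep what follows the "->" arrow, or return "" when nothing is published.
    def parse_port_mapping(output: str) -> str:
        output = output.strip()
        return "" if output == "" else output.split(">")[-1].strip()

    print(parse_port_mapping("8080/tcp -> 0.0.0.0:8080"))  # -> "0.0.0.0:8080"
    print(parse_port_mapping(""))                          # -> ""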

ramalama/model.py

Lines changed: 15 additions & 12 deletions
@@ -248,7 +248,6 @@ def add_rag(self, exec_args, args):
     def setup_container(self, args):
         name = self.get_container_name(args)
         self.base(args, name)
-        self.engine.add_container_labels()
 
     def gpu_args(self, args, runner=False):
         gpu_args = []
@@ -292,6 +291,9 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         self.setup_mounts(model_path, args)
         self.handle_rag_mode(args, cmd_args)
 
+        # Make sure Image precedes cmd_args
+        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
+
         if args.dryrun:
             self.engine.dryrun()
             return True
@@ -339,9 +341,6 @@ def handle_rag_mode(self, args, cmd_args):
         if hasattr(args, "rag") and args.rag:
             args.image = args.image.split(":")[0]
 
-        # Make sure Image precedes cmd_args
-        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
-
     def bench(self, args):
         model_path = self.get_model_path(args)
         exec_args = self.build_exec_args_bench(args, model_path)
@@ -616,13 +615,13 @@ def execute_command(self, model_path, exec_args, args):
 
     def serve(self, args, quiet=False):
         self.validate_args(args)
-        args.port = compute_serving_port(args.port, args.debug, quiet)
         model_path = self.get_model_path(args)
         if is_split_file_model(model_path):
             mnt_file = MNT_DIR + '/' + self.mnt_path
         else:
             mnt_file = MNT_FILE
 
+        args.port = compute_serving_port(args, quiet=quiet or args.generate)
         exec_model_path = mnt_file if args.container or args.generate else model_path
         chat_template_path = ""
         mmproj_path = ""
@@ -730,16 +729,20 @@ def get_available_port_if_any(debug: bool) -> int:
     return chosen_port
 
 
-def compute_serving_port(port: str, debug: bool, quiet=False) -> str:
+def compute_serving_port(args, quiet=False) -> str:
     # user probably specified a custom port, don't override the choice
-    if port != "" and port != str(DEFAULT_PORT):
-        return port
-
-    # otherwise compute a random serving port in the range
-    target_port = get_available_port_if_any(debug)
+    if args.port not in ["", str(DEFAULT_PORT)]:
+        target_port = args.port
+    else:
+        # otherwise compute a random serving port in the range
+        target_port = get_available_port_if_any(args.debug)
 
     if target_port == 0:
         raise IOError("no available port could be detected. Please ensure you have enough free ports.")
     if not quiet:
-        print(f"serving on port {target_port}")
+        openai = f"http://localhost:{target_port}"
+        if args.api == "llama-stack":
+            print(f"LlamaStack RESTAPI: {openai}")
+            openai = openai + "/v1/openai"
+        print(f"OpenAI RESTAPI: {openai}")
     return str(target_port)