4 changes: 4 additions & 0 deletions docs/ramalama-run.1.md
@@ -26,6 +26,10 @@ URL support means if a model is on a web site or even on your local system, you

## OPTIONS

#### **--api**=**llama-stack** | **none**
Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none)
The default can be overridden in the ramalama.conf file.

#### **--authfile**=*password*
path of the authentication file for OCI registries

4 changes: 4 additions & 0 deletions docs/ramalama-serve.1.md
@@ -35,6 +35,10 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp

## OPTIONS

#### **--api**=**llama-stack** | **none**
Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none)
The default can be overridden in the ramalama.conf file.

#### **--authfile**=*password*
path of the authentication file for OCI registries

5 changes: 5 additions & 0 deletions docs/ramalama.conf
@@ -17,6 +17,11 @@

[ramalama]

# Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
# Options: llama-stack, none
#
# api = "none"

# OCI model car image
# Image to use when building and pushing --type=car models
#
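To make the new key concrete, here is a minimal sketch of reading the [ramalama] table with Python's standard tomllib; the file path is assumed for illustration, and ramalama's own loader may behave differently:

# Minimal sketch, assuming the conf snippet above is valid TOML at this path;
# ramalama's real config loading code may differ.
import tomllib

with open("/usr/share/ramalama/ramalama.conf", "rb") as f:
    conf = tomllib.load(f)

# The new key sits under [ramalama]; absent or commented out it falls back to "none".
api = conf.get("ramalama", {}).get("api", "none")
print(api)  # "none" by default, "llama-stack" when set explicitly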
5 changes: 5 additions & 0 deletions docs/ramalama.conf.5.md
@@ -60,6 +60,11 @@ The ramalama table contains settings to configure and manage the OCI runtime.

`[[ramalama]]`

**api**="none"

Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
Options: llama-stack, none

**carimage**="registry.access.redhat.com/ubi9-micro:latest"

OCI model car image
29 changes: 18 additions & 11 deletions ramalama/cli.py
@@ -27,9 +27,10 @@
from ramalama.logger import configure_logger, logger
from ramalama.migrate import ModelStoreImport
from ramalama.model import MODEL_TYPES
from ramalama.model_factory import ModelFactory
from ramalama.model_factory import ModelFactory, New
from ramalama.model_store import GlobalModelStore
from ramalama.shortnames import Shortnames
from ramalama.stack import Stack
from ramalama.version import print_version, version

shortnames = Shortnames()
@@ -468,9 +469,7 @@ def get_size(path):
def _list_models(args):
mycwd = os.getcwd()
if args.use_model_store:
models = GlobalModelStore(args.store).list_models(
engine=args.engine, debug=args.debug, show_container=args.container
)
models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container)
ret = []
local_timezone = datetime.now().astimezone().tzinfo

@@ -734,6 +733,13 @@ def push_cli(args):


def runtime_options(parser, command):
if command in ["run", "serve"]:
parser.add_argument(
"--api",
default=CONFIG["api"],
choices=["llama-stack", "none"],
help="unified API layer for for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.",
)
parser.add_argument("--authfile", help="path of the authentication file")
if command in ["run", "perplexity", "serve"]:
parser.add_argument(
@@ -939,6 +945,13 @@ def serve_cli(args):
if args.rag:
_get_rag(args)

if args.api == "llama-stack":
if not args.container:
raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.")

stack = Stack(args)
return stack.serve()

try:
model = New(args.MODEL, args)
model.serve(args)
@@ -1081,17 +1094,11 @@ def rm_cli(args):
if len(args.MODEL) > 0:
raise IndexError("can not specify --all as well MODEL")

models = GlobalModelStore(args.store).list_models(
engine=args.engine, debug=args.debug, show_container=args.container
)
models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container)

return _rm_model([model for model in models.keys()], args)


def New(model, args, transport=CONFIG["transport"]):
return ModelFactory(model, args, transport=transport).create()


def client_cli(args):
"""Handle client command execution"""
client_args = ["ramalama-client-core", "-c", "2048", "--temp", "0.8", args.HOST] + args.ARGS
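Condensed, the control flow these hunks introduce looks roughly like the sketch below; it is not the literal function body, and Stack and New are the imports added at the top of this file:

# Sketch of the new serve-time dispatch (condensed from the hunks above).
def serve_dispatch(args):
    if args.api == "llama-stack":
        if not args.container:
            raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.")
        # Hand the request to the Llama Stack helper instead of serving a single model directly.
        stack = Stack(args)
        return stack.serve()

    # Default path: resolve the model through the factory's New() and serve it.
    model = New(args.MODEL, args)
    model.serve(args)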
1 change: 1 addition & 0 deletions ramalama/config.py
@@ -89,6 +89,7 @@ def load_config_defaults(config: Dict[str, Any]):
"MUSA_VISIBLE_DEVICES": "quay.io/ramalama/musa",
},
)
config.setdefault('api', 'none')
config.setdefault('keep_groups', False)
config.setdefault('ngl', -1)
config.setdefault('threads', -1)
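A quick illustration of what setdefault buys here: a value read from ramalama.conf survives, while a missing key picks up the "none" default (the dicts below are stand-ins for the parsed config, not the real loader):

# Minimal sketch of the default resolution for the new "api" key.
config_from_file = {"api": "llama-stack"}  # api set in ramalama.conf
config_without_key = {}                    # api not mentioned in the file

for config in (config_from_file, config_without_key):
    config.setdefault('api', 'none')

print(config_from_file["api"])    # llama-stack  (explicit setting kept)
print(config_without_key["api"])  # none         (default filled in)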
104 changes: 79 additions & 25 deletions ramalama/engine.py
@@ -17,10 +17,11 @@ def __init__(self, args):
"run",
"--rm",
]
self.use_docker = os.path.basename(args.engine) == "docker"
self.use_podman = os.path.basename(args.engine) == "podman"
base = os.path.basename(args.engine)
self.use_docker = base == "docker"
self.use_podman = base == "podman"
self.args = args
self.add_container_labels()
self.add_labels()
self.add_device_options()
self.add_env_option()
self.add_network()
@@ -37,19 +38,11 @@ def __init__(self, args):
def add_label(self, label):
self.add(["--label", label])

def add_container_labels(self):
label_map = {
"MODEL": "ai.ramalama.model",
"engine": "ai.ramalama.engine",
"runtime": "ai.ramalama.runtime",
"port": "ai.ramalama.port",
"subcommand": "ai.ramalama.command",
}
for arg, label_prefix in label_map.items():
if hasattr(self.args, arg):
value = getattr(self.args, arg)
if value:
self.add_label(f"{label_prefix}={value}")
def add_name(self, name):
self.add(["--name", name])

def add_labels(self):
add_labels(self.args, self.add_label)

def add_pull_newer(self):
if not self.args.dryrun and self.use_docker and self.args.pull == "newer":
@@ -89,6 +82,9 @@ def add_privileged_options(self):
"--security-opt=no-new-privileges",
]

def cap_add(self, cap):
self.exec_args += ["--cap-add", cap]

def add_subcommand_env(self):
if EMOJI and hasattr(self.args, "subcommand") and self.args.subcommand == "run":
if os.path.basename(self.args.engine) == "podman":
@@ -110,7 +106,12 @@ def add_detach_option(self):
self.exec_args += ["-d"]

def add_port_option(self):
if hasattr(self.args, "port"):
if not hasattr(self.args, "port") or not self.args.port or self.args.port == "":
return

if self.args.port.count(":") > 0:
self.exec_args += ["-p", self.args.port]
else:
self.exec_args += ["-p", f"{self.args.port}:{self.args.port}"]

def add_device_options(self):
@@ -242,26 +243,79 @@ def info(args):
return str(e)


def stop_container(args, name):
def inspect(args, name, format=None, ignore_stderr=False):
if not name:
raise ValueError("must specify a container name")
conman = args.engine
if conman == "":
raise ValueError("no container manager (Podman, Docker) found")

conman_args = [conman, "stop", "-t=0"]
ignore_stderr = False
if args.ignore:
if conman == "podman":
conman_args += ["--ignore", str(args.ignore)]
else:
ignore_stderr = True
conman_args = [conman, "inspect"]
if format:
conman_args += ["--format", format]

conman_args += [name]
return run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug).stdout.decode("utf-8").strip()


def stop_container(args, name):
if not name:
raise ValueError("must specify a container name")
conman = args.engine
if conman == "":
raise ValueError("no container manager (Podman, Docker) found")

ignore_stderr = False
pod = ""
try:
pod = inspect(args, name, format="{{ .Pod }}", ignore_stderr=True)
except Exception: # Ignore errors, the stop command will handle it.
pass

if pod != "":
conman_args = [conman, "pod", "rm", "-t=0", "--ignore", "--force", pod]
else:
conman_args = [conman, "stop", "-t=0"]
if args.ignore:
if conman == "podman":
conman_args += ["--ignore", str(args.ignore)]
else:
ignore_stderr = True

conman_args += [name]
try:
run_cmd(conman_args, ignore_stderr=ignore_stderr)
except subprocess.CalledProcessError:
if args.ignore and conman == "docker":
return
else:
raise


def container_connection(args, name, port):
if not name:
raise ValueError("must specify a container name")
if not port:
raise ValueError("must specify a port to check")

conman = args.engine
if conman == "":
raise ValueError("no container manager (Podman, Docker) found")

conman_args = [conman, "port", name, port]
output = run_cmd(conman_args, debug=args.debug).stdout.decode("utf-8").strip()
Reviewer comment (Contributor):

issue: Parsing docker port output by splitting on '>' is brittle

This breaks if Docker/Podman returns multiple mappings or uses a different format. Use a regex or docker port --format structured output for more reliable parsing.
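
A minimal sketch of the regex-based alternative (the sample output strings are assumptions; real podman port / docker port output varies by version):

# Sketch only: pick the last host:port pair instead of splitting on ">".
import re

def parse_forwarded_port(output: str) -> str:
    # Matches IPv4 ("0.0.0.0:8080") and bracketed IPv6 ("[::]:8080") endpoints.
    matches = re.findall(r"((?:\d{1,3}(?:\.\d{1,3}){3})|\[[0-9a-fA-F:]*\]):(\d+)", output)
    if not matches:
        return ""
    host, port = matches[-1]
    return f"{host}:{port}"

print(parse_forwarded_port("8080/tcp -> 0.0.0.0:8080"))  # 0.0.0.0:8080
print(parse_forwarded_port("[::]:8080"))                 # [::]:8080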

return "" if output == "" else output.split(">")[-1].strip()


def add_labels(args, add_label):
label_map = {
"MODEL": "ai.ramalama.model",
"engine": "ai.ramalama.engine",
"runtime": "ai.ramalama.runtime",
"port": "ai.ramalama.port",
"subcommand": "ai.ramalama.command",
}
for arg, label_prefix in label_map.items():
if hasattr(args, arg):
if value := getattr(args, arg):
add_label(f"{label_prefix}={value}")
27 changes: 15 additions & 12 deletions ramalama/model.py
@@ -249,7 +249,6 @@ def add_rag(self, exec_args, args):
def setup_container(self, args):
name = self.get_container_name(args)
self.base(args, name)
self.engine.add_container_labels()

def gpu_args(self, args, runner=False):
gpu_args = []
@@ -293,6 +292,9 @@ def exec_model_in_container(self, model_path, cmd_args, args):
self.setup_mounts(model_path, args)
self.handle_rag_mode(args, cmd_args)

# Make sure Image precedes cmd_args
self.engine.add([accel_image(CONFIG, args)] + cmd_args)

if args.dryrun:
self.engine.dryrun()
return True
@@ -340,9 +342,6 @@ def handle_rag_mode(self, args, cmd_args):
if hasattr(args, "rag") and args.rag:
args.image = args.image.split(":")[0]

# Make sure Image precedes cmd_args
self.engine.add([accel_image(CONFIG, args)] + cmd_args)

def bench(self, args):
model_path = self.get_model_path(args)
exec_args = self.build_exec_args_bench(args, model_path)
@@ -617,13 +616,13 @@ def execute_command(self, model_path, exec_args, args):

def serve(self, args, quiet=False):
self.validate_args(args)
args.port = compute_serving_port(args.port, quiet)
model_path = self.get_model_path(args)
if is_split_file_model(model_path):
mnt_file = MNT_DIR + '/' + self.mnt_path
else:
mnt_file = MNT_FILE

args.port = compute_serving_port(args, quiet=quiet or args.generate)
exec_model_path = mnt_file if args.container or args.generate else model_path
chat_template_path = ""
mmproj_path = ""
@@ -730,16 +729,20 @@ def get_available_port_if_any() -> int:
return chosen_port


def compute_serving_port(port: str, quiet=False) -> str:
def compute_serving_port(args, quiet=False) -> str:
# user probably specified a custom port, don't override the choice
if port != "" and port != str(DEFAULT_PORT):
return port

# otherwise compute a random serving port in the range
target_port = get_available_port_if_any()
if args.port not in ["", str(DEFAULT_PORT)]:
target_port = args.port
else:
# otherwise compute a random serving port in the range
target_port = get_available_port_if_any()

if target_port == 0:
raise IOError("no available port could be detected. Please ensure you have enough free ports.")
if not quiet:
print(f"serving on port {target_port}")
openai = f"http://localhost:{target_port}"
if args.api == "llama-stack":
print(f"LlamaStack RESTAPI: {openai}")
openai = openai + "/v1/openai"
print(f"OpenAI RESTAPI: {openai}")
return str(target_port)
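
With these changes, a request without an explicit port picks a free one and, when --api llama-stack is active, both endpoints are announced; assuming the chosen port happened to be 8080, the output would be `serving on port 8080`, `LlamaStack RESTAPI: http://localhost:8080`, and `OpenAI RESTAPI: http://localhost:8080/v1/openai` (port number illustrative).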
18 changes: 18 additions & 0 deletions ramalama/model_factory.py
@@ -5,6 +5,7 @@
from urllib.parse import urlparse

from ramalama.common import rm_until_substring
from ramalama.config import CONFIG
from ramalama.huggingface import Huggingface
from ramalama.model import MODEL_TYPES, SPLIT_MODEL_RE, is_split_file_model
from ramalama.model_store import GlobalModelStore, ModelStore
@@ -148,3 +149,20 @@ def create_url(self) -> URL:
model.split_model = self.split_model
model.mnt_path = self.mnt_path
return model


def New(name, args, transport=CONFIG["transport"]):
return ModelFactory(name, args, transport=transport).create()


def Serve(name, args):
model = New(name, args)
try:
model.serve(args)
except KeyError as e:
try:
args.quiet = True
model = ModelFactory(name, args, ignore_stderr=True).create_oci()
model.serve(args)
except Exception:
raise e
Comment on lines +167 to +168
Reviewer comment (Contributor):

suggestion (code-quality): Explicitly raise from a previous error (raise-from-previous-error)

Suggested change:
    # current
    except Exception:
        raise e
    # suggested
    except Exception as exc:
        raise e from exc

3 changes: 1 addition & 2 deletions ramalama/model_store.py
@@ -203,7 +203,7 @@
def path(self) -> str:
return self._store_base_path

def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[str, List[ModelFile]]:
def list_models(self, engine: str, show_container: bool) -> Dict[str, List[ModelFile]]:
models: Dict[str, List[ModelFile]] = {}

for root, subdirs, _ in os.walk(self.path):
@@ -247,7 +247,6 @@ def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[st
dotdict(
{
"engine": engine,
"debug": debug,
}
)
)