diff --git a/docs/ramalama-run.1.md b/docs/ramalama-run.1.md index 1429397cd..b768594b2 100644 --- a/docs/ramalama-run.1.md +++ b/docs/ramalama-run.1.md @@ -26,6 +26,10 @@ URL support means if a model is on a web site or even on your local system, you ## OPTIONS +#### **--api**=**llama-stack** | **none** +unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none) +The default can be overridden in the ramalama.conf file. + #### **--authfile**=*password* path of the authentication file for OCI registries diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md index c5c7ab42c..34f485409 100644 --- a/docs/ramalama-serve.1.md +++ b/docs/ramalama-serve.1.md @@ -35,6 +35,10 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp ## OPTIONS +#### **--api**=**llama-stack** | **none** +unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none) +The default can be overridden in the ramalama.conf file. + #### **--authfile**=*password* path of the authentication file for OCI registries diff --git a/docs/ramalama.conf b/docs/ramalama.conf index 45163e809..6198ac9a4 100644 --- a/docs/ramalama.conf +++ b/docs/ramalama.conf @@ -17,6 +17,11 @@ [ramalama] +# Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. +# Options: llama-stack, none +# +# api = "none" + # OCI model car image # Image to use when building and pushing --type=car models # diff --git a/docs/ramalama.conf.5.md b/docs/ramalama.conf.5.md index 0c657671d..8230f4281 100644 --- a/docs/ramalama.conf.5.md +++ b/docs/ramalama.conf.5.md @@ -60,6 +60,11 @@ The ramalama table contains settings to configure and manage the OCI runtime. `[[ramalama]]` +**api**="none" + +Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+Options: llama-stack, none + **carimage**="registry.access.redhat.com/ubi9-micro:latest" OCI model car image diff --git a/ramalama/cli.py b/ramalama/cli.py index dbf912ed5..db9eaca18 100644 --- a/ramalama/cli.py +++ b/ramalama/cli.py @@ -27,9 +27,10 @@ from ramalama.logger import configure_logger, logger from ramalama.migrate import ModelStoreImport from ramalama.model import MODEL_TYPES -from ramalama.model_factory import ModelFactory +from ramalama.model_factory import ModelFactory, New from ramalama.model_store import GlobalModelStore from ramalama.shortnames import Shortnames +from ramalama.stack import Stack from ramalama.version import print_version, version shortnames = Shortnames() @@ -468,9 +469,7 @@ def get_size(path): def _list_models(args): mycwd = os.getcwd() if args.use_model_store: - models = GlobalModelStore(args.store).list_models( - engine=args.engine, debug=args.debug, show_container=args.container - ) + models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container) ret = [] local_timezone = datetime.now().astimezone().tzinfo @@ -734,6 +733,13 @@ def push_cli(args): def runtime_options(parser, command): + if command in ["run", "serve"]: + parser.add_argument( + "--api", + default=CONFIG["api"], + choices=["llama-stack", "none"], + help="unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.", + ) parser.add_argument("--authfile", help="path of the authentication file") if command in ["run", "perplexity", "serve"]: parser.add_argument( @@ -939,6 +945,13 @@ def serve_cli(args): if args.rag: _get_rag(args) + if args.api == "llama-stack": + if not args.container: + raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.") + + stack = Stack(args) + return stack.serve() + try: model = New(args.MODEL, args) model.serve(args) @@ -1081,17 +1094,11 @@ def rm_cli(args): if len(args.MODEL) > 0: raise IndexError("can not specify --all as well MODEL") - models = GlobalModelStore(args.store).list_models( - engine=args.engine, debug=args.debug, show_container=args.container - ) + models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container) return _rm_model([model for model in models.keys()], args) -def New(model, args, transport=CONFIG["transport"]): - return ModelFactory(model, args, transport=transport).create() - - def client_cli(args): """Handle client command execution""" client_args = ["ramalama-client-core", "-c", "2048", "--temp", "0.8", args.HOST] + args.ARGS diff --git a/ramalama/config.py b/ramalama/config.py index f54588cff..0b0402f0e 100644 --- a/ramalama/config.py +++ b/ramalama/config.py @@ -89,6 +89,7 @@ def load_config_defaults(config: Dict[str, Any]): "MUSA_VISIBLE_DEVICES": "quay.io/ramalama/musa", }, ) + config.setdefault('api', 'none') config.setdefault('keep_groups', False) config.setdefault('ngl', -1) config.setdefault('threads', -1) diff --git a/ramalama/engine.py b/ramalama/engine.py index b8be2a408..890d52756 100644 --- a/ramalama/engine.py +++ b/ramalama/engine.py @@ -17,10 +17,11 @@ def __init__(self, args): "run", "--rm", ] - self.use_docker = os.path.basename(args.engine) == "docker" - self.use_podman = os.path.basename(args.engine) == "podman" + base = os.path.basename(args.engine) + self.use_docker = base == "docker" + self.use_podman = base == "podman" self.args = args - self.add_container_labels() + self.add_labels() self.add_device_options() self.add_env_option() self.add_network() @@ -37,19
+38,11 @@ def __init__(self, args): def add_label(self, label): self.add(["--label", label]) - def add_container_labels(self): - label_map = { - "MODEL": "ai.ramalama.model", - "engine": "ai.ramalama.engine", - "runtime": "ai.ramalama.runtime", - "port": "ai.ramalama.port", - "subcommand": "ai.ramalama.command", - } - for arg, label_prefix in label_map.items(): - if hasattr(self.args, arg): - value = getattr(self.args, arg) - if value: - self.add_label(f"{label_prefix}={value}") + def add_name(self, name): + self.add(["--name", name]) + + def add_labels(self): + add_labels(self.args, self.add_label) def add_pull_newer(self): if not self.args.dryrun and self.use_docker and self.args.pull == "newer": @@ -89,6 +82,9 @@ def add_privileged_options(self): "--security-opt=no-new-privileges", ] + def cap_add(self, cap): + self.exec_args += ["--cap-add", cap] + def add_subcommand_env(self): if EMOJI and hasattr(self.args, "subcommand") and self.args.subcommand == "run": if os.path.basename(self.args.engine) == "podman": @@ -110,7 +106,12 @@ def add_detach_option(self): self.exec_args += ["-d"] def add_port_option(self): - if hasattr(self.args, "port"): + if not hasattr(self.args, "port") or not self.args.port or self.args.port == "": + return + + if self.args.port.count(":") > 0: + self.exec_args += ["-p", self.args.port] + else: self.exec_args += ["-p", f"{self.args.port}:{self.args.port}"] def add_device_options(self): @@ -242,22 +243,46 @@ def info(args): return str(e) -def stop_container(args, name): +def inspect(args, name, format=None, ignore_stderr=False): if not name: raise ValueError("must specify a container name") conman = args.engine if conman == "": raise ValueError("no container manager (Podman, Docker) found") - conman_args = [conman, "stop", "-t=0"] - ignore_stderr = False - if args.ignore: - if conman == "podman": - conman_args += ["--ignore", str(args.ignore)] - else: - ignore_stderr = True + conman_args = [conman, "inspect"] + if format: + conman_args += ["--format", format] conman_args += [name] + return run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug).stdout.decode("utf-8").strip() + + +def stop_container(args, name): + if not name: + raise ValueError("must specify a container name") + conman = args.engine + if conman == "": + raise ValueError("no container manager (Podman, Docker) found") + + ignore_stderr = False + pod = "" + try: + pod = inspect(args, name, format="{{ .Pod }}", ignore_stderr=True) + except Exception: # Ignore errors, the stop command will handle it. 
+ pass + + if pod != "": + conman_args = [conman, "pod", "rm", "-t=0", "--ignore", "--force", pod] + else: + conman_args = [conman, "stop", "-t=0"] + if args.ignore: + if conman == "podman": + conman_args += ["--ignore", str(args.ignore)] + else: + ignore_stderr = True + + conman_args += [name] try: run_cmd(conman_args, ignore_stderr=ignore_stderr) except subprocess.CalledProcessError: @@ -265,3 +290,32 @@ def stop_container(args, name): return else: raise + + +def container_connection(args, name, port): + if not name: + raise ValueError("must specify a container name") + if not port: + raise ValueError("must specify a port to check") + + conman = args.engine + if conman == "": + raise ValueError("no container manager (Podman, Docker) found") + + conman_args = [conman, "port", name, port] + output = run_cmd(conman_args, debug=args.debug).stdout.decode("utf-8").strip() + return "" if output == "" else output.split(">")[-1].strip() + + +def add_labels(args, add_label): + label_map = { + "MODEL": "ai.ramalama.model", + "engine": "ai.ramalama.engine", + "runtime": "ai.ramalama.runtime", + "port": "ai.ramalama.port", + "subcommand": "ai.ramalama.command", + } + for arg, label_prefix in label_map.items(): + if hasattr(args, arg): + if value := getattr(args, arg): + add_label(f"{label_prefix}={value}") diff --git a/ramalama/model.py b/ramalama/model.py index 31c3ab184..d9a8c3255 100644 --- a/ramalama/model.py +++ b/ramalama/model.py @@ -249,7 +249,6 @@ def add_rag(self, exec_args, args): def setup_container(self, args): name = self.get_container_name(args) self.base(args, name) - self.engine.add_container_labels() def gpu_args(self, args, runner=False): gpu_args = [] @@ -293,6 +292,9 @@ def exec_model_in_container(self, model_path, cmd_args, args): self.setup_mounts(model_path, args) self.handle_rag_mode(args, cmd_args) + # Make sure Image precedes cmd_args + self.engine.add([accel_image(CONFIG, args)] + cmd_args) + if args.dryrun: self.engine.dryrun() return True @@ -340,9 +342,6 @@ def handle_rag_mode(self, args, cmd_args): if hasattr(args, "rag") and args.rag: args.image = args.image.split(":")[0] - # Make sure Image precedes cmd_args - self.engine.add([accel_image(CONFIG, args)] + cmd_args) - def bench(self, args): model_path = self.get_model_path(args) exec_args = self.build_exec_args_bench(args, model_path) @@ -617,13 +616,13 @@ def execute_command(self, model_path, exec_args, args): def serve(self, args, quiet=False): self.validate_args(args) - args.port = compute_serving_port(args.port, quiet) model_path = self.get_model_path(args) if is_split_file_model(model_path): mnt_file = MNT_DIR + '/' + self.mnt_path else: mnt_file = MNT_FILE + args.port = compute_serving_port(args, quiet=quiet or args.generate) exec_model_path = mnt_file if args.container or args.generate else model_path chat_template_path = "" mmproj_path = "" @@ -730,16 +729,20 @@ def get_available_port_if_any() -> int: return chosen_port -def compute_serving_port(port: str, quiet=False) -> str: +def compute_serving_port(args, quiet=False) -> str: # user probably specified a custom port, don't override the choice - if port != "" and port != str(DEFAULT_PORT): - return port - - # otherwise compute a random serving port in the range - target_port = get_available_port_if_any() + if args.port not in ["", str(DEFAULT_PORT)]: + target_port = args.port + else: + # otherwise compute a random serving port in the range + target_port = get_available_port_if_any() if target_port == 0: raise IOError("no available port could be detected. 
Please ensure you have enough free ports.") if not quiet: - print(f"serving on port {target_port}") + openai = f"http://localhost:{target_port}" + if args.api == "llama-stack": + print(f"LlamaStack RESTAPI: {openai}") + openai = openai + "/v1/openai" + print(f"OpenAI RESTAPI: {openai}") return str(target_port) diff --git a/ramalama/model_factory.py b/ramalama/model_factory.py index cfb857024..172c574aa 100644 --- a/ramalama/model_factory.py +++ b/ramalama/model_factory.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse from ramalama.common import rm_until_substring +from ramalama.config import CONFIG from ramalama.huggingface import Huggingface from ramalama.model import MODEL_TYPES, SPLIT_MODEL_RE, is_split_file_model from ramalama.model_store import GlobalModelStore, ModelStore @@ -148,3 +149,20 @@ def create_url(self) -> URL: model.split_model = self.split_model model.mnt_path = self.mnt_path return model + + +def New(name, args, transport=CONFIG["transport"]): + return ModelFactory(name, args, transport=transport).create() + + +def Serve(name, args): + model = New(name, args) + try: + model.serve(args) + except KeyError as e: + try: + args.quiet = True + model = ModelFactory(name, args, ignore_stderr=True).create_oci() + model.serve(args) + except Exception: + raise e diff --git a/ramalama/model_store.py b/ramalama/model_store.py index 1cce54b56..e3ac5bde9 100644 --- a/ramalama/model_store.py +++ b/ramalama/model_store.py @@ -203,7 +203,7 @@ def __init__( def path(self) -> str: return self._store_base_path - def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[str, List[ModelFile]]: + def list_models(self, engine: str, show_container: bool) -> Dict[str, List[ModelFile]]: models: Dict[str, List[ModelFile]] = {} for root, subdirs, _ in os.walk(self.path): @@ -247,7 +247,6 @@ def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[st dotdict( { "engine": engine, - "debug": debug, } ) ) diff --git a/ramalama/oci.py b/ramalama/oci.py index 6ed63efc2..b98215c90 100644 --- a/ramalama/oci.py +++ b/ramalama/oci.py @@ -231,6 +231,7 @@ def build(self, source_model, args): # Open the file for writing. with open(containerfile.name, 'w') as c: c.write(content) + c.flush() build_cmd = [ self.conman, diff --git a/ramalama/rag.py b/ramalama/rag.py index b724ae6c7..9670b2189 100644 --- a/ramalama/rag.py +++ b/ramalama/rag.py @@ -34,7 +34,10 @@ def build(self, source, target, args): # Open the file for writing. 
with open(containerfile.name, 'w') as c: c.write(cfile) + c.flush() + logger.debug(f"\nContainerfile: {containerfile.name}\n{cfile}") + exec_args = [ args.engine, "build", diff --git a/ramalama/shortnames.py b/ramalama/shortnames.py index e73a617d1..f394db975 100644 --- a/ramalama/shortnames.py +++ b/ramalama/shortnames.py @@ -47,4 +47,5 @@ def create_shortname_file(self): c.write('[shortnames]\n') for shortname in self.shortnames: c.write('"%s"="%s"\n' % (shortname, self.shortnames.get(shortname))) + c.flush() return shortnamefile.name diff --git a/ramalama/stack.py b/ramalama/stack.py new file mode 100644 index 000000000..efcf2c910 --- /dev/null +++ b/ramalama/stack.py @@ -0,0 +1,183 @@ +import os +import tempfile + +from ramalama.common import ( + exec_cmd, + genname, + tagged_image, +) +from ramalama.engine import add_labels +from ramalama.model import compute_serving_port +from ramalama.model_factory import ModelFactory, New + + +class Stack: + """Stack class""" + + type = "Stack" + + def __init__(self, args): + self.args = args + self.name = args.name if hasattr(args, "name") and args.name else genname() + if os.path.basename(args.engine) != "podman": + raise ValueError("llama-stack requires use of the Podman container engine") + self.host = "127.0.0.1" + model = ModelFactory(args.MODEL, args) + self.model = model.prune_model_input() + model = New(args.MODEL, args) + self.model_type = model.type + self.model_path = model.get_model_path(args) + self.model_port = str(int(self.args.port) + 1) + self.stack_image = tagged_image("quay.io/ramalama/llama-stack") + self.labels = "" + + def add_label(self, label): + cleanlabel = label.replace("=", ": ", 1) + self.labels = f"{self.labels}\n {cleanlabel}" + + def generate(self): + add_labels(self.args, self.add_label) + volume_mounts = """ + - mountPath: /mnt/models/model.file + name: model + - mountPath: /dev/dri + name: dri""" + + if self.model_type == "OCI": + volume_mounts = """ + - mountPath: /mnt/models + subPath: /models + name: model + - mountPath: /dev/dri + name: dri""" + + volumes = f""" + - hostPath: + path: {self.model_path} + name: model + - hostPath: + path: /dev/dri + name: dri""" + + llama_cmd = [ + 'llama-server', + '--port', + self.model_port, + '--model', + '/mnt/models/model.file', + '--alias', + self.model, + '--ctx-size', + self.args.context, + '--temp', + self.args.temp, + '--jinja', + '--cache-reuse', + '256', + '-v', + '--threads', + self.args.threads, + '--host', + self.host, + ] + + security = """ + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - CAP_CHOWN + - CAP_FOWNER + - CAP_FSETID + - CAP_KILL + - CAP_NET_BIND_SERVICE + - CAP_SETFCAP + - CAP_SETGID + - CAP_SETPCAP + - CAP_SETUID + - CAP_SYS_CHROOT + add: + - CAP_DAC_OVERRIDE + seLinuxOptions: + type: spc_t""" + + self.stack_yaml = f""" +apiVersion: v1 +kind: Deployment +metadata: + name: {self.name} + labels: + app: {self.name} +spec: + replicas: 1 + selector: + matchLabels: + app: {self.name} + template: + metadata: + labels: + ai.ramalama: "" + app: {self.name}{self.labels} + spec: + containers: + - name: model-server + image: {self.args.image} + command: ["/usr/libexec/ramalama/ramalama-serve-core"] + args: {llama_cmd}\ + {security} + volumeMounts:{volume_mounts} + - name: llama-stack + image: {self.stack_image} + args: + - /bin/sh + - -c + - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml + env: + - name: RAMALAMA_URL + value: http://127.0.0.1:{self.model_port} + - name: INFERENCE_MODEL + value: {self.model}\ + 
{security} + ports: + - containerPort: 8321 + hostPort: {self.args.port} + volumes:{volumes}""" + return self.stack_yaml + + def serve(self): + self.args.port = compute_serving_port(self.args, quiet=self.args.generate) + yaml = self.generate() + if self.args.dryrun: + print(yaml) + return + yaml_file = tempfile.NamedTemporaryFile(prefix='RamaLama_', delete=not self.args.debug) + with open(yaml_file.name, 'w') as c: + c.write(yaml) + c.flush() + + exec_args = [ + self.args.engine, + "kube", + "play", + "--replace", + ] + if not self.args.detach: + exec_args.append("--wait") + + exec_args.append(yaml_file.name) + exec_cmd(exec_args) + + def stop(self): + yaml_file = tempfile.NamedTemporaryFile(prefix='RamaLama_', delete=not self.args.debug) + with open(yaml_file.name, 'w') as c: + c.write(self.generate()) + c.flush() + + exec_args = [ + self.args.engine, + "kube", + "down", + yaml_file.name, + ] + + exec_cmd(exec_args) diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats index 5a435df05..7cec36290 100755 --- a/test/system/040-serve.bats +++ b/test/system/040-serve.bats @@ -11,7 +11,6 @@ verify_begin=".*run --rm" if is_container; then run_ramalama -q --dryrun serve ${model} - assert "$output" =~ "serving on port .*" is "$output" "${verify_begin}.*" "dryrun correct" is "$output" ".*--name ramalama_.*" "dryrun correct" is "$output" ".*${model}" "verify model name" @@ -83,7 +82,6 @@ verify_begin=".*run --rm" assert "$output" =~ "No closing quotation" "error for improperly quoted runtime arguments" run_ramalama 1 serve MODEL - assert "$output" =~ "serving on port .*" assert "$output" =~ "Error: Manifest for MODEL:latest was not found in the Ollama registry" } diff --git a/test/unit/test_config.py b/test/unit/test_config.py index c80c9fd3f..3a368a6f1 100644 --- a/test/unit/test_config.py +++ b/test/unit/test_config.py @@ -55,6 +55,7 @@ def test_load_config_from_env(env, config, expected): ( {}, { + "api": "none", "nocontainer": False, "carimage": "registry.access.redhat.com/ubi9-micro:latest", "container": True, @@ -86,12 +87,14 @@ def test_load_config_from_env(env, config, expected): ), ( { + "api": "llama-stack", "nocontainer": True, "images": { "HIP_VISIBLE_DEVICES": "quay.io/repo/rocm", }, }, { + "api": "llama-stack", "nocontainer": True, "carimage": "registry.access.redhat.com/ubi9-micro:latest", "container": True, diff --git a/test/unit/test_engine.py b/test/unit/test_engine.py index fef068403..d779444fa 100644 --- a/test/unit/test_engine.py +++ b/test/unit/test_engine.py @@ -2,7 +2,7 @@ from argparse import Namespace from unittest.mock import patch -from ramalama.engine import Engine, containers, dry_run, images, stop_container +from ramalama.engine import Engine, containers, dry_run, images class TestEngine(unittest.TestCase): @@ -85,12 +85,6 @@ def test_containers(self, mock_run_cmd): self.assertEqual(result, ["container1", "container2"]) mock_run_cmd.assert_called_once() - @patch('ramalama.engine.run_cmd') - def test_stop_container(self, mock_run_cmd): - args = Namespace(engine="podman", debug=False, ignore=False) - stop_container(args, "test-container") - mock_run_cmd.assert_called_with(["podman", "stop", "-t=0", "test-container"], ignore_stderr=False) - def test_dry_run(self): with patch('sys.stdout') as mock_stdout: dry_run(["podman", "run", "--rm", "test-image"]) diff --git a/test/unit/test_model.py b/test/unit/test_model.py index 2f3fa62ea..990a43dc7 100644 --- a/test/unit/test_model.py +++ b/test/unit/test_model.py @@ -1,4 +1,5 @@ import socket +from argparse import 
Namespace from unittest.mock import MagicMock, Mock, patch import pytest @@ -114,6 +115,7 @@ def test_extract_model_identifiers(model_input: str, expected_name: str, expecte def test_compute_serving_port( inputPort: str, expectedRandomizedResult: list, expectedRandomPortsAvl: list, expectedOutput: str, expectedErr ): + args = Namespace(port=inputPort, debug=False, api="") mock_socket = socket.socket mock_socket.bind = MagicMock(side_effect=expectedRandomPortsAvl) mock_compute_ports = Mock(return_value=expectedRandomizedResult) @@ -122,8 +124,8 @@ def test_compute_serving_port( with patch('socket.socket', mock_socket): if expectedErr: with pytest.raises(expectedErr): - outputPort = compute_serving_port(inputPort, False) + outputPort = compute_serving_port(args, False) assert outputPort == expectedOutput else: - outputPort = compute_serving_port(inputPort, False) + outputPort = compute_serving_port(args, False) assert outputPort == expectedOutput
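To make the new --api llama-stack path easier to try, here is a minimal client sketch (not part of the patch) that probes the endpoints compute_serving_port() now prints when the API is llama-stack: the LlamaStack REST API on the chosen port and the OpenAI-compatible base under /v1/openai. The port number and the /v1/models route below are assumptions for illustration; substitute whatever "ramalama serve --api llama-stack --port 8321 MODEL" actually reports on your system.

import json
import urllib.request

# Assumed values for illustration only; use the URLs printed by
# "ramalama serve --api llama-stack" for your run.
LLAMA_STACK_URL = "http://localhost:8321"          # hostPort from the generated kube YAML
OPENAI_BASE = f"{LLAMA_STACK_URL}/v1/openai"       # "OpenAI RESTAPI" base printed by compute_serving_port


def get_json(url: str, timeout: float = 5.0):
    """Fetch a URL and decode the JSON response body."""
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


if __name__ == "__main__":
    # The "/v1/models" route is an assumption about the OpenAI-compatible
    # surface exposed under the printed base URL, not something this patch guarantees.
    try:
        print(get_json(f"{OPENAI_BASE}/v1/models"))
    except OSError as exc:
        print(f"stack not reachable at {OPENAI_BASE}: {exc}")

This only exercises the wiring added here (port selection, the printed URLs, and the pod started via podman kube play); it asserts nothing about the Llama Stack API surface itself.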