
Commit 2ee5ae9

Add support for llama-stack
Add a new option, --api, which lets users choose the API server: either llama-stack or none. With none, the serve command generates a plain service as before. With `--api llama-stack`, RamaLama generates an API server listening on port 8321 and an OpenAI-compatible server listening on port 8080. Signed-off-by: Daniel J Walsh <[email protected]>
1 parent 5220499 commit 2ee5ae9

18 files changed: +346, -60 lines
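For readers who want to sanity-check a running instance, here is a minimal probe sketch (not part of this commit), assuming `ramalama serve --api llama-stack MODEL` is already running locally and publishes the ports named in the commit message (8321 for llama-stack, 8080 for the OpenAI-compatible server); adjust the ports to whatever your container actually publishes.

```python
import json
import urllib.error
import urllib.request


def list_openai_models(base="http://localhost:8080"):
    # /v1/models is part of the standard OpenAI-compatible REST API.
    with urllib.request.urlopen(f"{base}/v1/models", timeout=5) as resp:
        return json.load(resp)


def llama_stack_alive(base="http://localhost:8321"):
    # Any HTTP answer (even an error status) means the server is up.
    try:
        urllib.request.urlopen(base, timeout=5)
        return True
    except urllib.error.HTTPError:
        return True
    except OSError:
        return False  # connection refused: server not running


if __name__ == "__main__":
    print("llama-stack up:", llama_stack_alive())
    print("openai models:", list_openai_models())
```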

docs/ramalama-run.1.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -26,6 +26,10 @@ URL support means if a model is on a web site or even on your local system, you
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none)
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries
```

docs/ramalama-serve.1.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -35,6 +35,10 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry. (default: none)
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries
```

docs/ramalama.conf

Lines changed: 5 additions & 0 deletions
```diff
@@ -17,6 +17,11 @@
 
 [ramalama]
 
+# unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+# Options: llama-stack, none
+#
+# api = "none"
+
 # OCI model car image
 # Image to use when building and pushing --type=car models
 #
```

docs/ramalama.conf.5.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -60,6 +60,11 @@ The ramalama table contains settings to configure and manage the OCI runtime.
 
 `[[ramalama]]`
 
+**api**="none"
+
+Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+Options: llama-stack, none
+
 **carimage**="registry.access.redhat.com/ubi9-micro:latest"
 
 OCI model car image
```

ramalama/cli.py

Lines changed: 18 additions & 11 deletions
```diff
@@ -27,9 +27,10 @@
 from ramalama.logger import configure_logger, logger
 from ramalama.migrate import ModelStoreImport
 from ramalama.model import MODEL_TYPES
-from ramalama.model_factory import ModelFactory
+from ramalama.model_factory import ModelFactory, New
 from ramalama.model_store import GlobalModelStore
 from ramalama.shortnames import Shortnames
+from ramalama.stack import Stack
 from ramalama.version import print_version, version
 
 shortnames = Shortnames()
@@ -468,9 +469,7 @@ def get_size(path):
 def _list_models(args):
     mycwd = os.getcwd()
     if args.use_model_store:
-        models = GlobalModelStore(args.store).list_models(
-            engine=args.engine, debug=args.debug, show_container=args.container
-        )
+        models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container)
         ret = []
         local_timezone = datetime.now().astimezone().tzinfo
 
@@ -734,6 +733,13 @@ def push_cli(args):
 
 
 def runtime_options(parser, command):
+    if command in ["run", "serve"]:
+        parser.add_argument(
+            "--api",
+            default=CONFIG["api"],
+            choices=["llama-stack", "none"],
+            help="unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.",
+        )
     parser.add_argument("--authfile", help="path of the authentication file")
     if command in ["run", "perplexity", "serve"]:
         parser.add_argument(
@@ -939,6 +945,13 @@ def serve_cli(args):
     if args.rag:
         _get_rag(args)
 
+    if args.api == "llama-stack":
+        if not args.container:
+            raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.")
+
+        stack = Stack(args)
+        return stack.serve()
+
     try:
         model = New(args.MODEL, args)
         model.serve(args)
@@ -1081,17 +1094,11 @@ def rm_cli(args):
     if len(args.MODEL) > 0:
         raise IndexError("can not specify --all as well MODEL")
 
-    models = GlobalModelStore(args.store).list_models(
-        engine=args.engine, debug=args.debug, show_container=args.container
-    )
+    models = GlobalModelStore(args.store).list_models(engine=args.engine, show_container=args.container)
 
     return _rm_model([model for model in models.keys()], args)
 
 
-def New(model, args, transport=CONFIG["transport"]):
-    return ModelFactory(model, args, transport=transport).create()
-
-
 def client_cli(args):
     """Handle client command execution"""
     client_args = ["ramalama-client-core", "-c", "2048", "--temp", "0.8", args.HOST] + args.ARGS
```
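To make the new control flow in `serve_cli()` easier to follow, here is a standalone sketch of the dispatch; `Stack` and `new_model` below are illustrative stand-ins for `ramalama.stack.Stack` and `ramalama.model_factory.New`, not the real implementations.

```python
from argparse import Namespace


class Stack:
    """Stand-in for ramalama.stack.Stack (illustrative stub)."""

    def __init__(self, args):
        self.args = args

    def serve(self):
        print(f"would start a llama-stack deployment for {self.args.MODEL}")


def new_model(name, args):
    """Stand-in for ramalama.model_factory.New()."""

    class Model:
        def serve(self, args):
            print(f"would llama-serve {name} on port {args.port}")

    return Model()


def serve(args: Namespace):
    # Mirrors the branch added to serve_cli(): llama-stack mode must run in a
    # container and is handled entirely by the Stack helper.
    if args.api == "llama-stack":
        if not args.container:
            raise ValueError("--api llama-stack cannot be combined with --nocontainer")
        return Stack(args).serve()
    # Default path: build the model object and serve it directly.
    return new_model(args.MODEL, args).serve(args)


serve(Namespace(api="llama-stack", container=True, MODEL="granite", port="8080"))
serve(Namespace(api="none", container=True, MODEL="granite", port="8080"))
```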

ramalama/config.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -89,6 +89,7 @@ def load_config_defaults(config: Dict[str, Any]):
             "MUSA_VISIBLE_DEVICES": "quay.io/ramalama/musa",
         },
     )
+    config.setdefault('api', 'none')
     config.setdefault('keep_groups', False)
     config.setdefault('ngl', -1)
     config.setdefault('threads', -1)
```
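The man-page note that the default "can be overridden in the ramalama.conf file" falls directly out of `dict.setdefault()` semantics: values read from the config file land in the dict first, and the default only fills the key when the file did not set it. A minimal sketch (the dict contents are made up):

```python
def load_config_defaults(config):
    # Same pattern as the commit: only set 'api' if the config file did not.
    config.setdefault('api', 'none')


from_conf = {'api': 'llama-stack'}   # as if ramalama.conf contained api = "llama-stack"
empty = {}                           # no ramalama.conf entry at all

load_config_defaults(from_conf)
load_config_defaults(empty)
print(from_conf['api'])  # llama-stack  (file value wins)
print(empty['api'])      # none         (default applies)
```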

ramalama/engine.py

Lines changed: 79 additions & 25 deletions
```diff
@@ -17,10 +17,11 @@ def __init__(self, args):
             "run",
             "--rm",
         ]
-        self.use_docker = os.path.basename(args.engine) == "docker"
-        self.use_podman = os.path.basename(args.engine) == "podman"
+        base = os.path.basename(args.engine)
+        self.use_docker = base == "docker"
+        self.use_podman = base == "podman"
         self.args = args
-        self.add_container_labels()
+        self.add_labels()
         self.add_device_options()
         self.add_env_option()
         self.add_network()
@@ -37,19 +38,11 @@ def __init__(self, args):
     def add_label(self, label):
         self.add(["--label", label])
 
-    def add_container_labels(self):
-        label_map = {
-            "MODEL": "ai.ramalama.model",
-            "engine": "ai.ramalama.engine",
-            "runtime": "ai.ramalama.runtime",
-            "port": "ai.ramalama.port",
-            "subcommand": "ai.ramalama.command",
-        }
-        for arg, label_prefix in label_map.items():
-            if hasattr(self.args, arg):
-                value = getattr(self.args, arg)
-                if value:
-                    self.add_label(f"{label_prefix}={value}")
+    def add_name(self, name):
+        self.add(["--name", name])
+
+    def add_labels(self):
+        add_labels(self.args, self.add_label)
 
     def add_pull_newer(self):
         if not self.args.dryrun and self.use_docker and self.args.pull == "newer":
@@ -89,6 +82,9 @@ def add_privileged_options(self):
                 "--security-opt=no-new-privileges",
             ]
 
+    def cap_add(self, cap):
+        self.exec_args += ["--cap-add", cap]
+
     def add_subcommand_env(self):
         if EMOJI and hasattr(self.args, "subcommand") and self.args.subcommand == "run":
             if os.path.basename(self.args.engine) == "podman":
@@ -110,7 +106,12 @@ def add_detach_option(self):
             self.exec_args += ["-d"]
 
     def add_port_option(self):
-        if hasattr(self.args, "port"):
+        if not hasattr(self.args, "port") or not self.args.port or self.args.port == "":
+            return
+
+        if self.args.port.count(":") > 0:
+            self.exec_args += ["-p", self.args.port]
+        else:
             self.exec_args += ["-p", f"{self.args.port}:{self.args.port}"]
 
     def add_device_options(self):
```
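The reworked `add_port_option()` now accepts either a bare port or an explicit `host:container` mapping, and publishes nothing when no port is set. A standalone sketch of just that decision (the helper name is illustrative):

```python
def port_args(port: str) -> list[str]:
    # Empty or missing port: publish nothing.
    if not port:
        return []
    # Explicit "host:container" mapping is passed through unchanged.
    if ":" in port:
        return ["-p", port]
    # A bare port is mapped onto itself.
    return ["-p", f"{port}:{port}"]


print(port_args("8080"))       # ['-p', '8080:8080']
print(port_args("8080:8321"))  # ['-p', '8080:8321']
print(port_args(""))           # []
```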
```diff
@@ -242,26 +243,79 @@ def info(args):
         return str(e)
 
 
-def stop_container(args, name):
+def inspect(args, name, format=None, ignore_stderr=False):
     if not name:
         raise ValueError("must specify a container name")
     conman = args.engine
     if conman == "":
         raise ValueError("no container manager (Podman, Docker) found")
 
-    conman_args = [conman, "stop", "-t=0"]
-    ignore_stderr = False
-    if args.ignore:
-        if conman == "podman":
-            conman_args += ["--ignore", str(args.ignore)]
-        else:
-            ignore_stderr = True
+    conman_args = [conman, "inspect"]
+    if format:
+        conman_args += ["--format", format]
 
     conman_args += [name]
+    return run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug).stdout.decode("utf-8").strip()
+
+
+def stop_container(args, name):
+    if not name:
+        raise ValueError("must specify a container name")
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    ignore_stderr = False
+    pod = ""
+    try:
+        pod = inspect(args, name, format="{{ .Pod }}", ignore_stderr=True)
+    except Exception:  # Ignore errors, the stop command will handle it.
+        pass
+
+    if pod != "":
+        conman_args = [conman, "pod", "rm", "-t=0", "--ignore", "--force", pod]
+    else:
+        conman_args = [conman, "stop", "-t=0"]
+        if args.ignore:
+            if conman == "podman":
+                conman_args += ["--ignore", str(args.ignore)]
+            else:
+                ignore_stderr = True
+
+        conman_args += [name]
     try:
         run_cmd(conman_args, ignore_stderr=ignore_stderr)
     except subprocess.CalledProcessError:
         if args.ignore and conman == "docker":
             return
         else:
             raise
+
+
+def container_connection(args, name, port):
+    if not name:
+        raise ValueError("must specify a container name")
+    if not port:
+        raise ValueError("must specify a port to check")
+
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    conman_args = [conman, "port", name, port]
+    output = run_cmd(conman_args, debug=args.debug).stdout.decode("utf-8").strip()
+    return "" if output == "" else output.split(">")[-1].strip()
+
+
+def add_labels(args, add_label):
+    label_map = {
+        "MODEL": "ai.ramalama.model",
+        "engine": "ai.ramalama.engine",
+        "runtime": "ai.ramalama.runtime",
+        "port": "ai.ramalama.port",
+        "subcommand": "ai.ramalama.command",
+    }
+    for arg, label_prefix in label_map.items():
+        if hasattr(args, arg):
+            if value := getattr(args, arg):
+                add_label(f"{label_prefix}={value}")
```
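The new `container_connection()` helper reduces the output of `<engine> port NAME PORT` to the host-side endpoint after the arrow. A sketch of that parsing with a made-up sample line (real `podman port` output typically looks like `8080/tcp -> 0.0.0.0:43219`):

```python
def host_endpoint(port_output: str) -> str:
    # Keep only the "host:port" part to the right of the last '>'.
    output = port_output.strip()
    return "" if output == "" else output.split(">")[-1].strip()


print(host_endpoint("8080/tcp -> 0.0.0.0:43219"))  # 0.0.0.0:43219
print(host_endpoint(""))                            # empty: port not published
```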

ramalama/model.py

Lines changed: 15 additions & 11 deletions
```diff
@@ -249,7 +249,6 @@ def add_rag(self, exec_args, args):
     def setup_container(self, args):
         name = self.get_container_name(args)
         self.base(args, name)
-        self.engine.add_container_labels()
 
     def gpu_args(self, args, runner=False):
         gpu_args = []
@@ -293,6 +292,9 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         self.setup_mounts(model_path, args)
         self.handle_rag_mode(args, cmd_args)
 
+        # Make sure Image precedes cmd_args
+        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
+
         if args.dryrun:
             self.engine.dryrun()
             return True
@@ -340,9 +342,6 @@ def handle_rag_mode(self, args, cmd_args):
         if hasattr(args, "rag") and args.rag:
             args.image = args.image.split(":")[0]
 
-        # Make sure Image precedes cmd_args
-        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
-
     def bench(self, args):
         model_path = self.get_model_path(args)
         exec_args = self.build_exec_args_bench(args, model_path)
@@ -624,6 +623,7 @@ def serve(self, args, quiet=False):
         else:
             mnt_file = MNT_FILE
 
+        args.port = compute_serving_port(args, quiet=quiet or args.generate)
         exec_model_path = mnt_file if args.container or args.generate else model_path
         chat_template_path = ""
         mmproj_path = ""
@@ -730,16 +730,20 @@ def get_available_port_if_any() -> int:
     return chosen_port
 
 
-def compute_serving_port(port: str, quiet=False) -> str:
+def compute_serving_port(args, quiet=False) -> str:
     # user probably specified a custom port, don't override the choice
-    if port != "" and port != str(DEFAULT_PORT):
-        return port
-
-    # otherwise compute a random serving port in the range
-    target_port = get_available_port_if_any()
+    if args.port not in ["", str(DEFAULT_PORT)]:
+        target_port = args.port
+    else:
+        # otherwise compute a random serving port in the range
+        target_port = get_available_port_if_any()
 
     if target_port == 0:
         raise IOError("no available port could be detected. Please ensure you have enough free ports.")
     if not quiet:
-        print(f"serving on port {target_port}")
+        openai = f"http://localhost:{target_port}"
+        if args.api == "llama-stack":
+            print(f"LlamaStack RESTAPI: {openai}")
+            openai = openai + "/v1/openai"
+        print(f"OpenAI RESTAPI: {openai}")
     return str(target_port)
```
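The endpoint announcement added to `compute_serving_port()` can be read in isolation: with `--api llama-stack` the chosen port serves the llama-stack REST API and the OpenAI-compatible API is nested under `/v1/openai`; otherwise the port itself is the OpenAI endpoint. A small sketch that mirrors the printed URLs (port values are examples):

```python
def announce(port: str, api: str) -> None:
    openai = f"http://localhost:{port}"
    if api == "llama-stack":
        print(f"LlamaStack RESTAPI: {openai}")
        openai = openai + "/v1/openai"
    print(f"OpenAI RESTAPI: {openai}")


announce("8321", "llama-stack")
# LlamaStack RESTAPI: http://localhost:8321
# OpenAI RESTAPI: http://localhost:8321/v1/openai
announce("8080", "none")
# OpenAI RESTAPI: http://localhost:8080
```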

ramalama/model_factory.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@
 from urllib.parse import urlparse
 
 from ramalama.common import rm_until_substring
+from ramalama.config import CONFIG
 from ramalama.huggingface import Huggingface
 from ramalama.model import MODEL_TYPES, SPLIT_MODEL_RE, is_split_file_model
 from ramalama.model_store import GlobalModelStore, ModelStore
@@ -148,3 +149,20 @@ def create_url(self) -> URL:
         model.split_model = self.split_model
         model.mnt_path = self.mnt_path
         return model
+
+
+def New(name, args, transport=CONFIG["transport"]):
+    return ModelFactory(name, args, transport=transport).create()
+
+
+def Serve(name, args):
+    model = New(name, args)
+    try:
+        model.serve(args)
+    except KeyError as e:
+        try:
+            args.quiet = True
+            model = ModelFactory(name, args, ignore_stderr=True).create_oci()
+            model.serve(args)
+        except Exception:
+            raise e
```
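The new `Serve()` helper wraps the resolved model's `serve()` in a KeyError fallback to the OCI path, re-raising the original error if the retry also fails. A standalone sketch of that retry pattern with illustrative stand-in functions:

```python
def serve_resolved(name):
    # Stand-in for the first attempt; pretend the store has no such model.
    raise KeyError(name)


def serve_oci(name):
    # Stand-in for the OCI fallback path.
    print(f"served {name} as an OCI artifact")


def serve(name):
    try:
        serve_resolved(name)
    except KeyError as e:
        try:
            serve_oci(name)
        except Exception:
            # Surface the original, more descriptive error, not the retry's.
            raise e


serve("quay.io/example/tinyllama:latest")
```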

ramalama/model_store.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -203,7 +203,7 @@ def __init__(
     def path(self) -> str:
         return self._store_base_path
 
-    def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[str, List[ModelFile]]:
+    def list_models(self, engine: str, show_container: bool) -> Dict[str, List[ModelFile]]:
        models: Dict[str, List[ModelFile]] = {}
 
        for root, subdirs, _ in os.walk(self.path):
@@ -247,7 +247,6 @@ def list_models(self, engine: str, debug: bool, show_container: bool) -> Dict[st
                    dotdict(
                        {
                            "engine": engine,
-                           "debug": debug,
                        }
                    )
                )
```
