Commit 48fa9ed

Add support for llama-stack
Add a new option, --api, which allows users to specify the API server: either llama-stack or none. With none, we simply generate a service with the serve command. With `--api llama-stack`, RamaLama will generate a Llama Stack API server listening on port 8321 and an OpenAI-compatible server listening on port 8080.

Signed-off-by: Daniel J Walsh <[email protected]>
1 parent c87726d commit 48fa9ed
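
The two servers described above can be exercised with any OpenAI-compatible client. Below is a minimal sketch, not part of the commit: the port numbers come from the commit message, the /v1/models route matches what the bundled ramalama-client-core polls, and the JSON shape of the response is an assumption based on typical OpenAI-style servers.

    # Minimal sketch (not from this commit): query the OpenAI-compatible server
    # that `ramalama serve --api llama-stack` exposes on port 8080; the Llama
    # Stack API itself listens on port 8321, per the commit message.
    import json
    import urllib.request

    OPENAI_BASE = "http://localhost:8080/v1"  # assumed default from the commit message

    def list_models(base: str = OPENAI_BASE) -> list:
        # Same /v1/models route that the bundled ramalama-client-core polls.
        with urllib.request.urlopen(f"{base}/models") as response:
            payload = json.load(response)
        # The {"data": [{"id": ...}]} shape is an assumption (OpenAI-style listing).
        return [entry.get("id", "") for entry in payload.get("data", [])]

    if __name__ == "__main__":
        print(list_models())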

18 files changed, +352 -60 lines

docs/ramalama-run.1.md

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ URL support means if a model is on a web site or even on your local system, you
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry (default: none).
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries

docs/ramalama-serve.1.md

Lines changed: 4 additions & 0 deletions
@@ -35,6 +35,10 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp
 
 ## OPTIONS
 
+#### **--api**=**llama-stack** | **none**
+unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry (default: none).
+The default can be overridden in the ramalama.conf file.
+
 #### **--authfile**=*password*
 path of the authentication file for OCI registries

docs/ramalama.conf

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,11 @@
 
 [ramalama]
 
+# Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+# Options: llama-stack, none
+#
+# api = "none"
+
 # OCI model car image
 # Image to use when building and pushing --type=car models
 #

docs/ramalama.conf.5.md

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ The ramalama table contains settings to configure and manage the OCI runtime.
 
 `[[ramalama]]`
 
+**api**="none"
+
+Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+Options: llama-stack, none
+
 **carimage**="registry.access.redhat.com/ubi9-micro:latest"
 
 OCI model car image

libexec/ramalama/ramalama-client-core

Lines changed: 10 additions & 7 deletions
@@ -11,8 +11,6 @@ import time
 import urllib.error
 import urllib.request
 
-from ramalama.common import perror
-
 
 def should_colorize():
     t = os.getenv("TERM")
@@ -59,9 +57,14 @@ class RamaLamaShell(cmd.Cmd):
 
         self.url = f"{parsed_args.host}/v1/chat/completions"
         self.models_url = f"{parsed_args.host}/v1/models"
-        self.models = self.get_models()
-
-    def get_models(self):
+        self.models = []
+
+    def model(self):
+        if len(self.models) == 0:
+            self.models = self._models()
+        return self.models[0]
+
+    def _models(self):
         request = urllib.request.Request(self.models_url, method="GET")
         response = urllib.request.urlopen(request)
         for line in response:
@@ -91,7 +94,7 @@ class RamaLamaShell(cmd.Cmd):
         data = {
             "stream": True,
             "messages": self.conversation_history,
-            "model": self.models[0],
+            "model": self.model(),
         }
 
         return data
@@ -126,7 +129,7 @@ class RamaLamaShell(cmd.Cmd):
         if response:
             return res(response, self.parsed_args.color)
 
-        perror(f"\rError: could not connect to: {self.url}")
+        print(f"\rError: could not connect to: {self.url}", file=sys.stderr)
         self.kills(self.parsed_args)
 
         return None
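
The client change above replaces the eager get_models() call with a lazy, cached lookup (model()/_models()), so the model list is fetched only when first needed. A generic sketch of that pattern, using hypothetical names rather than the client's own:

    # Hypothetical sketch of the lazy-caching pattern used by model()/_models():
    # the expensive fetch runs only on first use and its result is reused after.
    class LazyModels:
        def __init__(self, fetch):
            self._fetch = fetch  # callable that actually hits /v1/models
            self._models = []

        def first(self):
            if len(self._models) == 0:
                self._models = self._fetch()
            return self._models[0]

    models = LazyModels(lambda: ["tinyllama"])
    print(models.first())  # fetched once, cached for later calls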

ramalama/cli.py

Lines changed: 16 additions & 5 deletions
@@ -26,9 +26,10 @@
 from ramalama.config import CONFIG
 from ramalama.migrate import ModelStoreImport
 from ramalama.model import MODEL_TYPES
-from ramalama.model_factory import ModelFactory
+from ramalama.model_factory import ModelFactory, New
 from ramalama.model_store import GlobalModelStore
 from ramalama.shortnames import Shortnames
+from ramalama.stack import Stack
 from ramalama.version import print_version, version
 
 shortnames = Shortnames()
@@ -732,6 +733,13 @@ def push_cli(args):
 
 
 def runtime_options(parser, command):
+    if command in ["run", "serve"]:
+        parser.add_argument(
+            "--api",
+            default=CONFIG["api"],
+            choices=["llama-stack", "none"],
+            help="unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.",
+        )
     parser.add_argument("--authfile", help="path of the authentication file")
     if command in ["run", "perplexity", "serve"]:
         parser.add_argument(
@@ -937,6 +945,13 @@ def serve_cli(args):
     if args.rag:
         _get_rag(args)
 
+    if args.api == "llama-stack":
+        if not args.container:
+            raise ValueError("ramalama serve --api llama-stack command cannot be run with the --nocontainer option.")
+
+        stack = Stack(args)
+        return stack.serve()
+
     try:
         model = New(args.MODEL, args)
         model.serve(args)
@@ -1086,10 +1101,6 @@ def rm_cli(args):
     return _rm_model([model for model in models.keys()], args)
 
 
-def New(model, args, transport=CONFIG["transport"]):
-    return ModelFactory(model, args, transport=transport).create()
-
-
 def client_cli(args):
     """Handle client command execution"""
     client_args = ["ramalama-client-core", "-c", "2048", "--temp", "0.8", args.HOST] + args.ARGS

ramalama/config.py

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ def load_config_defaults(config: Dict[str, Any]):
             "MUSA_VISIBLE_DEVICES": "quay.io/ramalama/musa",
         },
     )
+    config.setdefault('api', 'none')
    config.setdefault('keep_groups', False)
    config.setdefault('ngl', -1)
    config.setdefault('threads', -1)
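
For illustration, a minimal sketch (hypothetical helper name, not project code) of how the setdefault-based default added above behaves: a value parsed from ramalama.conf is kept, and "none" is used only when the key is absent.

    # Hypothetical helper: same setdefault behavior as the line added above.
    def apply_api_default(config: dict) -> dict:
        config.setdefault('api', 'none')
        return config

    print(apply_api_default({}))                      # {'api': 'none'}
    print(apply_api_default({'api': 'llama-stack'}))  # {'api': 'llama-stack'}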

ramalama/engine.py

Lines changed: 79 additions & 25 deletions
@@ -17,10 +17,11 @@ def __init__(self, args):
             "run",
             "--rm",
         ]
-        self.use_docker = os.path.basename(args.engine) == "docker"
-        self.use_podman = os.path.basename(args.engine) == "podman"
+        base = os.path.basename(args.engine)
+        self.use_docker = base == "docker"
+        self.use_podman = base == "podman"
         self.args = args
-        self.add_container_labels()
+        self.add_labels()
         self.add_device_options()
         self.add_env_option()
         self.add_network()
@@ -38,19 +39,11 @@ def __init__(self, args):
     def add_label(self, label):
         self.add(["--label", label])
 
-    def add_container_labels(self):
-        label_map = {
-            "MODEL": "ai.ramalama.model",
-            "engine": "ai.ramalama.engine",
-            "runtime": "ai.ramalama.runtime",
-            "port": "ai.ramalama.port",
-            "subcommand": "ai.ramalama.command",
-        }
-        for arg, label_prefix in label_map.items():
-            if hasattr(self.args, arg):
-                value = getattr(self.args, arg)
-                if value:
-                    self.add_label(f"{label_prefix}={value}")
+    def add_name(self, name):
+        self.add(["--name", name])
+
+    def add_labels(self):
+        add_labels(self.args, self.add_label)
 
     def add_pull_newer(self):
         if not self.args.dryrun and self.use_docker and self.args.pull == "newer":
@@ -90,6 +83,9 @@ def add_privileged_options(self):
                 "--security-opt=no-new-privileges",
             ]
 
+    def cap_add(self, cap):
+        self.exec_args += ["--cap-add", cap]
+
     def add_subcommand_env(self):
         if EMOJI and hasattr(self.args, "subcommand") and self.args.subcommand == "run":
             if os.path.basename(self.args.engine) == "podman":
@@ -111,7 +107,12 @@ def add_detach_option(self):
             self.exec_args += ["-d"]
 
     def add_port_option(self):
-        if hasattr(self.args, "port"):
+        if not hasattr(self.args, "port") or not self.args.port or self.args.port == "":
+            return
+
+        if self.args.port.count(":") > 0:
+            self.exec_args += ["-p", self.args.port]
+        else:
             self.exec_args += ["-p", f"{self.args.port}:{self.args.port}"]
 
     def add_device_options(self):
@@ -243,26 +244,79 @@ def info(args):
         return str(e)
 
 
-def stop_container(args, name):
+def inspect(args, name, format=None, ignore_stderr=False):
     if not name:
         raise ValueError("must specify a container name")
     conman = args.engine
     if conman == "":
         raise ValueError("no container manager (Podman, Docker) found")
 
-    conman_args = [conman, "stop", "-t=0"]
-    ignore_stderr = False
-    if args.ignore:
-        if conman == "podman":
-            conman_args += ["--ignore", str(args.ignore)]
-        else:
-            ignore_stderr = True
+    conman_args = [conman, "inspect"]
+    if format:
+        conman_args += ["--format", format]
 
     conman_args += [name]
+    return run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug).stdout.decode("utf-8").strip()
+
+
+def stop_container(args, name):
+    if not name:
+        raise ValueError("must specify a container name")
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    ignore_stderr = False
+    pod = ""
+    try:
+        pod = inspect(args, name, format="{{ .Pod }}", ignore_stderr=True)
+    except Exception:  # Ignore errors, the stop command will handle it.
+        pass
+
+    if pod != "":
+        conman_args = [conman, "pod", "rm", "-t=0", "--ignore", "--force", pod]
+    else:
+        conman_args = [conman, "stop", "-t=0"]
+        if args.ignore:
+            if conman == "podman":
+                conman_args += ["--ignore", str(args.ignore)]
+            else:
+                ignore_stderr = True
+
+    conman_args += [name]
     try:
         run_cmd(conman_args, ignore_stderr=ignore_stderr, debug=args.debug)
     except subprocess.CalledProcessError:
         if args.ignore and conman == "docker":
             return
         else:
             raise
+
+
+def container_connection(args, name, port):
+    if not name:
+        raise ValueError("must specify a container name")
+    if not port:
+        raise ValueError("must specify a port to check")
+
+    conman = args.engine
+    if conman == "":
+        raise ValueError("no container manager (Podman, Docker) found")
+
+    conman_args = [conman, "port", name, port]
+    output = run_cmd(conman_args, debug=args.debug).stdout.decode("utf-8").strip()
+    return "" if output == "" else output.split(">")[-1].strip()
+
+
+def add_labels(args, add_label):
+    label_map = {
+        "MODEL": "ai.ramalama.model",
+        "engine": "ai.ramalama.engine",
+        "runtime": "ai.ramalama.runtime",
+        "port": "ai.ramalama.port",
+        "subcommand": "ai.ramalama.command",
+    }
+    for arg, label_prefix in label_map.items():
+        if hasattr(args, arg):
+            if value := getattr(args, arg):
+                add_label(f"{label_prefix}={value}")
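
The new container_connection() helper shells out to `<engine> port <name> <port>` and keeps only the host-side address. A standalone sketch of just that parsing step follows; the example output string ("8080/tcp -> 0.0.0.0:8080") is an assumption about typical podman/docker port output.

    # Standalone sketch of the parsing at the end of container_connection():
    # keep what follows the "->" arrow, or return "" when nothing is published.
    def parse_port_mapping(output: str) -> str:
        output = output.strip()
        return "" if output == "" else output.split(">")[-1].strip()

    print(parse_port_mapping("8080/tcp -> 0.0.0.0:8080"))  # -> "0.0.0.0:8080"
    print(parse_port_mapping(""))                          # -> ""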

ramalama/model.py

Lines changed: 15 additions & 12 deletions
@@ -248,7 +248,6 @@ def add_rag(self, exec_args, args):
     def setup_container(self, args):
         name = self.get_container_name(args)
         self.base(args, name)
-        self.engine.add_container_labels()
 
     def gpu_args(self, args, runner=False):
         gpu_args = []
@@ -292,6 +291,9 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         self.setup_mounts(model_path, args)
         self.handle_rag_mode(args, cmd_args)
 
+        # Make sure Image precedes cmd_args
+        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
+
         if args.dryrun:
             self.engine.dryrun()
             return True
@@ -339,9 +341,6 @@ def handle_rag_mode(self, args, cmd_args):
         if hasattr(args, "rag") and args.rag:
             args.image = args.image.split(":")[0]
 
-        # Make sure Image precedes cmd_args
-        self.engine.add([accel_image(CONFIG, args)] + cmd_args)
-
     def bench(self, args):
         model_path = self.get_model_path(args)
         exec_args = self.build_exec_args_bench(args, model_path)
@@ -616,13 +615,13 @@ def execute_command(self, model_path, exec_args, args):
 
     def serve(self, args, quiet=False):
         self.validate_args(args)
-        args.port = compute_serving_port(args.port, args.debug, quiet)
         model_path = self.get_model_path(args)
         if is_split_file_model(model_path):
             mnt_file = MNT_DIR + '/' + self.mnt_path
         else:
             mnt_file = MNT_FILE
 
+        args.port = compute_serving_port(args, quiet=quiet or args.generate)
         exec_model_path = mnt_file if args.container or args.generate else model_path
         chat_template_path = ""
         mmproj_path = ""
@@ -730,16 +729,20 @@ def get_available_port_if_any(debug: bool) -> int:
     return chosen_port
 
 
-def compute_serving_port(port: str, debug: bool, quiet=False) -> str:
+def compute_serving_port(args, quiet=False) -> str:
     # user probably specified a custom port, don't override the choice
-    if port != "" and port != str(DEFAULT_PORT):
-        return port
-
-    # otherwise compute a random serving port in the range
-    target_port = get_available_port_if_any(debug)
+    if args.port not in ["", str(DEFAULT_PORT)]:
+        target_port = args.port
+    else:
+        # otherwise compute a random serving port in the range
+        target_port = get_available_port_if_any(args.debug)
 
     if target_port == 0:
         raise IOError("no available port could be detected. Please ensure you have enough free ports.")
     if not quiet:
-        print(f"serving on port {target_port}")
+        openai = f"http://localhost:{target_port}"
+        if args.api == "llama-stack":
+            print(f"LlamaStack RESTAPI: {openai}")
+            openai = openai + "/v1/openai"
+        print(f"OpenAI RESTAPI: {openai}")
     return str(target_port)