diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md
index d15cf791..f57d99c4 100644
--- a/docs/ramalama.1.md
+++ b/docs/ramalama.1.md
@@ -77,6 +77,9 @@ show this help message and exit
 do not run ramalama in the default container (default: False)
 use environment variable "RAMALAMA_IN_CONTAINER=false" to change default.
 
+#### **--runtime**
+specify the runtime to use, valid options are 'llama.cpp' and 'vllm' (default: llama.cpp)
+
 #### **--store**=STORE
 store AI Models in the specified directory (default rootless: `$HOME/.local/share/ramalama`, default rootful: `/var/lib/ramalama`)
 
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0732ba30..264cb2c7 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -38,6 +38,12 @@ def init_cli():
     )
     parser.add_argument("--store", default=get_store(), help="store AI Models in the specified directory")
     parser.add_argument("--dryrun", action="store_true", help="show container runtime command without executing it")
+    parser.add_argument(
+        "--runtime",
+        default="llama.cpp",
+        choices=["llama.cpp", "vllm"],
+        help="specify the runtime to use, valid options are 'llama.cpp' and 'vllm'",
+    )
     parser.add_argument(
         "--nocontainer",
         default=not use_container(),
@@ -316,7 +322,7 @@ def run_parser(subparsers):
     parser.add_argument("--prompt", dest="prompt", action="store_true", help="modify chatbot prompt")
     parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
     parser.add_argument("MODEL")  # positional argument
-    parser.add_argument("ARGS", nargs="*", help="Additional options to pass to the AI Model")
+    parser.add_argument("ARGS", nargs="*", help="additional options to pass to the AI Model")
     parser.set_defaults(func=run_cli)
 
 
diff --git a/ramalama/model.py b/ramalama/model.py
index 92a2d228..e94d40c4 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -102,5 +102,8 @@ def run(self, args):
 
     def serve(self, args):
         symlink_path = self.pull(args)
-        exec_args = ["llama-server", "--port", args.port, "-m", symlink_path] + self.common_params
+        exec_args = ["llama-server", "--port", args.port, "-m", symlink_path]
+        if args.runtime == "vllm":
+            exec_args = ["vllm", "serve", "--port", args.port, symlink_path]
+
         exec_cmd(exec_args)
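
For reviewers, here is a minimal, self-contained sketch of the dispatch pattern this change introduces: a top-level `--runtime` flag whose value selects the serving command. `build_exec_args` is a hypothetical helper used only for illustration; in ramalama itself the command list is built inline in `Model.serve()`, and `args.port` arrives as a string.

```python
import argparse


def build_exec_args(runtime, port, model_path):
    # Default to llama.cpp's llama-server; swap in the vllm command
    # wholesale when the vllm runtime is selected, as Model.serve() does.
    exec_args = ["llama-server", "--port", port, "-m", model_path]
    if runtime == "vllm":
        exec_args = ["vllm", "serve", "--port", port, model_path]
    return exec_args


parser = argparse.ArgumentParser()
parser.add_argument(
    "--runtime",
    default="llama.cpp",
    choices=["llama.cpp", "vllm"],  # argparse rejects any other value
    help="specify the runtime to use, valid options are 'llama.cpp' and 'vllm'",
)

args = parser.parse_args(["--runtime", "vllm"])
print(build_exec_args(args.runtime, "8080", "/path/to/model.gguf"))
# ['vllm', 'serve', '--port', '8080', '/path/to/model.gguf']
```

Because the flag is registered on the top-level parser in `init_cli()`, it precedes the subcommand on the command line, e.g. `ramalama --runtime vllm serve MODEL`.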