Merged (changes from 1 commit)
3 changes: 3 additions & 0 deletions docs/ramalama-run.1.md
@@ -64,6 +64,9 @@ Show this help message and exit
 #### **--keepalive**
 duration to keep a model loaded (e.g. 5m)
 
+#### **--max-model-len**
+Maximum model context length for vLLM, in tokens (default: 2048)
+
 #### **--name**, **-n**
 name of the container to run the Model in
 
3 changes: 3 additions & 0 deletions docs/ramalama-serve.1.md
@@ -87,6 +87,9 @@ show this help message and exit
 #### **--host**="0.0.0.0"
 IP address for llama.cpp to listen on.
 
+#### **--max-model-len**
+Maximum model context length for vLLM, in tokens (default: 2048)
+
 #### **--model-draft**
 
 
7 changes: 7 additions & 0 deletions ramalama/cli.py
@@ -857,6 +857,13 @@ def runtime_options(parser, command):
     parser.add_argument(
         "--rag", help="RAG vector database or OCI Image to be served with the model", completer=local_models
     )
+    parser.add_argument(
+        "--max-model-len",
+        dest="vllm_max_model_len",
+        type=int,
+        help="Maximum model length for vLLM",
+        completer=suppressCompleter,
+    )
     if command in ["perplexity", "run", "serve"]:
         parser.add_argument(
             "--runtime-args",
58 changes: 54 additions & 4 deletions ramalama/model.py
@@ -290,7 +290,28 @@ def exec_model_in_container(self, model_path, cmd_args, args):
         return True
 
     def setup_mounts(self, model_path, args):
Contributor:
issue (code-quality): Low code quality found in Model.setup_mounts - 16% (low-code-quality)


Explanation: The quality score for this function is below the quality threshold of 25%. This score is a combination of the method length, cognitive complexity and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable.

  • Reduce the function length by extracting pieces of functionality out into their own functions. This is the most important thing you can do - ideally a function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts sits together within the function rather than being scattered.
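
One possible shape for the refactor this comment suggests: pull the vLLM mount resolution out into a small helper and use early exits. This is only an illustrative sketch against the new code shown below; the helper name vllm_mount_source is invented here, and store, model_tag, and MNT_DIR stand in for the attributes and constant used by ramalama's Model class.

import os

def vllm_mount_source(store, model_tag, model_path):
    """Return the host directory to bind-mount for vLLM, or an empty string if unknown."""
    if store is not None:
        ref_file = store.get_ref_file(model_tag)
        if ref_file is not None and getattr(ref_file, 'hash', None):
            return store.model_base_directory
    # Fall back to file:// paths handed directly to vLLM.
    if model_path and os.path.isfile(model_path):
        return os.path.dirname(model_path)
    if model_path and os.path.isdir(model_path):
        return model_path
    return ""

# The vLLM branch of setup_mounts could then shrink to roughly:
#
#     model_base = vllm_mount_source(self.store, self.model_tag, model_path)
#     if not model_base:
#         raise ValueError(...)
#     self.engine.add([f"--mount=type=bind,src={model_base},destination={MNT_DIR},ro"])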

-        if model_path and os.path.exists(model_path):
+        if args.runtime == "vllm":
+            model_base = ""
+            if self.store and hasattr(self, 'model_tag'):
+                ref_file = self.store.get_ref_file(self.model_tag)
+                if ref_file and hasattr(ref_file, 'hash'):
+                    model_base = self.store.model_base_directory
+            if not model_base:
+                # Might be needed for file:// paths directly used with vLLM.
+                if model_path and os.path.exists(model_path):
+                    if os.path.isfile(model_path):
+                        model_base = os.path.dirname(model_path)
+                    elif os.path.isdir(model_path):
+                        model_base = model_path
+            if model_base:
+                self.engine.add([f"--mount=type=bind,src={model_base},destination={MNT_DIR},ro"])
+            else:
+                raise ValueError(
+                    f'Could not determine a valid host directory to mount for model {self.model}. '
+                    + 'Ensure the model path is correct or the model store is properly configured.'
+                )
+
+        elif model_path and os.path.exists(model_path):
             if hasattr(self, 'split_model'):
                 self.engine.add([f"--mount=type=bind,src={model_path},destination={MNT_DIR}/{self.mnt_path},ro"])
 
@@ -531,9 +552,38 @@ def build_exec_args_serve(self, args, exec_model_path, chat_template_path="", mm
     def handle_runtime(self, args, exec_args, exec_model_path):
         set_accel_env_vars()
         if args.runtime == "vllm":
-            exec_model_path = os.path.dirname(exec_model_path)
-            # Left out "vllm", "serve" the image entrypoint already starts it
-            exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
+            container_model_path = ""
Contributor:
issue (code-quality): We've found these issues:

+            ref_file = None
+            if self.store:
+                ref_file = self.store.get_ref_file(self.model_tag)
+
+            if ref_file and ref_file.hash:
+                snapshot_dir_name = ref_file.hash
+                container_model_path = os.path.join(MNT_DIR, "snapshots", snapshot_dir_name)
+            else:
+                current_model_host_path = self.get_model_path(args)
+                if os.path.isdir(current_model_host_path):
+                    container_model_path = MNT_DIR
+                else:
+                    container_model_path = os.path.join(MNT_DIR, os.path.basename(current_model_host_path))
+
+            vllm_max_model_len = 2048
+            if args.vllm_max_model_len:
+                vllm_max_model_len = args.vllm_max_model_len
+
+            exec_args = [
+                "--port",
+                str(args.port),
+                "--model",
+                str(container_model_path),
+                "--max_model_len",
+                str(vllm_max_model_len),
+                "--served-model-name",
+                self.model_name,
+            ]
+
+            if hasattr(args, 'runtime_args') and args.runtime_args:
+                exec_args.extend(args.runtime_args)
         else:
             gpu_args = self.gpu_args(args=args)
             if gpu_args is not None:
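
As a rough illustration of the behavior above, with example values that are not part of the change: the container entrypoint ends up receiving an argument list like the one below, and 2048 is used whenever --max-model-len is not supplied.

# Standalone sketch mirroring the exec_args assembly in handle_runtime.
def build_vllm_args(port, container_model_path, served_model_name, vllm_max_model_len=None):
    max_len = vllm_max_model_len if vllm_max_model_len else 2048  # default matches the docs
    return [
        "--port", str(port),
        "--model", str(container_model_path),
        "--max_model_len", str(max_len),
        "--served-model-name", served_model_name,
    ]

# e.g. ramalama --runtime=vllm serve --port 8080 --max-model-len 4096 <model>
print(build_vllm_args(8080, "/mnt/models/snapshots/<hash>", "tinyllama", 4096))
# ['--port', '8080', '--model', '/mnt/models/snapshots/<hash>',
#  '--max_model_len', '4096', '--served-model-name', 'tinyllama']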
10 changes: 6 additions & 4 deletions test/system/040-serve.bats
@@ -250,6 +250,11 @@ verify_begin=".*run --rm"
 rm $name.image
 fi
 
+run_ramalama rm oci://${ociimage}
+done
+stop_registry
+skip "vLLM can't serve GGUFs, needs tiny safetensor"
+
 run_ramalama --runtime=vllm serve --authfile=$authfile --tls-verify=false --name=${name} --port 1234 --generate=kube oci://${ociimage}
 is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"
 
@@ -265,10 +270,7 @@ verify_begin=".*run --rm"
 is "$output" ".*reference: ${ociimage}" "AI image should be created"
 is "$output" ".*pullPolicy: IfNotPresent" "pullPolicy should exist"
 
-run_ramalama rm oci://${ociimage}
-rm $name.yaml
-done
-stop_registry
+rm $name.yaml
 }

