42 changes: 42 additions & 0 deletions packages/tasks/src/local-apps.ts
@@ -63,6 +63,41 @@ LLAMA_CURL=1 make
];
};

const snippetVllm = (model: ModelData): string[] => {
	return [
		`
## Deploy with Docker (Docker must be installed); for a gated model, first request access in the Hugging Face model repo:
docker run --runtime nvidia --gpus all \\
    --name my_vllm_container \\
    -v ~/.cache/huggingface:/root/.cache/huggingface \\
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \\
    -p 8000:8000 \\
    --ipc=host \\
    vllm/vllm-openai:latest \\
    --model ${model.id}
`,
		`
## Load and run the model
docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123"
`,
		`
## Call the server using the official OpenAI Python client library, or any other HTTP client
from openai import OpenAI
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)
completion = client.chat.completions.create(
    model="${model.id}",
    messages=[
        {"role": "user", "content": "Hello!"}
    ]
)
print(completion.choices[0].message)
`,
	];
};

/**
* Add your new local app here.
*
@@ -82,6 +117,13 @@ export const LOCAL_APPS = {
displayOnModelPage: isGgufModel,
snippet: snippetLlamacpp,
},
"vllm": {
prettyLabel: "vLLM",
docsUrl: "https://docs.vllm.ai",
mainTask: "text-generation",
displayOnModelPage: isGptqModel && isAwqModel,
Member

how would you define those methods?

Contributor Author

> how would you define those methods?

In fact, the suggested vLLM method deploys the non-quantized version from the Hugging Face repository. All existing "text-generation" examples in the code are GGUF. Any suggestions?

Thank you for the PR! Concretely, we support a set of architectures, which is readable from the model data:

architectures?: string[];

https://github.com/vllm-project/vllm/blob/757b62c49560baa6f294310a53032348a0d95939/vllm/model_executor/models/__init__.py#L13-L63

And for the quantization method, we can read config.quantization_config.quant_method; we support awq, gptq, aqlm, and marlin:

https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json#L28
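Based on this, a minimal sketch of what the vLLM displayOnModelPage check could look like (the helper name, the hard-coded lists, and the config field paths are assumptions drawn from the comment above, not the final implementation):

// Hypothetical sketch only; names and field paths follow the discussion above.
const VLLM_SUPPORTED_QUANT_METHODS = ["awq", "gptq", "aqlm", "marlin"];
// Subset shown for illustration; the full list lives in vllm/model_executor/models/__init__.py (linked above).
const VLLM_SUPPORTED_ARCHITECTURES = ["LlamaForCausalLM", "MistralForCausalLM", "GemmaForCausalLM"];

const isVllmModel = (model: ModelData): boolean => {
	const quantMethod = model.config?.quantization_config?.quant_method;
	if (quantMethod) {
		// Quantized checkpoints: only awq, gptq, aqlm and marlin are supported.
		return VLLM_SUPPORTED_QUANT_METHODS.includes(quantMethod);
	}
	// Non-quantized checkpoints: check the architecture against vLLM's supported set.
	return (model.config?.architectures ?? []).some((arch) => VLLM_SUPPORTED_ARCHITECTURES.includes(arch));
};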

Member

awesome @simon-mo, super clear!

Member
@julien-c julien-c May 22, 2024

i've pushed 2123430 on this PR to type config.quantization_config.quant_method which we now parse & pass from the Hub
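The added typing presumably looks roughly like the following (a sketch inferred from the quant methods discussed above, not the exact contents of 2123430):

// Rough sketch only; the actual typing is whatever 2123430 defines.
interface QuantizationConfig {
	quant_method?: "awq" | "gptq" | "aqlm" | "marlin";
}

interface TransformersConfig {
	architectures?: string[];
	quantization_config?: QuantizationConfig;
}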

Contributor Author

> i've pushed 2123430 on this PR to type config.quantization_config.quant_method which we now parse & pass from the Hub

I made some changes; I need your help to review them.

snippet: snippetVllm,
},
lmstudio: {
prettyLabel: "LM Studio",
docsUrl: "https://lmstudio.ai",