Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.
2 changes: 1 addition & 1 deletion examples/offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_quantized_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# Create an LLM.
llm = LLM(
model=model_path,
model="nm-testing/Nous-Hermes-Llama2-13b-smoothquant",
gpu_memory_utilization=0.9,
max_model_len=2048,
quantization="smoothquant",
Expand Down
35 changes: 35 additions & 0 deletions examples/simple_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Smoke-test script: generate one chat completion with a small TinyLlama model.

Usage:
    python simple_test.py --model tinyllama-fp16 [--tensor-parallel-size N]
"""
import argparse
from vllm import LLM, SamplingParams

# Short CLI aliases -> Hugging Face model ids. One fp16 baseline plus
# marlin / gptq / awq quantized variants of the same TinyLlama chat model,
# so each quantization path can be smoke-tested with the same prompt.
MODELS = {
    "tinyllama-fp16": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "tinyllama-marlin": "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin",
    "tinyllama-gptq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "tinyllama-awq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
}

parser = argparse.ArgumentParser(
    description="Generate one chat completion with a small test model.")
parser.add_argument("--model", type=str)
parserser_tp = parser.add_argument("--tensor-parallel-size", type=int, default=1)
args = parser.parse_args()

if args.model not in MODELS:
    # Put the diagnostic in the exception itself (a bare `raise ValueError`
    # after a print loses the message in the traceback).
    raise ValueError(
        f"Got model id of {args.model}; Must be in {list(MODELS.keys())}")
model_id = MODELS[args.model]
print(f"Using model_id = {model_id}")

# Minimal two-turn chat to exercise the model's chat template.
messages = [{
    "role": "system",
    "content": "You are a helpful assistant."
}, {
    "role": "user",
    "content": "What is deep learning?"
}]

model = LLM(
    model_id,
    enforce_eager=True,  # skip CUDA-graph capture for faster startup in a smoke test
    max_model_len=2048,
    tensor_parallel_size=args.tensor_parallel_size,
    dtype="float16",
)
# NOTE(review): `.tokenizer.tokenizer` unwraps vLLM's tokenizer wrapper to reach
# the underlying HF tokenizer's `apply_chat_template` — confirm against the
# vLLM version in use, as this attribute path is internal API.
prompt = model.llm_engine.tokenizer.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True)
out = model.generate(prompt, SamplingParams(max_tokens=50))
print(f"\n-----prompt\n{prompt}")
print(f"\n-----generation\n{out[0].outputs[0].text}")
4 changes: 2 additions & 2 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ def _verify_tokenizer_mode(self) -> None:
self.tokenizer_mode = tokenizer_mode

def _verify_quantization(self) -> None:
supported_quantization = ["awq", "gptq", "squeezellm", "smoothquant"]
rocm_not_supported_quantization = ["awq", "marlin"]
supported_quantization = ["awq", "gptq", "marlin", "squeezellm", "smoothquant"]
rocm_not_supported_quantization = ["awq", "marlin", "smoothquant"]
if self.quantization is not None:
self.quantization = self.quantization.lower()

Expand Down
Loading