Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.
2 changes: 1 addition & 1 deletion examples/offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_quantized_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# Create an LLM.
llm = LLM(
model=model_path,
model="nm-testing/Nous-Hermes-Llama2-13b-smoothquant",
gpu_memory_utilization=0.9,
max_model_len=2048,
quantization="smoothquant",
Expand Down
35 changes: 35 additions & 0 deletions examples/simple_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Smoke-test script: generate one chat completion with a small TinyLlama model.

Usage:
    python simple_test.py --model tinyllama-fp16 [--tensor-parallel-size N]
"""
import argparse
from vllm import LLM, SamplingParams

# Short CLI aliases -> Hugging Face model ids. One fp16 baseline plus
# marlin / gptq / awq quantized variants of the same TinyLlama chat model,
# so each quantization path can be smoke-tested with the same prompt.
MODELS = {
    "tinyllama-fp16": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "tinyllama-marlin": "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin",
    "tinyllama-gptq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    "tinyllama-awq": "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
}

parser = argparse.ArgumentParser(
    description="Generate one chat completion with a small test model.")
parser.add_argument("--model", type=str)
parserser_tp = parser.add_argument("--tensor-parallel-size", type=int, default=1)
args = parser.parse_args()

if args.model not in MODELS:
    # Put the diagnostic in the exception itself (a bare `raise ValueError`
    # after a print loses the message in the traceback).
    raise ValueError(
        f"Got model id of {args.model}; Must be in {list(MODELS.keys())}")
model_id = MODELS[args.model]
print(f"Using model_id = {model_id}")

# Minimal two-turn chat to exercise the model's chat template.
messages = [{
    "role": "system",
    "content": "You are a helpful assistant."
}, {
    "role": "user",
    "content": "What is deep learning?"
}]

model = LLM(
    model_id,
    enforce_eager=True,  # skip CUDA-graph capture for faster startup in a smoke test
    max_model_len=2048,
    tensor_parallel_size=args.tensor_parallel_size,
    dtype="float16",
)
# NOTE(review): `.tokenizer.tokenizer` unwraps vLLM's tokenizer wrapper to reach
# the underlying HF tokenizer's `apply_chat_template` — confirm against the
# vLLM version in use, as this attribute path is internal API.
prompt = model.llm_engine.tokenizer.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True)
out = model.generate(prompt, SamplingParams(max_tokens=50))
print(f"\n-----prompt\n{prompt}")
print(f"\n-----generation\n{out[0].outputs[0].text}")
4 changes: 2 additions & 2 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ def _verify_tokenizer_mode(self) -> None:
self.tokenizer_mode = tokenizer_mode

def _verify_quantization(self) -> None:
supported_quantization = ["awq", "gptq", "squeezellm", "smoothquant"]
rocm_not_supported_quantization = ["awq", "marlin"]
supported_quantization = ["awq", "gptq", "marlin", "squeezellm", "smoothquant"]
rocm_not_supported_quantization = ["awq", "marlin", "smoothquant"]
if self.quantization is not None:
self.quantization = self.quantization.lower()

Expand Down
Loading