install
```bash
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 --no-cache-dir
```
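If pip had previously cached the CPU-only wheel, this install can silently fall back to a build without CUDA (hence `--no-cache-dir`). A minimal sanity check, assuming a recent llama-cpp-python that exposes the llama.cpp C API function `llama_supports_gpu_offload`:

```python
import llama_cpp

# True only if the installed wheel was built with GPU (CUDA) support.
# False means a CPU-only build is installed and n_gpu_layers has no effect.
print(llama_cpp.llama_supports_gpu_offload())
```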
code
```python
from llama_cpp import Llama

model_path = "/root/model/Llama3.1-8B-Chinese-Chat-gguf/Llama3.1-8B-Chinese-Chat.Q3_K_M.gguf"

model_kwargs = {
    "n_ctx": 8192,       # Context length to use
    "n_threads": 4,      # Number of CPU threads to use
    "n_gpu_layers": 20,  # Number of model layers to offload to GPU. Set to 0 if only using CPU
}

llm = Llama(model_path=model_path, **model_kwargs)

generation_kwargs = {
    "max_tokens": 2000,  # Max number of new tokens to generate
    # "stop": ["<|endoftext|>", ""],  # Text sequences to stop generation on
    # "echo": False,     # echo is a create_completion option; create_chat_completion does not accept it
    "top_k": 3,          # Sample only from the 3 highest-probability tokens; set to 1 for greedy decoding
}

def chat(messages):
    res = llm.create_chat_completion(
        messages=messages,
        **generation_kwargs,  # Pass the generation settings (they were defined but never used)
    )
    print(res['choices'][0]['message']['content'])

if __name__ == '__main__':
    while True:
        prompt = input()
        messages = [
            {"role": "user", "content": prompt},
        ]
        chat(messages)
```
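To confirm whether any layers are actually offloaded, it helps to load the model with verbose logging and read the llama.cpp load log. A minimal sketch; the exact wording varies across versions, but a CUDA build should report the detected GPU and a line like "offloaded 20/33 layers to GPU":

```python
from llama_cpp import Llama

# verbose=True (the default) makes llama.cpp print its load log to stderr;
# passing verbose=False would suppress it.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_threads=4,
    n_gpu_layers=20,
    verbose=True,
)
```

If the log reports 0 layers offloaded, or no GPU device at all, the installed wheel is CPU-only and needs to be reinstalled from the CUDA index above.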
Question:
Why isn't the GPU being used, even though `n_gpu_layers` is set to 20?