diff --git a/exo/api/chatgpt_api.py b/exo/api/chatgpt_api.py
index fe8cc5906..9a65deaec 100644
--- a/exo/api/chatgpt_api.py
+++ b/exo/api/chatgpt_api.py
@@ -71,16 +71,9 @@ def generate_completion(
   }
 
   choice = completion["choices"][0]
-  print(f"\nchoice {choice}")
   if object_type.startswith("chat.completion"):
     key_name = "delta" if stream else "message"
-
-    token_decode = tokenizer.batch_decode(
-      tokens,
-      skip_special_tokens=True,
-      clean_up_tokenization_spaces=False
-    )
-    choice[key_name] = {"role": "assistant", "content": token_decode}
+    choice[key_name] = {"role": "assistant", "content": tokenizer.decode(tokens)}
   elif object_type == "text_completion":
     choice["text"] = tokenizer.decode(tokens)
   else:
diff --git a/exo/inference/pytorch/inference.py b/exo/inference/pytorch/inference.py
index 11f8eddb3..676e31620 100644
--- a/exo/inference/pytorch/inference.py
+++ b/exo/inference/pytorch/inference.py
@@ -14,6 +14,7 @@
 from exo.download.hf.hf_shard_download import HFShardDownloader
 
 from transformers import AutoTokenizer
+
 # llama
 from transformers.models.llama.modeling_llama import LlamaModel