Add DeepSeek VLM Support #32

Merged · 12 commits · Mar 19, 2024
Changes from all commits
62 changes: 50 additions & 12 deletions Pipes.py
@@ -1,10 +1,12 @@
import os
import logging
from dotenv import load_dotenv
from ezlocalai.LLM import LLM
from ezlocalai.LLM import LLM, is_vision_model
from ezlocalai.STT import STT
from ezlocalai.CTTS import CTTS
from pyngrok import ngrok
import requests
import base64

try:
from ezlocalai.IMG import IMG
@@ -13,10 +15,23 @@
except ImportError:
img_import_success = False

from ezlocalai.VLM import VLM


class Pipes:
def __init__(self):
load_dotenv()
self.current_vlm = os.getenv("VISION_MODEL", "")
logging.info(f"[VLM] {self.current_vlm} model loading. Please wait...")
self.vlm = None
if self.current_vlm != "":
try:
self.vlm = VLM(model=self.current_vlm)
except Exception as e:
logging.error(f"[VLM] Failed to load the model: {e}")
self.vlm = None
if self.vlm is not None:
logging.info(f"[ezlocalai] Vision is enabled.")
self.img_enabled = os.getenv("IMG_ENABLED", "false").lower() == "true"
self.img = None
if self.img_enabled and img_import_success:
@@ -38,14 +53,15 @@ def __init__(self):
logging.info(f"[STT] {self.current_stt} model loaded successfully.")
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "phi-2-dpo")
self.current_llm = DEFAULT_MODEL if DEFAULT_MODEL else "phi-2-dpo"
logging.info(f"[LLM] {self.current_llm} model loading. Please wait...")
self.llm = LLM(model=self.current_llm)
logging.info(f"[LLM] {self.current_llm} model loaded successfully.")
self.current_vlm = os.getenv("VISION_MODEL", "")
self.vlm = None
if self.current_vlm != "":
self.vlm = LLM(model=self.current_vlm) # bakllava-1-7b
logging.info(f"[ezlocalai] Vision is enabled.")
if self.vlm is not None:
self.llm = self.vlm
else:
logging.info(f"[LLM] {self.current_llm} model loading. Please wait...")
self.llm = LLM(model=self.current_llm)
if is_vision_model(self.current_llm):
if self.vlm is None:
self.vlm = self.llm
logging.info(f"[LLM] {self.current_llm} model loaded successfully.")
NGROK_TOKEN = os.environ.get("NGROK_TOKEN", "")
if NGROK_TOKEN:
ngrok.set_auth_token(NGROK_TOKEN)
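
For context, a rough sketch of how the reworked loading order in `__init__` behaves from the caller's side. The model names and the direct `Pipes` import are illustrative, not prescribed by this PR:

```python
import os

# Placeholder model names; use whatever your ezlocalai install actually supports.
os.environ["VISION_MODEL"] = "deepseek-vl-1.3b-chat"
os.environ["DEFAULT_MODEL"] = "phi-2-dpo"

from Pipes import Pipes

pipes = Pipes()
# If VISION_MODEL loads, pipes.vlm holds the VLM and pipes.llm aliases it, so one
# model serves both text and vision requests. If VISION_MODEL is empty or fails to
# load, DEFAULT_MODEL is loaded as a plain LLM, and pipes.vlm is only set when
# is_vision_model(DEFAULT_MODEL) reports that the default model itself is vision-capable.
```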
@@ -57,7 +73,30 @@ def __init__(self):

async def get_response(self, data, completion_type="chat"):
data["local_uri"] = self.local_uri

if "messages" in data:
if isinstance(data["messages"][-1]["content"], list):
messages = data["messages"][-1]["content"]
for message in messages:
if "text" in message:
prompt = message["text"]
for message in messages:
if "audio_url" in message:
audio_url = (
message["audio_url"]["url"]
if "url" in message["audio_url"]
else message["audio_url"]
)
audio_format = "wav"
if audio_url.startswith("data:"):
audio_url = audio_url.split(",")[1]
audio_format = audio_url.split(";")[0]
else:
audio_url = requests.get(audio_url).content
audio_url = base64.b64encode(audio_url).decode("utf-8")
transcribed_audio = self.stt.transcribe_audio(
base64_audio=audio_url, audio_format=audio_format
)
prompt = f"Transcribed Audio: {transcribed_audio}\n\n{prompt}"
if data["model"]:
if self.current_llm != data["model"]:
data["model"] = self.current_llm
@@ -99,7 +138,7 @@ async def get_response(self, data, completion_type="chat"):
if completion_type != "chat"
else response["choices"][0]["message"]["content"]
)
img_gen_prompt = f"Users message: {user_message} \nAssistant response: {response_text} \n\n**The assistant is acting as a decision maker for creating stable diffusion images and only responds with a concise YES or NO answer on if it would make sense to generate an image based on the users message. No other explanation is needed!**\nShould an image be created to accompany the assistant response?\nAssistant: "
img_gen_prompt = f"Users message: {user_message} \nAssistant response: {response_text} \n\n**The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nShould an image be created to accompany the assistant response?\nAssistant: "
logging.info(f"[IMG] Decision maker prompt: {img_gen_prompt}")
create_img = self.llm.chat(
messages=[{"role": "system", "content": img_gen_prompt}],
@@ -110,7 +149,6 @@
create_img = str(create_img["choices"][0]["message"]["content"]).lower()
logging.info(f"[IMG] Decision maker response: {create_img}")
if "yes" in create_img or "es," in create_img:

prompt = (
data["messages"][-1]["content"]
if completion_type == "chat"
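The image-generation gate above boils down to a closed yes/no question posed to the LLM. A standalone sketch of that pattern; the helper name and `max_tokens` value are illustrative, and the rest of the call site is collapsed in this view:

```python
def should_generate_image(llm, user_message: str, response_text: str) -> bool:
    gate_prompt = (
        f"Users message: {user_message} \nAssistant response: {response_text} \n\n"
        "**The assistant is acting as sentiment analysis expert and only responds with a "
        "concise YES or NO answer on if the user would like an image as visual or a "
        "picture generated. No other explanation is needed!**\n"
        "Should an image be created to accompany the assistant response?\nAssistant: "
    )
    decision = llm.chat(
        messages=[{"role": "system", "content": gate_prompt}],
        max_tokens=10,  # assumption: only a short YES/NO answer is needed
    )
    answer = str(decision["choices"][0]["message"]["content"]).lower()
    return "yes" in answer or "es," in answer
```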
4 changes: 4 additions & 0 deletions cuda.Dockerfile
@@ -15,6 +15,10 @@ COPY cuda-requirements.txt .
RUN python3 -m pip install --upgrade pip cmake scikit-build setuptools wheel --no-cache-dir && \
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.55 --no-cache-dir && \
pip install --no-cache-dir -r cuda-requirements.txt
RUN git clone https://github.com/Josh-XT/DeepSeek-VL deepseek && \
cd deepseek && \
pip install --no-cache-dir -e . && \
cd ..
COPY . .
EXPOSE 8091
ENTRYPOINT ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8091", "--workers", "1", "--proxy-headers"]
158 changes: 22 additions & 136 deletions ezlocalai/LLM.py
@@ -2,10 +2,7 @@
from bs4 import BeautifulSoup
from typing import List, Optional, Dict
import os
import re
import requests
import tiktoken
import json
import psutil
import torch
import logging
@@ -72,62 +69,6 @@ def get_model_url(model_name=""):
return model_url


def get_tokens(text: str) -> int:
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(text))
return int(num_tokens)


def get_model_name(model_url="TheBloke/phi-2-dpo-GGUF"):
model_name = model_url.split("/")[-1].replace("-GGUF", "").lower()
return model_name


def get_readme(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
model_url = get_model_url(model_name=model_name)
model_name = model_name.lower()
if not os.path.exists(f"{models_dir}/{model_name}/README.md"):
readme_url = f"https://huggingface.co/{model_url}/raw/main/README.md"
with requests.get(readme_url, stream=True, allow_redirects=True) as r:
with open(f"{models_dir}/{model_name}/README.md", "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
with open(f"{models_dir}/{model_name}/README.md", "r", encoding="utf-8") as f:
readme = f.read()
return readme


def get_prompt(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
model_name = model_name.lower()
if os.path.exists(f"{models_dir}/{model_name}/prompt.txt"):
with open(f"{models_dir}/{model_name}/prompt.txt", "r") as f:
prompt_template = f.read()
return prompt_template
readme = get_readme(model_name=model_name, models_dir=models_dir)
try:
prompt_template = readme.split("prompt_template: '")[1].split("'")[0]
except:
prompt_template = ""
if prompt_template == "":
prompt_template = (
"## SYSTEM\n{system_message}\n## USER\n{prompt}\n## ASSISTANT\n"
)
if "{system_message}" not in prompt_template:
if "<|system|>" in prompt_template:
prompt_template = prompt_template.replace(
"<|system|>", "<|system|>\n{system_message}"
)
else:
prompt_template = "## SYSTEM\n{system_message}\n" + prompt_template
return prompt_template


def download_llm(model_name="", models_dir="models"):
if model_name != "":
global DEFAULT_MODEL
@@ -191,39 +132,6 @@ def get_clip_path(model_name="", models_dir="models"):
return ""


def custom_format(string, **kwargs):
if isinstance(string, list):
string = "".join(str(x) for x in string)

def replace(match):
key = match.group(1)
value = kwargs.get(key, match.group(0))
if isinstance(value, list):
return "".join(str(x) for x in value)
else:
return str(value)

pattern = r"(?<!{){([^{}\n]+)}(?!})"
result = re.sub(pattern, replace, string)
return result


def custom_format_prompt(prompt, prompt_template, system_message=""):
formatted_prompt = custom_format(
string=prompt_template, prompt=prompt, system_message=system_message
)
return formatted_prompt


async def streaming_generation(data):
yield "data: {}\n".format(json.dumps(data))
for line in data.iter_lines():
if line:
decoded_line = line.decode("utf-8")
current_data = json.loads(decoded_line[6:])
yield "data: {}\n".format(json.dumps(current_data))


def clean(
message: str = "",
stop_tokens: List[str] = [
@@ -234,6 +142,7 @@ def clean(
"<s>",
"User:",
"### \n###",
"[/INST]",
],
):
if message == "":
@@ -298,9 +207,6 @@ def __init__(
self.params["max_tokens"] = 4096
else:
self.params["max_tokens"] = max_tokens
self.prompt_template = get_prompt(
model_name=self.model_name, models_dir=models_dir
)
if is_vision_model(model_name=self.model_name):
clip_path = get_clip_path(
model_name=self.model_name, models_dir=models_dir
@@ -312,8 +218,7 @@
else:
self.params["model_path"] = ""
self.params["max_tokens"] = 8192
self.prompt_template = "{system_message}\n\n{prompt}"
self.params["n_ctx"] = int(os.environ.get("LLM_MAX_TOKENS", 0))
self.params["n_ctx"] = int(os.environ.get("LLM_MAX_TOKENS", 4096))
self.params["verbose"] = True
self.system_message = system_message
self.params["mirostat_mode"] = 2
@@ -326,6 +231,7 @@ def __init__(self):
"<s>",
"User:",
"### \n###",
"[/INST]",
]
if stop != []:
if isinstance(stop, str):
@@ -359,16 +265,19 @@ def __init__(
else:
self.params["n_batch"] = 1024
if self.model_name != "":
logging.info(f"[LLM] Parameters: {self.params}")
self.lcpp = Llama(**self.params, embedding=True, chat_handler=chat_handler)
self.lcpp = Llama(
**self.params,
embedding=True,
chat_handler=chat_handler,
logits_all=True if chat_handler else False,
)
else:
self.lcpp = None
self.model_list = get_models()
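
For context on the `chat_handler` / `logits_all` arguments above, this mirrors the standard llama-cpp-python multimodal setup, roughly as follows (paths and model names are placeholders):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

chat_handler = Llava15ChatHandler(clip_model_path="models/bakllava-1-7b/mmproj-model-f16.gguf")
llm = Llama(
    model_path="models/bakllava-1-7b/bakllava-1-7b.Q4_K_M.gguf",
    chat_handler=chat_handler,
    logits_all=True,  # needed for llava-style chat handlers to work
    n_ctx=4096,
)
```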

def generate(
self,
prompt,
format_prompt: bool = True,
max_tokens=None,
temperature=None,
top_p=None,
@@ -405,16 +314,8 @@ def generate(
"content": prompt,
},
)
if format_prompt:
formatted_prompt = custom_format_prompt(
prompt=prompt,
prompt_template=self.prompt_template,
system_message=(
self.system_message if system_message is None else system_message
),
)
data = self.lcpp.create_completion(
prompt=formatted_prompt if format_prompt else prompt,
data = self.lcpp.create_chat_completion(
messages=messages,
max_tokens=(
self.params["max_tokens"] if max_tokens is None else int(max_tokens)
),
@@ -449,37 +350,22 @@ def generate(
data["model"] = self.model_name
return data

def completion(self, prompt, format_prompt: bool = True, **kwargs):
data = self.generate(prompt=prompt, format_prompt=format_prompt, **kwargs)
data["choices"][0]["text"] = clean(
message=data["choices"][0]["text"], stop_tokens=self.params["stop"]
def completion(self, prompt, **kwargs):
data = self.generate(prompt=prompt, **kwargs)
data["choices"][0]["message"]["content"] = clean(
message=data["choices"][0]["message"]["content"],
stop_tokens=self.params["stop"],
)
data["choices"][0]["text"] = data["choices"][0]["message"]["content"]
return data

def chat(self, messages, **kwargs):
prompt = ""
if len(messages) > 1:
for message in messages:
if message["role"] == "system":
kwargs["system_message"] = message["content"]
elif message["role"] == "user":
prompt += f"USER: {message['content']}"
elif message["role"] == "assistant":
prompt += f"ASSISTANT: {message['content']}"
prompt += "\n"
else:
try:
prompt = messages[0]["content"]
except:
prompt = str(messages)
data = self.generate(prompt=prompt, **kwargs)
messages = [{"role": "user", "content": prompt}]
message = clean(
message=data["choices"][0]["text"], stop_tokens=self.params["stop"]
user_input = messages[-1]["content"]
data = self.generate(prompt=user_input, **kwargs)
data["choices"][0]["message"]["content"] = clean(
message=data["choices"][0]["message"]["content"],
stop_tokens=self.params["stop"],
)
data["choices"][0]["message"] = {"content": message}
messages.append({"role": "assistant", "content": message})
data["messages"] = messages
return data

def embedding(self, input):
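With `generate` now routing through `create_chat_completion`, both public entry points return chat-shaped choices, and `completion` additionally mirrors the content back into the legacy `text` field. A usage sketch (the model name is a placeholder):

```python
from ezlocalai.LLM import LLM

llm = LLM(model="phi-2-dpo")

chat_out = llm.chat(messages=[{"role": "user", "content": "Say hello."}])
print(chat_out["choices"][0]["message"]["content"])

text_out = llm.completion(prompt="Say hello.")
print(text_out["choices"][0]["text"])  # same content, kept for completion-style callers
```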