From e5a332785c1e3f84b6da7bccc24a68cc0b83d222 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 00:17:12 +0800 Subject: [PATCH 1/9] feat: add deepseek-OCR transformers/vllm Signed-off-by: weedge --- deploy/modal/src/download_models.py | 2 +- .../src/llm/transformers/vlm/ocr_deepseek.py | 204 ++++++++++++++ deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py | 265 ++++++++++++++++++ 3 files changed, 470 insertions(+), 1 deletion(-) create mode 100644 deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py create mode 100644 deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py diff --git a/deploy/modal/src/download_models.py b/deploy/modal/src/download_models.py index 7106448c..44b213a0 100644 --- a/deploy/modal/src/download_models.py +++ b/deploy/modal/src/download_models.py @@ -50,7 +50,7 @@ def download_ckpt( local_dir = os.path.join(HF_MODEL_DIR, repo_id) else: local_dir = os.path.join(HF_MODEL_DIR, local_dir) - print(f"{repo_id} model downloading, {ignore_patterns=}") + print(f"{repo_id} model downloading, {ignore_patterns=} {revision=}") snapshot_download( repo_id=repo_id, revision=revision, diff --git a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py new file mode 100644 index 00000000..4f9c08b3 --- /dev/null +++ b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py @@ -0,0 +1,204 @@ +import os +import sys +import asyncio +import subprocess +from pathlib import Path +from threading import Thread + + +import modal + + +app = modal.App("deepseek-ocr") +IMAGE_GPU = os.getenv("IMAGE_GPU", None) +img = ( + # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags + modal.Image.from_registry( + "nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04", + add_python="3.10", + ) + .apt_install("git", "git-lfs") + .run_commands( + "git clone https://github.com/deepseek-ai/DeepSeek-OCR.git", + "cd /DeepSeek-OCR && pip install -r requirements.txt", + ) + .pip_install( + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", + "wheel", + "matplotlib", + "accelerate>=0.26.0", # for device_map="auto" model loading with safetensors slipt + ) + # .apt_install("clang", "cmake", "ninja-build") + # https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1 for torch 2.6.0 + .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation") + .env( + { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "ACHATBOT_PKG": "1", + "CUDA_VISIBLE_DEVICES": "0", + "LLM_MODEL": os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR"), + } + ) +) + +# img = img.pip_install( +# f"achatbot==0.0.25.dev122", +# extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), +# ) + + +HF_MODEL_DIR = "/root/.achatbot/models" +hf_model_vol = modal.Volume.from_name("models", create_if_missing=True) +ASSETS_DIR = "/root/.achatbot/assets" +assets_vol = modal.Volume.from_name("assets", create_if_missing=True) +CONFIG_DIR = "/root/.achatbot/config" +config_vol = modal.Volume.from_name("config", create_if_missing=True) + +TORCH_CACHE_DIR = "/root/.cache/torch" +torch_cache_vol = modal.Volume.from_name("torch_cache", create_if_missing=True) + + +with img.imports(): + from queue import Queue + + import torch + from transformers.generation.streamers import TextStreamer + from transformers import AutoModel, AutoTokenizer + + MODEL_ID = os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR") + MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID) + DEEPSEEK_ASSETS_DIR = os.path.join(ASSETS_DIR, "DeepSeek") + + # 
torch.set_float32_matmul_precision("high") + + +def print_model_params(model: torch.nn.Module, extra_info="", f=None): + # print the number of parameters in the model + model_million_params = sum(p.numel() for p in model.parameters()) / 1e6 + print(model, file=f) + print(f"{extra_info} {model_million_params} M parameters", file=f) + + +@app.function( + gpu=IMAGE_GPU, + cpu=2.0, + retries=0, + image=img, + secrets=[modal.Secret.from_name("achatbot")], + volumes={ + HF_MODEL_DIR: hf_model_vol, + ASSETS_DIR: assets_vol, + }, + timeout=1200, # default 300s + scaledown_window=1200, + max_containers=1, +) +async def run(func, **kwargs): + os.makedirs(DEEPSEEK_ASSETS_DIR, exist_ok=True) + subprocess.run("nvidia-smi --version", shell=True) + subprocess.run("nvcc --version", shell=True) + gpu_prop = None + if torch.cuda.is_available(): + gpu_prop = torch.cuda.get_device_properties("cuda") + print(gpu_prop) + + if asyncio.iscoroutinefunction(func): + await func(**kwargs) + else: + func(**kwargs) + + +def dump_model(**kwargs): + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + print("tokenizer.eos_token_id", tokenizer.eos_token_id) + print("Tokenizer:", tokenizer) + model = AutoModel.from_pretrained( + MODEL_PATH, + _attn_implementation="flash_attention_2", + trust_remote_code=True, + use_safetensors=True, + torch_dtype=torch.bfloat16, + device_map="cuda" if torch.cuda.is_available() else "auto", # need accelerate>=0.26.0 + ) + model = model.eval() + + print_model_params(model, extra_info="DeepSeek-OCR", f=sys.stdout) + + +def infer(**kwargs): + class NoEOSTextStreamer(TextStreamer): + def on_finalized_text(self, text: str, stream_end: bool = False): + stream_end and print("stream_end is True", flush=True) + eos_text = self.tokenizer.decode( + [self.tokenizer.eos_token_id], skip_special_tokens=False + ) + text = text.replace(eos_text, "\n") + print(text, flush=True, end="") + + model = AutoModel.from_pretrained( + MODEL_PATH, + _attn_implementation="flash_attention_2", + trust_remote_code=True, + use_safetensors=True, + ) + model = model.eval().cuda().to(torch.bfloat16) + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + # prompt = "\nFree OCR. " + prompt = "\n<|grounding|>Convert the document to markdown. 
" + image_files = [ + "/DeepSeek-OCR/assets/fig1.png", + # use ORC detected Show pictures, detect again :) + # "/DeepSeek-OCR/assets/show1.jpg", + # "/DeepSeek-OCR/assets/show2.jpg", + # "/DeepSeek-OCR/assets/show3.jpg", + # "/DeepSeek-OCR/assets/show4.jpg", + ] + + # Tiny: base_size = 512, image_size = 512, crop_mode = False + # Small: base_size = 640, image_size = 640, crop_mode = False + # Base: base_size = 1024, image_size = 1024, crop_mode = False + # Large: base_size = 1280, image_size = 1280, crop_mode = False + # Gundam: base_size = 1024, image_size = 640, crop_mode = True # default + + for image_file in image_files: + print("infer image_file:", image_file) + # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/main/modeling_deepseekocr.py#L703 + res = model.infer( + tokenizer, + prompt=prompt, + image_file=image_file, + output_path=DEEPSEEK_ASSETS_DIR, + base_size=1024, + image_size=640, + crop_mode=True, + save_results=True, + test_compress=True, + eval_mode=False, + streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False), + ) + print("infer result:", res) + + +""" +modal run src/download_models.py --repo-ids "deepseek-ai/DeepSeek-OCR" --revision "refs/pr/23" + +IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task dump_model +IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek..py --task infer +""" + + +@app.local_entrypoint() +def main(task: str = "dump_model"): + tasks = { + "dump_model": dump_model, + "infer": infer, + } + if task not in tasks: + raise ValueError(f"task {task} not found") + print(f"running task {task}") + run.remote( + tasks[task], + ) diff --git a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py new file mode 100644 index 00000000..26448fcb --- /dev/null +++ b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py @@ -0,0 +1,265 @@ +import os +import sys +import time +import asyncio +import subprocess +from pathlib import Path +from threading import Thread + + +import modal + +BACKEND = os.getenv("BACKEND", "") +APP_NAME = os.getenv("APP_NAME", "") +TP = os.getenv("TP", "1") +PROFILE_DIR = "/root/vllm_profile" + +app = modal.App("vllm-deepseek-ocr") +IMAGE_GPU = os.getenv("IMAGE_GPU", None) +vllm_image = ( + # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags + modal.Image.from_registry( + "nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04", + add_python="3.10", + ) + .apt_install("git", "git-lfs") + .run_commands( + "git clone https://github.com/deepseek-ai/DeepSeek-OCR.git", + "cd /DeepSeek-OCR && pip install -r requirements.txt", + ) + .pip_install( + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", + "wheel", + "matplotlib", + "accelerate>=0.26.0", # for device_map="auto" model loading with safetensors slipt + ) + # .apt_install("clang", "cmake", "ninja-build") + # https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1 for torch 2.6.0 + .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation") + .pip_install("vllm==v0.8.5", extra_index_url="https://download.pytorch.org/whl/cu126") + .env( + { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "ACHATBOT_PKG": "1", + "CUDA_VISIBLE_DEVICES": "0", + "LLM_MODEL": os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR"), + "VLLM_USE_V1": "0", + "VLLM_TORCH_PROFILER_DIR": PROFILE_DIR, + "TP": TP, + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "TORCH_CUDA_ARCH_LIST": "8.0 8.9 9.0+PTX", + } + ) +) + +if BACKEND == "flashinfer": + vllm_image = vllm_image.pip_install( 
+ f"flashinfer-python==0.2.2.post1", # FlashInfer 0.2.3+ does not support per-request generators + extra_index_url="https://flashinfer.ai/whl/cu126/torch2.6", + ) + +if APP_NAME == "achatbot": + vllm_image = vllm_image.pip_install( + f"achatbot==0.0.28", + extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"), + ) + +# img = img.pip_install( +# f"achatbot==0.0.25.dev122", +# extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), +# ) + + +HF_MODEL_DIR = "/root/.achatbot/models" +hf_model_vol = modal.Volume.from_name("models", create_if_missing=True) +ASSETS_DIR = "/root/.achatbot/assets" +assets_vol = modal.Volume.from_name("assets", create_if_missing=True) +CONFIG_DIR = "/root/.achatbot/config" +config_vol = modal.Volume.from_name("config", create_if_missing=True) + +VLLM_CACHE_DIR = "/root/.cache/vllm" +vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True) +vllm_profile = modal.Volume.from_name("vllm_profile", create_if_missing=True) + + +with vllm_image.imports(): + sys.path.insert(0, "/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm") + + import torch + from tqdm import tqdm + from vllm import LLM, AsyncLLMEngine, AsyncEngineArgs, SamplingParams + + MODEL_ID = os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR") + MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID) + DEEPSEEK_ASSETS_DIR = os.path.join(ASSETS_DIR, "DeepSeek-OCR-vllm") + + import config + + config.OUTPUT_PATH = DEEPSEEK_ASSETS_DIR + + from process.image_process import DeepseekOCRProcessor + from run_dpsk_ocr_image import load_image, re_match, process_image_with_refs, stream_generate + + # torch.set_float32_matmul_precision("high") + + +@app.function( + gpu=IMAGE_GPU, + cpu=2.0, + retries=0, + image=vllm_image, + secrets=[modal.Secret.from_name("achatbot")], + volumes={ + HF_MODEL_DIR: hf_model_vol, + ASSETS_DIR: assets_vol, + VLLM_CACHE_DIR: vllm_cache_vol, + }, + timeout=1200, # default 300s + scaledown_window=1200, + max_containers=1, +) +async def run(func, **kwargs): + os.makedirs(DEEPSEEK_ASSETS_DIR, exist_ok=True) + os.makedirs(f"{DEEPSEEK_ASSETS_DIR}/images", exist_ok=True) + subprocess.run("nvidia-smi --version", shell=True) + subprocess.run("nvcc --version", shell=True) + gpu_prop = None + if torch.cuda.is_available(): + gpu_prop = torch.cuda.get_device_properties("cuda") + print(gpu_prop) + + if asyncio.iscoroutinefunction(func): + await func(**kwargs) + else: + func(**kwargs) + + +async def stream_infer(**kwargs): + CROP_MODE = True + + image = load_image("/DeepSeek-OCR/assets/fig1.png").convert("RGB") + + PROMPT = "\n<|grounding|>Convert the document to markdown." 
+ if "" in PROMPT: + image_features = DeepseekOCRProcessor().tokenize_with_images( + images=[image], bos=True, eos=True, cropping=CROP_MODE + ) + else: + image_features = "" + + prompt = PROMPT + + result_out = await stream_generate(image_features, prompt) + + save_results = 1 + + if save_results and "" in prompt: + print("=" * 15 + "save results:" + "=" * 15) + + image_draw = image.copy() + + outputs = result_out + + with open(f"{DEEPSEEK_ASSETS_DIR}/result_ori.mmd", "w", encoding="utf-8") as afile: + afile.write(outputs) + + matches_ref, matches_images, mathes_other = re_match(outputs) + # print(matches_ref) + # save images with boxes + result = process_image_with_refs(image_draw, matches_ref) + + for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")): + outputs = outputs.replace(a_match_image, f"![](images/" + str(idx) + ".jpg)\n") + + for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")): + outputs = ( + outputs.replace(a_match_other, "") + .replace("\\coloneqq", ":=") + .replace("\\eqqcolon", "=:") + ) + + # if 'structural formula' in conversation[0]['content']: + # outputs = '' + outputs + '' + with open(f"{DEEPSEEK_ASSETS_DIR}/result.mmd", "w", encoding="utf-8") as afile: + afile.write(outputs) + + if "line_type" in outputs: + import matplotlib.pyplot as plt + from matplotlib.patches import Circle + + lines = eval(outputs)["Line"]["line"] + + line_type = eval(outputs)["Line"]["line_type"] + # print(lines) + + endpoints = eval(outputs)["Line"]["line_endpoint"] + + fig, ax = plt.subplots(figsize=(3, 3), dpi=200) + ax.set_xlim(-15, 15) + ax.set_ylim(-15, 15) + + for idx, line in enumerate(lines): + try: + p0 = eval(line.split(" -- ")[0]) + p1 = eval(line.split(" -- ")[-1]) + + if line_type[idx] == "--": + ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color="k") + else: + ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color="k") + + ax.scatter(p0[0], p0[1], s=5, color="k") + ax.scatter(p1[0], p1[1], s=5, color="k") + except: + pass + + for endpoint in endpoints: + label = endpoint.split(": ")[0] + (x, y) = eval(endpoint.split(": ")[1]) + ax.annotate( + label, + (x, y), + xytext=(1, 1), + textcoords="offset points", + fontsize=5, + fontweight="light", + ) + + try: + if "Circle" in eval(outputs).keys(): + circle_centers = eval(outputs)["Circle"]["circle_center"] + radius = eval(outputs)["Circle"]["radius"] + + for center, r in zip(circle_centers, radius): + center = eval(center.split(": ")[1]) + circle = Circle( + center, radius=r, fill=False, edgecolor="black", linewidth=0.8 + ) + ax.add_patch(circle) + except Exception: + pass + + plt.savefig(f"{DEEPSEEK_ASSETS_DIR}/geo.jpg") + plt.close() + + result.save(f"{DEEPSEEK_ASSETS_DIR}/result_with_boxes.jpg") + + +""" +IMAGE_GPU=L40s modal run src/llm/vllm/vlm/ocr_deepseek.py --task stream_infer +""" + + +@app.local_entrypoint() +def main(task: str = "stream_infer"): + tasks = { + "stream_infer": stream_infer, + } + if task not in tasks: + raise ValueError(f"task {task} not found") + print(f"running task {task}") + run.remote( + tasks[task], + ) From 25c57b9854e72fe0a2f45548b68ca6edc52db892 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 17:31:21 +0800 Subject: [PATCH 2/9] add llm_transformers_manual_vision_deepseek_ocr and deploy deepseek_ocr vision ocr bot with fastapi_webrtc_vision_bot_serve Signed-off-by: weedge --- .../src/fastapi_webrtc_vision_bot_serve.py | 32 ++- .../src/llm/transformers/vlm/ocr_deepseek.py | 228 ++++++++++++++++-- pyproject.toml | 15 +- 
src/cmd/bots/vision/agora_ocr_vision_bot.py | 2 +- src/cmd/bots/vision/daily_ocr_vision_bot.py | 2 +- src/cmd/bots/vision/livekit_ocr_vision_bot.py | 2 +- src/common/interface.py | 10 +- src/core/llm/__init__.py | 3 + .../manual_vision_ocr_deepseek.py | 182 ++++++++++++++ src/modules/vision/ocr/__init__.py | 6 + .../vision/ocr/test_transformers_got.py | 2 +- 11 files changed, 447 insertions(+), 37 deletions(-) create mode 100644 src/core/llm/transformers/manual_vision_ocr_deepseek.py diff --git a/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py b/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py index 192e4839..997972bc 100644 --- a/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py +++ b/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py @@ -315,6 +315,22 @@ class ContainerRuntimeConfig: } ) ), + "deepseek_ocr": ( + vision_bot_img.pip_install( + [ + f"achatbot[llm_transformers_manual_vision_deepseek_ocr]=={achatbot_version}", + ], + extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"), + ) + .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation") + .pip_install("onnxruntime") + .env( + { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "LLM_MODEL_NAME_OR_PATH": f"/root/.achatbot/models/{os.getenv('LLM_MODEL_NAME_OR_PATH', 'deepseek-ai/DeepSeek-OCR')}", + } + ) + ), } @staticmethod @@ -461,14 +477,24 @@ def app(self): modal serve src/fastapi_webrtc_vision_bot_serve.py -# put config +# vllm_skyworkr1v single room bot modal volume put config ./config/bots/daily_describe_vllm_skyworkr1v_vision_bot.json /bots/ -f - -# run fastdeploy ernie4v bot to join room EXTRA_INDEX_URL=https://pypi.org/simple/ \ SERVE_TYPE=room_bot \ CONFIG_FILE=/root/.achatbot/config/bots/daily_describe_vllm_skyworkr1v_vision_bot.json \ ACHATBOT_VERSION=0.0.21.post2 \ IMAGE_NAME=vllm_skyworkr1v IMAGE_CONCURRENT_CN=1 IMAGE_GPU=L40s:4 \ modal serve src/fastapi_webrtc_vision_bot_serve.py + + +# deepseek single room bot +modal run src/download_models.py --repo-ids "FunAudioLLM/SenseVoiceSmall" +modal run src/download_models.py --repo-ids "deepseek-ai/DeepSeek-OCR" --revision "refs/pr/23" +modal volume put config ./config/bots/daily_ocr_vision_bot.json /bots/ -f +EXTRA_INDEX_URL=https://pypi.org/simple/ \ + SERVE_TYPE=room_bot \ + CONFIG_FILE=/root/.achatbot/config/bots/daily_ocr_vision_bot.json \ + ACHATBOT_VERSION=0.0.28 \ + IMAGE_NAME=deepseek_ocr IMAGE_CONCURRENT_CN=1 IMAGE_GPU=L4 \ + modal serve src/fastapi_webrtc_vision_bot_serve.py """ diff --git a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py index 4f9c08b3..80efd69c 100644 --- a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py @@ -1,9 +1,11 @@ +import io import os import sys +import uuid import asyncio import subprocess -from pathlib import Path from threading import Thread +from PIL import Image import modal @@ -11,6 +13,7 @@ app = modal.App("deepseek-ocr") IMAGE_GPU = os.getenv("IMAGE_GPU", None) +BACKEND = os.getenv("BACKEND", None) img = ( # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags modal.Image.from_registry( @@ -43,10 +46,11 @@ ) ) -# img = img.pip_install( -# f"achatbot==0.0.25.dev122", -# extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), -# ) +if BACKEND == "achatbot": + img = img.pip_install( + f"achatbot==0.0.27.dev6", + extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), + ) HF_MODEL_DIR = 
"/root/.achatbot/models" @@ -64,7 +68,7 @@ from queue import Queue import torch - from transformers.generation.streamers import TextStreamer + from transformers.generation.streamers import TextStreamer, TextIteratorStreamer from transformers import AutoModel, AutoTokenizer MODEL_ID = os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR") @@ -130,12 +134,12 @@ def dump_model(**kwargs): def infer(**kwargs): class NoEOSTextStreamer(TextStreamer): def on_finalized_text(self, text: str, stream_end: bool = False): - stream_end and print("stream_end is True", flush=True) eos_text = self.tokenizer.decode( [self.tokenizer.eos_token_id], skip_special_tokens=False ) text = text.replace(eos_text, "\n") - print(text, flush=True, end="") + # print(text, flush=True, end="") + stream_end and print("stream_end is True", flush=True) model = AutoModel.from_pretrained( MODEL_PATH, @@ -146,12 +150,118 @@ def on_finalized_text(self, text: str, stream_end: bool = False): model = model.eval().cuda().to(torch.bfloat16) tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + prompt = "\n<|grounding|>Convert the document to markdown. " # prompt = "\nFree OCR. " + # document: \n<|grounding|>Convert the document to markdown. + # other image: \n<|grounding|>OCR this image. + # without layouts: \nFree OCR. + # figures in document: \nParse the figure. + # general: \nDescribe this image in detail. + # rec: \nLocate <|ref|>xxxx<|/ref|> in the image. + # '先天下之忧而忧' + # ....... + + # create dummy image file + dummy_img = Image.new("RGB", (640, 640), color="white") + ioBuff = io.BytesIO() + dummy_img.save(ioBuff, format="PNG") + ioBuff.seek(0) + + image_files = [ + dummy_img, + ioBuff, + "/DeepSeek-OCR/assets/fig1.png", + # use ORC detected Show pictures, detect again :) + "/DeepSeek-OCR/assets/show1.jpg", + # "/DeepSeek-OCR/assets/show2.jpg", + # "/DeepSeek-OCR/assets/show3.jpg", + # "/DeepSeek-OCR/assets/show4.jpg", + ] + + # Tiny: base_size = 512, image_size = 512, crop_mode = False + # Small: base_size = 640, image_size = 640, crop_mode = False + # Base: base_size = 1024, image_size = 1024, crop_mode = False + # Large: base_size = 1280, image_size = 1280, crop_mode = False + # Gundam: base_size = 1024, image_size = 640, crop_mode = True # default + + for image_file in image_files: + print("infer image_file:", image_file) + streamer = TextIteratorStreamer( + tokenizer, + skip_prompt=True, + # skip_special_tokens=True, + skip_special_tokens=False, + ) + # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/refs%2Fpr%2F23/modeling_deepseekocr.py#L707 + gen_kwargs = dict( + tokenizer=tokenizer, + prompt=prompt, + image_file=image_file, + # output_path=DEEPSEEK_ASSETS_DIR, + base_size=1024, + image_size=640, + crop_mode=True, + save_results=False, # open save results, u can push to S3 or other storage for cdn :) + test_compress=False, + eval_mode=False, + # streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False), + streamer=streamer, + verbose=False, + ) + thread = Thread(target=model.infer, kwargs=gen_kwargs) + thread.start() + for new_text in streamer: + if new_text is not None: + print(new_text, flush=True, end="") # note: flush with buff to print + + +@torch.inference_mode() +def infer_filter(**kwargs): + class NoEOSTextStreamer(TextStreamer): + def on_finalized_text(self, text: str, stream_end: bool = False): + eos_text = self.tokenizer.decode( + [self.tokenizer.eos_token_id], skip_special_tokens=False + ) + text = text.replace(eos_text, "\n") + # print(text, flush=True, 
end="") + stream_end and print("stream_end is True", flush=True) + + model = AutoModel.from_pretrained( + MODEL_PATH, + _attn_implementation="flash_attention_2", + trust_remote_code=True, + use_safetensors=True, + # torch_dtype=torch.bfloat16, + # device_map="cuda" if torch.cuda.is_available() else "auto", # need accelerate>=0.26.0 + ) + model = model.eval().cuda().to(torch.bfloat16) + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + # NOTE: use FIED task prompt prompt = "\n<|grounding|>Convert the document to markdown. " + # prompt = "\n<|grounding|>Convert the document to text. " # have issue + # prompt = "\nFree OCR. " + # document: \n<|grounding|>Convert the document to markdown. + # other image: \n<|grounding|>OCR this image. + # without layouts: \nFree OCR. + # figures in document: \nParse the figure. + # general: \nDescribe this image in detail. + # rec: \nLocate <|ref|>xxxx<|/ref|> in the image. + # '先天下之忧而忧' + # ....... + + # create dummy image file + dummy_img = Image.new("RGB", (640, 640), color="white") + ioBuff = io.BytesIO() + dummy_img.save(ioBuff, format="PNG") + ioBuff.seek(0) + image_files = [ + dummy_img, + ioBuff, "/DeepSeek-OCR/assets/fig1.png", # use ORC detected Show pictures, detect again :) - # "/DeepSeek-OCR/assets/show1.jpg", + "/DeepSeek-OCR/assets/show1.jpg", # "/DeepSeek-OCR/assets/show2.jpg", # "/DeepSeek-OCR/assets/show3.jpg", # "/DeepSeek-OCR/assets/show4.jpg", @@ -165,28 +275,110 @@ def on_finalized_text(self, text: str, stream_end: bool = False): for image_file in image_files: print("infer image_file:", image_file) - # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/main/modeling_deepseekocr.py#L703 - res = model.infer( + streamer = TextIteratorStreamer( tokenizer, + skip_prompt=True, + # skip_special_tokens=True, + skip_special_tokens=False, + ) + # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/refs%2Fpr%2F23/modeling_deepseekocr.py#L707 + gen_kwargs = dict( + tokenizer=tokenizer, prompt=prompt, image_file=image_file, - output_path=DEEPSEEK_ASSETS_DIR, + # output_path=DEEPSEEK_ASSETS_DIR, base_size=1024, image_size=640, crop_mode=True, - save_results=True, - test_compress=True, + save_results=False, # open save results, u can push to S3 or other storage for cdn :) + test_compress=False, eval_mode=False, - streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False), + # streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False), + streamer=streamer, + verbose=False, + ) + thread = Thread(target=model.infer, kwargs=gen_kwargs) + thread.start() + is_ref_det = False + det_text = "" + for new_text in streamer: + if new_text is None: + continue + print(new_text, flush=True, end="") # note: flush with buff to print + if "<|ref|>" in new_text: + is_ref_det = True + + if "<|/det|>" in new_text: + is_ref_det = False + new_text = new_text.split("<|/det|>")[1] + + if "<|end▁of▁sentence|>" in new_text: + if "<|/ref|>" not in new_text: + new_text = new_text.split("<|end▁of▁sentence|>")[0] + print("\n" + 20 * "----") + if "<|end▁of▁sentence|>" in new_text: + if "<|/ref|>" not in new_text: + new_text = new_text.split("<|end▁of▁sentence|>")[0] + print("\n" + 20 * "----") + + if is_ref_det is False: + det_text += new_text + print("det_text:", det_text) + torch.cuda.empty_cache() + + +async def achatbot_infer(**kwargs): + from achatbot.processors.vision.ocr_processor import OCRProcessor + from achatbot.modules.vision.ocr import VisionOCREnvInit + from achatbot.common.session import 
SessionCtx, Session + from achatbot.types.frames.data_frames import UserImageRawFrame + from achatbot.common.logger import Logger + + Logger.init(os.getenv("LOG_LEVEL", "info").upper(), is_file=False, is_console=True) + + ocr = VisionOCREnvInit.initVisionOCREngine( + "llm_transformers_manual_vision_deepseek_ocr", + { + "lm_model_name_or_path": MODEL_PATH, + "lm_device": "cuda", + "ocr_base_size": 1024, + "ocr_image_size": 640, + "ocr_crop_mode": True, + "ocr_prompt": "\n<|grounding|>Convert the document to markdown. ", + }, + ) + session = Session(**SessionCtx(str(uuid.uuid4())).__dict__) + processor = OCRProcessor(ocr=ocr, session=session) + image_files = [ + Image.new("RGB", (640, 640), color="white"), + Image.open("/DeepSeek-OCR/assets/fig1.png"), + # use ORC detected Show pictures, detect again :) + Image.open("/DeepSeek-OCR/assets/show1.jpg"), + # Image.open("/DeepSeek-OCR/assets/show2.jpg"), + # Image.open("/DeepSeek-OCR/assets/show3.jpg"), + # Image.open("/DeepSeek-OCR/assets/show4.jpg"), + ] + for image_obj in image_files: + image_obj: Image.Image = image_obj + frame = UserImageRawFrame( + image=image_obj.tobytes(), + size=image_obj.size, + format=image_obj.format, # from frame bytes, no save format, need add a save format e.g.: JPEG,PNG, + mode=image_obj.mode, # default: RGB + user_id=session.ctx.client_id, ) - print("infer result:", res) + iter = processor.run_detect(frame) + async for textFrame in iter: + print(textFrame) """ modal run src/download_models.py --repo-ids "deepseek-ai/DeepSeek-OCR" --revision "refs/pr/23" IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task dump_model -IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek..py --task infer +IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task infer +IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task infer_filter +BACKEND=achatbot IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task achatbot_infer """ @@ -195,6 +387,8 @@ def main(task: str = "dump_model"): tasks = { "dump_model": dump_model, "infer": infer, + "infer_filter": infer_filter, + "achatbot_infer": achatbot_infer, } if task not in tasks: raise ValueError(f"task {task} not found") diff --git a/pyproject.toml b/pyproject.toml index 0f3c0002..ae710555 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ local_scheme = "no-local-version" [project] name = "achatbot" #dynamic = ["version"] -version = "0.0.27.post0" +version = "0.0.28" authors = [{ name = "weedge", email = "weege007@gmail.com" }] maintainers = [{ name = "weedge", email = "weege007@gmail.com" }] description = "An open source chat bot for voice (and multimodal) assistants" @@ -113,7 +113,7 @@ einops = ["einops~=0.8.0"] flash-attn = ["flash-attn==2.7.3"] tiktoken = ["tiktoken~=0.7.0"] verovio = ["verovio~=4.3.1"] -accelerate = ["accelerate~=1.7.0"] +accelerate = ["accelerate>=0.26.0"] opencv = ["opencv-python~=4.10.0.84"] librosa = ["librosa~=0.10.2.post1"] soundfile = ["soundfile~=0.12.1"] @@ -203,7 +203,7 @@ speech_waker = ["achatbot[porcupine_wakeword]"] # vad module tag -> pkgs pyannote_vad = ["pyannote.audio~=3.2.0"] webrtcvad = ["webrtcvad~=2.0.10"] -silero_vad = ["achatbot[torch_vision_audio]"] +silero_vad = ["achatbot[torch_vision_audio]", "onnxruntime",] webrtc_silero_vad = ["achatbot[webrtcvad,silero_vad]"] speech_vad = ["achatbot[pyannote_vad,webrtcvad,silero_vad]"] @@ -321,6 +321,14 @@ llm_transformers_manual_vision_skyworkr1v = [ "achatbot[llm_transformers_manual_vision]", "timm", ] 
+llm_transformers_manual_vision_deepseek_ocr = [ + "achatbot[llm_transformers_manual_vision,einops,matplotlib,accelerate]", + "easydict", + "addict", + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", +] # vision + audio -> text llm_transformers_manual_vision_speech_phi = [ @@ -441,7 +449,6 @@ llm_transformers_manual_voice_vita = [ "WeTextProcessing", "inflect", "openai-whisper", - "onnxruntime", "modelscope", "word2number", "pyworld", diff --git a/src/cmd/bots/vision/agora_ocr_vision_bot.py b/src/cmd/bots/vision/agora_ocr_vision_bot.py index 834cad45..12389fb9 100644 --- a/src/cmd/bots/vision/agora_ocr_vision_bot.py +++ b/src/cmd/bots/vision/agora_ocr_vision_bot.py @@ -68,7 +68,7 @@ async def on_first_participant_joined( [ transport.input_processor(), asr_processor, - image_requester, + image_requester,# send a request for video_source to capture a picture ocr_processor, tts_processor, transport.output_processor(), diff --git a/src/cmd/bots/vision/daily_ocr_vision_bot.py b/src/cmd/bots/vision/daily_ocr_vision_bot.py index b21664bb..23f5d106 100644 --- a/src/cmd/bots/vision/daily_ocr_vision_bot.py +++ b/src/cmd/bots/vision/daily_ocr_vision_bot.py @@ -57,7 +57,7 @@ async def arun(self): [ transport.input_processor(), self.asr_processor, - self.image_requester, + self.image_requester, # send a request for video_source to capture a picture self.ocr_processor, self.tts_processor, transport.output_processor(), diff --git a/src/cmd/bots/vision/livekit_ocr_vision_bot.py b/src/cmd/bots/vision/livekit_ocr_vision_bot.py index 0162fefb..6c22adcd 100644 --- a/src/cmd/bots/vision/livekit_ocr_vision_bot.py +++ b/src/cmd/bots/vision/livekit_ocr_vision_bot.py @@ -77,7 +77,7 @@ async def on_video_track_subscribed( transport.input_processor(), asr_processor, image_requester, - ocr_processor, + ocr_processor, # send a request for video_source to capture a picture tts_processor, transport.output_processor(), ] diff --git a/src/common/interface.py b/src/common/interface.py index 66403ac1..381c3707 100644 --- a/src/common/interface.py +++ b/src/common/interface.py @@ -463,7 +463,7 @@ def annotate(self, session) -> Generator[Any, None, None]: class IVisionOCR(ABC): @abstractmethod - def generate(self, session) -> Iterator[str]: + def generate(self, session, **kwargs) -> Iterator[str | dict | np.ndarray]: """ input: session.ctx.state["ocr_img"] detect object and generate text @@ -471,14 +471,6 @@ def generate(self, session) -> Iterator[str]: """ raise NotImplementedError("must be implemented in the child class") - def stream_infer(self, session) -> Iterator[str]: - """ - input: session.ctx.state["ocr_img"] - detect object and generate text - return iterator next token (str) - """ - raise NotImplementedError("must be implemented in the child class") - class IRoomManager(ABC): @abstractmethod diff --git a/src/core/llm/__init__.py b/src/core/llm/__init__.py index a033b487..46d7e8f2 100644 --- a/src/core/llm/__init__.py +++ b/src/core/llm/__init__.py @@ -134,6 +134,8 @@ def getEngine(tag, **kwargs) -> interface.ILlmGenerator | interface.ILlm | Engin from .transformers import manual_vision_ernie4v elif "llm_transformers_manual_vision_skyworkr1v" in tag: from .transformers import manual_vision_skyworkr1v + elif "llm_transformers_manual_vision_deepseek_ocr" in tag: + from .transformers import manual_vision_ocr_deepseek elif "llm_transformers_manual_voice_step2" in tag: from .transformers import manual_voice_step2 elif "llm_transformers_manual" == tag: @@ -599,6 +601,7 @@ def 
get_vita_audio_transformers_args() -> dict: "llm_transformers_manual_vision_glm4v": get_llm_transformers_args, "llm_transformers_manual_vision_ernie4v": get_llm_transformers_args, "llm_transformers_manual_vision_skyworkr1v": get_llm_transformers_args, + "llm_transformers_manual_vision_deepseek_ocr": get_llm_transformers_args, "llm_transformers_manual_vision_gemma3": get_llm_transformers_args, "llm_transformers_manual_vision_speech_gemma3n": get_llm_transformers_args, "llm_transformers_manual_vision_gemma3n": get_llm_transformers_args, diff --git a/src/core/llm/transformers/manual_vision_ocr_deepseek.py b/src/core/llm/transformers/manual_vision_ocr_deepseek.py new file mode 100644 index 00000000..15d41b46 --- /dev/null +++ b/src/core/llm/transformers/manual_vision_ocr_deepseek.py @@ -0,0 +1,182 @@ +import io +import logging +from threading import Thread +from PIL import Image +from time import perf_counter + +try: + from transformers import AutoTokenizer, TextIteratorStreamer, AutoModel + import torch + +except ModuleNotFoundError as e: + logging.error(f"Exception: {e}") + logging.error( + "In order to use Smol-VLM, you need to `pip install achatbot[llm_transformers_manual_vision_deepseek_ocr]`" + ) + raise Exception(f"Missing module: {e}") + + +from src.common.utils.helper import get_device, print_model_params +from src.common.interface import IVisionOCR +from src.common.random import set_all_random_seed +from src.common.session import Session +from src.types.speech.language import TO_LLM_LANGUAGE +from src.types.llm.transformers import TransformersLMArgs +from .base import TransformersBaseLLM + + +class TransformersManualVisionDeepSeekOCR(TransformersBaseLLM, IVisionOCR): + TAG = "llm_transformers_manual_vision_deepseek_ocr" + + def __init__(self, tokenizer=None, **args) -> None: + self.base_size = args.pop("ocr_base_size", 1024) + self.image_size = args.pop("ocr_image_size", 640) + self.crop_mode = args.pop("ocr_crop_mode", True) + # Tiny: base_size = 512, image_size = 512, crop_mode = False + # Small: base_size = 640, image_size = 640, crop_mode = False + # Base: base_size = 1024, image_size = 1024, crop_mode = False + # Large: base_size = 1280, image_size = 1280, crop_mode = False + # Gundam: base_size = 1024, image_size = 640, crop_mode = True # default + + self.prompt = args.pop( + "ocr_prompt", "\n<|grounding|>Convert the document to markdown. " + ) + # document: \n<|grounding|>Convert the document to markdown. + # other image: \n<|grounding|>OCR this image. + # without layouts: \nFree OCR. + # figures in document: \nParse the figure. + # general: \nDescribe this image in detail. + # rec: \nLocate <|ref|>xxxx<|/ref|> in the image. 
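+        # For example, to switch tasks at runtime with one of the prompts above
+        # (each prompt starts with the "<image>" placeholder token):
+        #   ocr_engine.set_task_prompt("<image>\nFree OCR. ")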
+ + self.args = TransformersLMArgs(**args) + gpu_prop = torch.cuda.get_device_properties("cuda") + + if self.args.lm_device_map: + self._model = AutoModel.from_pretrained( + self.args.lm_model_name_or_path, + torch_dtype=torch.bfloat16, + #!NOTE: https://github.com/huggingface/transformers/issues/20896 + # device_map for multi cpu/gpu with accelerate + device_map=self.args.lm_device_map, + attn_implementation="flash_attention_2" + if gpu_prop and gpu_prop.major >= 8 + else None, + trust_remote_code=True, + ).eval() + else: + self._model = ( + AutoModel.from_pretrained( + self.args.lm_model_name_or_path, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" + if gpu_prop and gpu_prop.major >= 8 + else None, + trust_remote_code=True, + ) + .eval() + .to(self.args.lm_device) + ) + + logging.info(f"TransformersLMArgs: {self.args}") + print_model_params(self._model, self.TAG) + self._tokenizer = tokenizer or AutoTokenizer.from_pretrained( + self.args.lm_model_name_or_path, use_fast=True + ) + self.warmup() + + def warmup(self): + if self.args.warmup_steps <= 0: + return + + # create dummy image file + dummy_img = Image.new("RGB", (640, 640), color="white") + ioBuff = io.BytesIO() + dummy_img.save(ioBuff, format="PNG") + ioBuff.seek(0) + + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=False + ) + + warmup_gen_kwargs = dict( + tokenizer=self._tokenizer, + prompt=self.prompt, + image_file=ioBuff, + base_size=self.base_size, + image_size=self.image_size, + crop_mode=self.crop_mode, + save_results=False, + test_compress=False, + eval_mode=False, + verbose=False, + streamer=streamer, + ) + + self._warmup( + target=self._model.infer, + kwargs=warmup_gen_kwargs, + streamer=streamer, + ) + + def set_task_prompt(self, prompt: str): + self.prompt = prompt + + @torch.inference_mode() + def generate(self, session: Session, **kwargs): + seed = kwargs.get("seed", self.args.lm_gen_seed) + set_all_random_seed(seed) + + ocr_img = session.ctx.state["ocr_img"] + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=False + ) + generation_kwargs = dict( + tokenizer=self._tokenizer, + prompt=self.prompt, + image_file=ocr_img, + base_size=self.base_size, + image_size=self.image_size, + crop_mode=self.crop_mode, + save_results=False, + test_compress=False, + eval_mode=False, + verbose=False, + streamer=streamer, + ) + thread = Thread(target=self._model.infer, kwargs=generation_kwargs) + thread.start() + + generated_text = "" + start = perf_counter() + times = [] + is_ref_det = False + sentence = "" + # TODO: extract detect image to storage with s3 (use callback) + for new_text in streamer: + times.append(perf_counter() - start) + generated_text += new_text + if "<|ref|>" in new_text: + is_ref_det = True + + if "<|/det|>" in new_text: + is_ref_det = False + new_text = new_text.split("<|/det|>")[1] + + if "<|end▁of▁sentence|>" in new_text: + if "<|/ref|>" not in new_text: + new_text = new_text.split("<|end▁of▁sentence|>")[0] + if "<|end▁of▁sentence|>" in new_text: + if "<|/ref|>" not in new_text: + new_text = new_text.split("<|end▁of▁sentence|>")[0] + + if is_ref_det is False: + sentence += new_text + pos = self._have_special_char(sentence) + if pos > -1: + yield sentence[: pos + 1] + sentence = sentence[pos + 1 :] + start = perf_counter() + if len(sentence) > 0: + yield sentence + "." 
+ logging.info(f"{generated_text=} TTFT: {times[0]:.4f}s total time: {sum(times):.4f}s") + torch.cuda.empty_cache() diff --git a/src/modules/vision/ocr/__init__.py b/src/modules/vision/ocr/__init__.py index 033011ec..ee1231bb 100644 --- a/src/modules/vision/ocr/__init__.py +++ b/src/modules/vision/ocr/__init__.py @@ -1,6 +1,7 @@ import logging import os +from src.core.llm import LLMEnvInit from src.common import interface from src.common.factory import EngineClass, EngineFactory from src.common.types import MODELS_DIR @@ -16,6 +17,10 @@ class VisionOCREnvInit: def getEngine(tag, **kwargs) -> interface.IVisionOCR | EngineClass: if "vision_transformers_got_ocr" in tag: from . import transformers_got + elif ( + "llm_transformers_manual_vision_deepseek_ocr" in tag + ): # modules/vision/ocr dep core/llm/transformers :) + from src.core.llm.transformers import manual_vision_ocr_deepseek engine = EngineFactory.get_engine_by_tag(EngineClass, tag, **kwargs) return engine @@ -54,4 +59,5 @@ def get_transformers_got_ocr_args() -> dict: # TAG : config map_config_func = { "vision_transformers_got_ocr": get_transformers_got_ocr_args, + "llm_transformers_manual_vision_deepseek_ocr": LLMEnvInit.get_llm_transformers_args, } diff --git a/test/modules/vision/ocr/test_transformers_got.py b/test/modules/vision/ocr/test_transformers_got.py index f8a7d56b..de10a298 100644 --- a/test/modules/vision/ocr/test_transformers_got.py +++ b/test/modules/vision/ocr/test_transformers_got.py @@ -48,7 +48,7 @@ def test_ocr_generate(self): for image in image_cases: with self.subTest(image=image): self.session.ctx.state["ocr_img"] = image - iter = self.engine.stream_infer(self.session) + iter = self.engine.generate(self.session) generated_text = "" times = [] From 57f7e0edf146e557e23fb62e7b42c0c5611d7746 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:02:14 +0800 Subject: [PATCH 3/9] Update deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py index 26448fcb..9c240385 100644 --- a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py @@ -165,7 +165,7 @@ async def stream_infer(**kwargs): with open(f"{DEEPSEEK_ASSETS_DIR}/result_ori.mmd", "w", encoding="utf-8") as afile: afile.write(outputs) - matches_ref, matches_images, mathes_other = re_match(outputs) + matches_ref, matches_images, matches_other = re_match(outputs) # print(matches_ref) # save images with boxes result = process_image_with_refs(image_draw, matches_ref) From 391ffc2fe15ab1e6b786df4ee486196997b1efa1 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:02:59 +0800 Subject: [PATCH 4/9] Update deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py index 9c240385..2aeade0b 100644 --- a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py @@ -173,7 +173,7 @@ async def stream_infer(**kwargs): for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")): 
outputs = outputs.replace(a_match_image, f"![](images/" + str(idx) + ".jpg)\n") - for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")): + for idx, a_match_other in enumerate(tqdm(matches_other, desc="other")): outputs = ( outputs.replace(a_match_other, "") .replace("\\coloneqq", ":=") From 402b67b6cfff3fd560bbcf2d6a1ba6958ba93101 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:03:12 +0800 Subject: [PATCH 5/9] Update deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py index 2aeade0b..39c869b8 100644 --- a/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/vllm/vlm/ocr_deepseek.py @@ -122,7 +122,7 @@ ) async def run(func, **kwargs): os.makedirs(DEEPSEEK_ASSETS_DIR, exist_ok=True) - os.makedirs(f"{DEEPSEEK_ASSETS_DIR}/images", exist_ok=True) + os.makedirs(os.path.join(DEEPSEEK_ASSETS_DIR, "images"), exist_ok=True) subprocess.run("nvidia-smi --version", shell=True) subprocess.run("nvcc --version", shell=True) gpu_prop = None From 946467debc374626e9a828d7bee0a38cfb22f1a1 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:03:25 +0800 Subject: [PATCH 6/9] Update deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py index 80efd69c..8862b35f 100644 --- a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py @@ -363,7 +363,7 @@ async def achatbot_infer(**kwargs): frame = UserImageRawFrame( image=image_obj.tobytes(), size=image_obj.size, - format=image_obj.format, # from frame bytes, no save format, need add a save format e.g.: JPEG,PNG, + format=image_obj.format or "PNG", # from frame bytes, no save format, need add a save format e.g.: JPEG,PNG, mode=image_obj.mode, # default: RGB user_id=session.ctx.client_id, ) From 682b05d9d4c69c6c35579116c7aae3db9daa4e6b Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:34:54 +0800 Subject: [PATCH 7/9] Update src/core/llm/transformers/manual_vision_ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/core/llm/transformers/manual_vision_ocr_deepseek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/llm/transformers/manual_vision_ocr_deepseek.py b/src/core/llm/transformers/manual_vision_ocr_deepseek.py index 15d41b46..dcfabeca 100644 --- a/src/core/llm/transformers/manual_vision_ocr_deepseek.py +++ b/src/core/llm/transformers/manual_vision_ocr_deepseek.py @@ -49,7 +49,7 @@ def __init__(self, tokenizer=None, **args) -> None: # rec: \nLocate <|ref|>xxxx<|/ref|> in the image. 
self.args = TransformersLMArgs(**args) - gpu_prop = torch.cuda.get_device_properties("cuda") + gpu_prop = torch.cuda.get_device_properties("cuda") if torch.cuda.is_available() else None if self.args.lm_device_map: self._model = AutoModel.from_pretrained( From 43e4c6477f354a0aac73817a07b621b4bd9e6162 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 18:35:17 +0800 Subject: [PATCH 8/9] Update src/core/llm/transformers/manual_vision_ocr_deepseek.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/core/llm/transformers/manual_vision_ocr_deepseek.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/llm/transformers/manual_vision_ocr_deepseek.py b/src/core/llm/transformers/manual_vision_ocr_deepseek.py index dcfabeca..2ed213ed 100644 --- a/src/core/llm/transformers/manual_vision_ocr_deepseek.py +++ b/src/core/llm/transformers/manual_vision_ocr_deepseek.py @@ -178,5 +178,6 @@ def generate(self, session: Session, **kwargs): start = perf_counter() if len(sentence) > 0: yield sentence + "." - logging.info(f"{generated_text=} TTFT: {times[0]:.4f}s total time: {sum(times):.4f}s") + if times: + logging.info(f"{generated_text=} TTFT: {times[0]:.4f}s total time: {sum(times):.4f}s") torch.cuda.empty_cache() From 50555dabc601b16bc9a014ad88a2ea896cdea718 Mon Sep 17 00:00:00 2001 From: weedge Date: Wed, 22 Oct 2025 22:46:34 +0800 Subject: [PATCH 9/9] fix test Signed-off-by: weedge --- deploy/modal/src/fastapi_webrtc_vision_bot_serve.py | 5 ++--- deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py | 4 ++-- pyproject.toml | 2 ++ src/modules/speech/tts/edge_tts.py | 9 +++++---- test/modules/speech/tts/test_kokoro.py | 2 +- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py b/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py index 997972bc..228640e6 100644 --- a/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py +++ b/deploy/modal/src/fastapi_webrtc_vision_bot_serve.py @@ -323,7 +323,6 @@ class ContainerRuntimeConfig: extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"), ) .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation") - .pip_install("onnxruntime") .env( { "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", @@ -380,8 +379,8 @@ def get_allow_concurrent_inputs(): ) # img = img.pip_install( -# f"achatbot==0.0.21.post3", -# extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://pypi.org/simple/"), +# f"achatbot==0.0.27.dev11", +# extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), # ) HF_MODEL_DIR = "/root/.achatbot/models" diff --git a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py index 8862b35f..25f9ec10 100644 --- a/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py +++ b/deploy/modal/src/llm/transformers/vlm/ocr_deepseek.py @@ -48,7 +48,7 @@ if BACKEND == "achatbot": img = img.pip_install( - f"achatbot==0.0.27.dev6", + f"achatbot==0.0.27.dev8", extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"), ) @@ -363,7 +363,7 @@ async def achatbot_infer(**kwargs): frame = UserImageRawFrame( image=image_obj.tobytes(), size=image_obj.size, - format=image_obj.format or "PNG", # from frame bytes, no save format, need add a save format e.g.: JPEG,PNG, + format=image_obj.format, # from frame bytes, no save format, need add a save format e.g.: JPEG,PNG, mode=image_obj.mode, # default: RGB 
user_id=session.ctx.client_id, ) diff --git a/pyproject.toml b/pyproject.toml index ae710555..ff164aad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -328,6 +328,8 @@ llm_transformers_manual_vision_deepseek_ocr = [ "torch==2.6.0", "torchvision==0.21.0", "torchaudio==2.6.0", + "transformers==4.46.3", + "tokenizers==0.20.3", ] # vision + audio -> text diff --git a/src/modules/speech/tts/edge_tts.py b/src/modules/speech/tts/edge_tts.py index f1c8bff7..c49b4b40 100644 --- a/src/modules/speech/tts/edge_tts.py +++ b/src/modules/speech/tts/edge_tts.py @@ -43,11 +43,10 @@ async def _inference( ) self.args.voice_name = random.choice(voices)["ShotName"] self.voice_name = self.args.voice_name - logging.info(f"{self.TAG} voice: {self.voice_name}") - communicate: edge_tts.Communicate = edge_tts.Communicate( - text, - self.args.voice_name, + args = dict( + text=text, + voice=self.args.voice_name, rate=self.args.rate, volume=self.args.volume, pitch=self.args.pitch, @@ -55,6 +54,8 @@ async def _inference( connect_timeout=self.args.connect_timeout, receive_timeout=self.args.receive_timeout, ) + logging.info(f"{self.TAG} voice: {self.voice_name} args: {args}") + communicate: edge_tts.Communicate = edge_tts.Communicate(**args) self.submaker = edge_tts.SubMaker() # "outputFormat":"audio-24khz-48kbitrate-mono-mp3" diff --git a/test/modules/speech/tts/test_kokoro.py b/test/modules/speech/tts/test_kokoro.py index 4130a42a..ebcaa0e0 100644 --- a/test/modules/speech/tts/test_kokoro.py +++ b/test/modules/speech/tts/test_kokoro.py @@ -72,7 +72,7 @@ def test_synthesize(self): output=True, ) - self.test_set_voice() + # self.test_set_voice() self.session.ctx.state["tts_text"] = self.tts_text print(self.session.ctx)
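
A minimal usage sketch for the new llm_transformers_manual_vision_deepseek_ocr engine, based on the achatbot_infer task and the updated OCR test above (the model path and input image are illustrative):

import uuid

from PIL import Image

from achatbot.common.session import Session, SessionCtx
from achatbot.modules.vision.ocr import VisionOCREnvInit

# build the OCR engine; kwargs mirror the achatbot_infer task in this PR
ocr = VisionOCREnvInit.initVisionOCREngine(
    "llm_transformers_manual_vision_deepseek_ocr",
    {
        "lm_model_name_or_path": "/root/.achatbot/models/deepseek-ai/DeepSeek-OCR",
        "lm_device": "cuda",
        "ocr_prompt": "<image>\n<|grounding|>Convert the document to markdown. ",
    },
)

# one session per client; the engine reads the image from session state
session = Session(**SessionCtx(str(uuid.uuid4())).__dict__)
session.ctx.state["ocr_img"] = Image.open("doc.png").convert("RGB")  # illustrative input

# generate() streams text chunks; the engine filters grounding/eos special tokens from the stream
for chunk in ocr.generate(session):
    print(chunk, end="", flush=True)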