From 3794599a219fd482ac307dc2d58bf48764890623 Mon Sep 17 00:00:00 2001
From: weedge
Date: Thu, 7 Aug 2025 17:43:20 +0800
Subject: [PATCH 1/3] add vllm + openai_gpt_oss on modal to run

Signed-off-by: weedge
---
 deploy/modal/src/llm/vllm/openai_gpt_oss.py | 823 ++++++++++++++++++++
 1 file changed, 823 insertions(+)
 create mode 100644 deploy/modal/src/llm/vllm/openai_gpt_oss.py

diff --git a/deploy/modal/src/llm/vllm/openai_gpt_oss.py b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
new file mode 100644
index 00000000..9f70fad5
--- /dev/null
+++ b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
@@ -0,0 +1,823 @@
+import datetime
+import os
+from pathlib import Path
+import sys
+import asyncio
+import subprocess
+import threading
+import uuid
+import json
+from time import perf_counter
+
+
+import modal
+
+
+app = modal.App("openai_gpt_oss")
+IMAGE_GPU = os.getenv("IMAGE_GPU", None)
+BACKEND = os.getenv("BACKEND", "")
+APP_NAME = os.getenv("APP_NAME", "")
+TP = os.getenv("TP", "1")
+
+img = (
+    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
+    modal.Image.from_registry(
+        "nvcr.io/nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04",
+        add_python="3.12",
+    )
+    .apt_install("git", "git-lfs")
+    .run_commands(
+        "uv pip install --system --pre vllm==0.10.1+gptoss "
+        "--extra-index-url https://wheels.vllm.ai/gpt-oss/ "
+        "--extra-index-url https://download.pytorch.org/whl/nightly/cu128 "
+        "--index-strategy unsafe-best-match",
+    )
+    .pip_install("openai-harmony")
+    .env(
+        {
+            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
+            # "TQDM_DISABLE": "1",
+            "LLM_MODEL": os.getenv("LLM_MODEL", "openai/gpt-oss-20b"),
+            "VLLM_USE_V1": "1",
+            "TP": TP,
+            "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+            "TORCH_CUDA_ARCH_LIST": "8.0 8.9 9.0+PTX",
+        }
+    )
+    .run_commands("git clone https://github.com/weedge/gpt-oss.git")
+)
+if BACKEND == "flashinfer":
+    img = img.pip_install(
+        "flashinfer-python",
+        extra_index_url="https://wheels.vllm.ai/gpt-oss/",
+    )
+
+# if IMAGE_GPU in ["H100", "B100", "H200", "B200"]:
+#     img = img.pip_install("triton==3.4.0")
+# else:
+#     img = img.pip_install("triton==3.3.1")
+
+
+HF_MODEL_DIR = "/root/.achatbot/models"
+hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
+VLLM_CACHE_DIR = "/root/.cache/vllm"
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+
+with img.imports():
+    import termcolor
+    import torch
+    from vllm import LLM, LLMEngine, EngineArgs, SamplingParams, TokensPrompt
+    from openai_harmony import (
+        HarmonyEncodingName,
+        load_harmony_encoding,
+        Conversation,
+        Message,
+        Role,
+        SystemContent,
+        DeveloperContent,
+        ReasoningEffort,
+        StreamableParser,
+        StreamState,
+        TextContent,
+        ToolDescription,
+        Author,
+    )
+
+    sys.path.insert(0, "/gpt-oss")
+    from gpt_oss.tools import apply_patch
+    from gpt_oss.tools.simple_browser import SimpleBrowserTool
+    from gpt_oss.tools.simple_browser.backend import ExaBackend
+    from gpt_oss.tools.python_docker.docker_tool import PythonTool
+
+    REASONING_EFFORT = {
+        "high": ReasoningEffort.HIGH,
+        "medium": ReasoningEffort.MEDIUM,
+        "low": ReasoningEffort.LOW,
+    }
+
+    MODEL_PATH = os.getenv("LLM_MODEL", "openai/gpt-oss-20b")
+    model_path = os.path.join(HF_MODEL_DIR, MODEL_PATH)
+
+    class TokenGenerator:
+        def __init__(
+            self,
+            model_path: str,
+            tensor_parallel_size: int = 1,
+            gpu_memory_utilization: float = 0.7,
+        ):
+            args = EngineArgs(
+                model=model_path,
+                tensor_parallel_size=tensor_parallel_size,
+                gpu_memory_utilization=gpu_memory_utilization,
+                # quantization="mxfp4",
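+                # NOTE: quantization is left unset; gpt-oss checkpoints ship
+                # mxfp4 weights, which this vLLM build should pick up from the
+                # model config automatically, so forcing it here is not required.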
+            )
+            self.engine = LLMEngine.from_engine_args(args)
+            self.request_id = 0
+
+        def generate(
+            self,
+            prompt_tokens: list[int],
+            stop_tokens: list[int] | None = None,
+            temperature: float = 1.0,
+            max_tokens: int = 0,
+            return_logprobs: bool = False,
+        ):
+            if max_tokens == 0:
+                max_tokens = None
+            request_id = str(self.request_id)
+            self.request_id += 1
+            sampling_params = SamplingParams(
+                temperature=temperature,
+                max_tokens=max_tokens,
+                stop_token_ids=stop_tokens,
+                logprobs=0 if return_logprobs else None,
+            )
+            prompt = TokensPrompt(prompt_token_ids=prompt_tokens)
+            self.engine.add_request(request_id, prompt, sampling_params)
+            last_token_id = []  # all token ids emitted so far; used to slice out the new ones each step
+            while self.engine.has_unfinished_requests():
+                step_outputs = self.engine.step()
+                output = step_outputs[0].outputs[0]
+                token_ids = output.token_ids
+                logprobs_list = output.logprobs if hasattr(output, "logprobs") else None
+                new_token_ids = token_ids[len(last_token_id) :]
+                new_logprobs = (
+                    logprobs_list[len(last_token_id) :]
+                    if logprobs_list is not None
+                    else [None] * len(new_token_ids)
+                )
+                for token_id, logprobs in zip(new_token_ids, new_logprobs):
+                    last_token_id.append(token_id)
+                    if return_logprobs:
+                        logprob_val = None
+                        if logprobs is not None and token_id in logprobs:
+                            logprob_val = logprobs[token_id].logprob
+                        yield (token_id, logprob_val)
+                    else:
+                        yield token_id
+                    if stop_tokens is not None and token_id in stop_tokens:
+                        break
+
+
+@app.function(
+    gpu=IMAGE_GPU,
+    cpu=2.0,
+    retries=0,
+    image=img,
+    volumes={
+        HF_MODEL_DIR: hf_model_vol,
+        VLLM_CACHE_DIR: vllm_cache_vol,
+    },
+    timeout=1200,  # default 300s
+    scaledown_window=1200,
+    max_containers=1,
+)
+async def run(func, **kwargs):
+    subprocess.run("nvidia-smi --version", shell=True)
+    subprocess.run("nvcc --version", shell=True)
+    if torch.cuda.is_available():
+        gpu_prop = torch.cuda.get_device_properties("cuda")
+        print(gpu_prop)
+        assert gpu_prop.major >= 9, "the vLLM gpt-oss build currently requires CUDA compute capability 9.0+"
+
+    if asyncio.iscoroutinefunction(func):
+        await func(**kwargs)
+    else:
+        func(**kwargs)
+
+
+def get_tokenizer():
+    import tiktoken
+
+    o200k_base = tiktoken.get_encoding("o200k_base")
+    tokenizer = tiktoken.Encoding(
+        name="o200k_harmony",
+        pat_str=o200k_base._pat_str,
+        mergeable_ranks=o200k_base._mergeable_ranks,
+        special_tokens={
+            **o200k_base._special_tokens,
+            "<|startoftext|>": 199998,
+            "<|endoftext|>": 199999,
+            "<|reserved_200000|>": 200000,
+            "<|reserved_200001|>": 200001,
+            "<|return|>": 200002,
+            "<|constrain|>": 200003,
+            "<|reserved_200004|>": 200004,
+            "<|channel|>": 200005,
+            "<|start|>": 200006,
+            "<|end|>": 200007,
+            "<|message|>": 200008,
+            "<|reserved_200009|>": 200009,
+            "<|reserved_200010|>": 200010,
+            "<|reserved_200011|>": 200011,
+            "<|call|>": 200012,
+        }
+        | {f"<|reserved_{i}|>": i for i in range(200013, 201088)},
+    )
+    return tokenizer
+
+
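+# For reference: in o200k_harmony, <|start|>/<|message|>/<|end|> (200006/200008/200007)
+# frame each message, <|channel|> (200005) tags the analysis/commentary/final channel,
+# and <|return|> (200002) / <|call|> (200012) terminate an assistant turn.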
"You are ChatGPT, a large language model trained by OpenAI." + ) + convo = Conversation.from_messages( + [ + Message.from_role_and_content( + Role.SYSTEM, + SystemContent.new() + .with_reasoning_effort(REASONING_EFFORT[reasoning_effort]) + .with_model_identity(model_identity) + .with_conversation_start_date(datetime.datetime.now().strftime("%Y-%m-%d")), + ), + Message.from_role_and_content( + Role.DEVELOPER, + DeveloperContent.new().with_instructions("Always respond in riddles"), + ), + Message.from_role_and_content(Role.USER, "What is the weather like in SF?"), + ] + ) + + prompt_token_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT) + + # Harmony stop tokens (pass to sampler so they won't be included in output) + stop_token_ids = encoding.stop_tokens_for_assistant_actions() + + prompt_token = tokenizer.decode(prompt_token_ids) + print(prompt_token_ids, prompt_token) + + print("\n" + "----" * 20 + "\n") + + stop_tokens = tokenizer.decode(stop_token_ids) + print(stop_tokens, stop_token_ids) + + +def generate(**kwargs): + # --- 1) Render the prefill with Harmony --- + encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + convo = Conversation.from_messages( + [ + Message.from_role_and_content(Role.SYSTEM, SystemContent.new()), + Message.from_role_and_content( + Role.DEVELOPER, + DeveloperContent.new().with_instructions("Always respond in riddles"), + ), + Message.from_role_and_content(Role.USER, "What is the weather like in SF?"), + ] + ) + + prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT) + + # Harmony stop tokens (pass to sampler so they won't be included in output) + stop_token_ids = encoding.stop_tokens_for_assistant_actions() + print(stop_token_ids) + + # --- 2) Run vLLM with prefill --- + llm = LLM( + model=model_path, + tensor_parallel_size=int(os.getenv("TP", "1")), + trust_remote_code=True, + gpu_memory_utilization=0.7, + quantization="mxfp4", + ) + + sampling = SamplingParams( + temperature=0.6, + top_p=0.95, + max_tokens=128, + repetition_penalty=1.1, + stop_token_ids=stop_token_ids, + ) + for i in range(2): + start = perf_counter() + outputs = llm.generate( + prompt_token_ids=[prefill_ids], # batch of size 1 + sampling_params=sampling, + ) + print(f"{i} generate cost:{(perf_counter() - start):.3f} s") + print(outputs) + + # vLLM gives you both text and token IDs + gen = outputs[0].outputs[0] + text = gen.text + print(f"generate text: {text}") + output_tokens = gen.token_ids # <-- these are the completion token IDs (no prefill) + + # --- 3) Parse the completion token IDs back into structured Harmony messages --- + entries = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT) + + # 'entries' is a sequence of structured conversation entries (assistant messages, tool calls, etc.). 
+        for message in entries:
+            print(json.dumps(message.to_dict(), indent=2))
+
+
+def generate_stream(**kwargs):
+    """
+    no chat instruct, generate token step by step with tiktoken(o200k_harmony) encode/decode
+    """
+    generator = TokenGenerator(
+        model_path=model_path, tensor_parallel_size=int(os.getenv("TP", "1"))
+    )
+    tokenizer = get_tokenizer()
+
+    temperature = kwargs.get("temperature", 0.6)
+    max_tokens = kwargs.get("max_tokens", 128)
+
+    for i in range(3):  # first runs warm up compiled kernels
+        prompt = kwargs.get("prompt", "什么是快乐星球?")
+        token_ids = tokenizer.encode(prompt)
+        print(f"prompt_token_ids: {token_ids}")
+
+        times = []
+        start = perf_counter()
+        text = ""
+        for token_id, logprob in generator.generate(
+            token_ids,
+            stop_tokens=[tokenizer.eot_token],
+            temperature=temperature,
+            max_tokens=max_tokens,
+            return_logprobs=True,
+        ):
+            token_ids.append(token_id)
+            decoded_token = tokenizer.decode(
+                [token_id]
+            )  # NOTE: decoding token-by-token can split multi-byte unicode; a streaming decoder is needed for clean output
+            times.append(perf_counter() - start)
+            text += decoded_token
+            print(f"Generated token: {repr(decoded_token)}, logprob: {logprob}")
+            start = perf_counter()
+
+        print(f"{i} generated Text: {text}")
+        print(f"{i} TTFT: {times[0]:.4f}s total time: {sum(times):.4f}s")
+
+
+def chat_stream(**kwargs):
+    """
+    use harmony chat format to generate token step by step
+    """
+
+    # --- 1) Render the prefill with Harmony ---
+    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+    reasoning_effort = kwargs.get("reasoning", "medium")
+    model_identity = kwargs.get(
+        "model_identity", "You are ChatGPT, a large language model trained by OpenAI."
+    )
+    convo = Conversation.from_messages(
+        [
+            Message.from_role_and_content(
+                Role.SYSTEM,
+                SystemContent.new()
+                .with_reasoning_effort(REASONING_EFFORT[reasoning_effort])
+                .with_model_identity(model_identity)
+                .with_conversation_start_date(datetime.datetime.now().strftime("%Y-%m-%d")),
+            ),
+            Message.from_role_and_content(
+                Role.DEVELOPER,
+                DeveloperContent.new().with_instructions("Always respond in riddles"),
+            ),
+            Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
+        ]
+    )
+
+    prompt_token_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
+
+    # Harmony stop tokens (pass to sampler so they won't be included in output)
+    stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+
+    parser = StreamableParser(encoding, role=Role.ASSISTANT)
+    generator = TokenGenerator(
+        model_path=model_path, tensor_parallel_size=int(os.getenv("TP", "1"))
+    )
+    tokenizer = get_tokenizer()
+    prompt_token = tokenizer.decode(prompt_token_ids)
+    print(f"{prompt_token=}")
+
+    field_created = False
+    current_output_text = ""
+    output_text_delta_buffer = ""
+    for predicted_token in generator.generate(prompt_token_ids, stop_tokens=stop_token_ids):
+        parser.process(predicted_token)
+
+        if parser.state == StreamState.EXPECT_START:
+            print("")  # new line
+            field_created = False
+
+        if not parser.last_content_delta:
+            continue
+
+        if not field_created:
+            field_created = True
+            if parser.current_channel == "final":
+                print(termcolor.colored("Assistant:", "green"), flush=True)
+            elif parser.current_recipient is not None:
+                print(
+                    termcolor.colored(f"Tool call to {parser.current_recipient}:", "cyan"),
+                    flush=True,
+                )
+            else:
+                print(termcolor.colored("CoT:", "yellow"), flush=True)
+
+        should_send_output_text_delta = True
+        output_text_delta_buffer += parser.last_content_delta
+        if should_send_output_text_delta:
+            print(output_text_delta_buffer, end="", flush=True)
+            current_output_text += output_text_delta_buffer
+            output_text_delta_buffer = ""
+    print(f"{current_output_text=}")
+
+    print(f"{parser.messages=}")
+
+
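+# With tensor parallelism every rank executes the generation loop, so rank 0
+# reads from the Modal queue and broadcasts the text to keep all ranks in lockstep.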
end="", flush=True) + current_output_text += output_text_delta_buffer + output_text_delta_buffer = "" + print(f"{current_output_text=}") + + print(f"{parser.messages=}") + + +async def get_user_input(q: modal.Queue): + rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 + if rank == 0: + # user_input = input() + user_input = await q.get.aio(partition="text") + else: + user_input = "" + user_input_list = [user_input] + if torch.distributed.is_initialized(): + torch.distributed.broadcast_object_list(user_input_list, 0) + return user_input_list[0] + + +async def chat_tool_stream(**kwargs): + generator = TokenGenerator( + model_path=model_path, tensor_parallel_size=int(os.getenv("TP", "1")) + ) + + encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + + reasoning_effort = kwargs.get("reasoning", "medium") + model_identity = kwargs.get( + "model_identity", "You are ChatGPT, a large language model trained by OpenAI." + ) + system_message_content = ( + SystemContent.new() + .with_reasoning_effort(REASONING_EFFORT[reasoning_effort]) + .with_model_identity(model_identity) + .with_conversation_start_date(datetime.datetime.now().strftime("%Y-%m-%d")) + ) + + build_in_tool = kwargs.get("build_in_tool", "browser") + + if build_in_tool == "browser": + backend = ExaBackend( + source="web", + ) + browser_tool = SimpleBrowserTool(backend=backend) + system_message_content = system_message_content.with_tools(browser_tool.tool_config) + + if build_in_tool == "python": + python_tool = PythonTool() + system_message_content = system_message_content.with_tools(python_tool.tool_config) + + system_message = Message.from_role_and_content(Role.SYSTEM, system_message_content) + messages = [system_message] + + is_apply_patch = kwargs.get("is_apply_patch", None) + developer_message = kwargs.get("developer_message", "") + if is_apply_patch: + apply_patch_instructions = Path(apply_patch.__file__).parent / "apply_patch.md" + developer_message = developer_message + "\n" if developer_message else "" + developer_message += apply_patch_instructions.read_text() + developer_message_content = ( + DeveloperContent.new() + .with_instructions(developer_message) + .with_function_tools( + [ + ToolDescription.new( + "apply_patch", + "Patch a file", + parameters={ + "type": "string", + "description": "Formatted patch code", + "default": "*** Begin Patch\n*** End Patch\n", + }, + ), + ] + ) + ) + messages.append(Message.from_role_and_content(Role.DEVELOPER, developer_message_content)) + else: + developer_message_content = DeveloperContent.new().with_instructions(developer_message) + messages.append(Message.from_role_and_content(Role.DEVELOPER, developer_message_content)) + + raw = kwargs.get("raw", None) + if raw: + conversation = Conversation.from_messages(messages) + tokens = encoding.render_conversation(conversation) + system_message = encoding.decode(tokens) + print(system_message, flush=True, end="") + empty_user_message_tokens = encoding.render(Message.from_role_and_content(Role.USER, "")) + user_message_start = encoding.decode(empty_user_message_tokens[:-1]) + user_message_end = encoding.decode(empty_user_message_tokens[-1:]) + else: + # System message + print(termcolor.colored("System Message:", "cyan"), flush=True) + print( + termcolor.colored("Model Identity:", "cyan"), + system_message_content.model_identity, + flush=True, + ) + print( + termcolor.colored("Reasoning Effort:", "cyan"), + system_message_content.reasoning_effort, + flush=True, + ) + print( + 
termcolor.colored("Conversation Start Date:", "cyan"), + system_message_content.conversation_start_date, + flush=True, + ) + print( + termcolor.colored("Knowledge Cutoff:", "cyan"), + system_message_content.knowledge_cutoff, + flush=True, + ) + print( + termcolor.colored("Browser Tool:", "cyan"), + "Enabled" if build_in_tool == "browser" else "Disabled", + flush=True, + ) + print( + termcolor.colored("Python Tool:", "cyan"), + "Enabled" if build_in_tool == "python" else "Disabled", + flush=True, + ) + print( + termcolor.colored("Apply Patch Function:", "cyan"), + "Enabled" if is_apply_patch else "Disabled", + flush=True, + ) + # Developer message + print(termcolor.colored("Developer Message:", "yellow"), flush=True) + print(developer_message_content.instructions, flush=True) + + # Print the system message and the user message start + MESSAGE_PADDING = 12 + q = kwargs.get("q", None) + while True: + last_message = messages[-1] + if last_message.recipient is None: + if raw: + print(user_message_start, end="", flush=True) + user_message = await get_user_input(q) + print(user_message_end, flush=True, end="") + else: + print(termcolor.colored("User:".ljust(MESSAGE_PADDING), "red"), flush=True) + user_message = await get_user_input(q) + user_message = Message.from_role_and_content(Role.USER, user_message) + messages.append(user_message) + else: + # Tool or function call + if last_message.recipient.startswith("browser."): + assert build_in_tool == "browser", "Browser tool is not enabled" + tool_name = "Search" + + async def run_tool(): + results = [] + async for msg in browser_tool.process(last_message): + results.append(msg) + return results + + result = await run_tool() + messages += result + elif last_message.recipient.startswith("python"): + assert build_in_tool == "python", "Python tool is not enabled" + tool_name = "Python" + + async def run_tool(): + results = [] + async for msg in python_tool.process(last_message): + results.append(msg) + return results + + result = await run_tool() + messages += result + elif last_message.recipient == "functions.apply_patch": + assert is_apply_patch, "Apply patch tool is not enabled" + tool_name = "Apply Patch" + text = last_message.content[0].text + tool_output = None + + if text.startswith("{"): + # this is json, try to extract the patch from it + import json + + try: + some_dict = json.loads(text) + _, text = some_dict.popitem() + except Exception as e: + tool_output = f"Error parsing JSON: {e}" + + if tool_output is None: + try: + tool_output = apply_patch.apply_patch(text) + except Exception as e: + tool_output = f"Error applying patch: {e}" + + message = Message( + author=Author.new(Role.TOOL, last_message.recipient), + content=[TextContent(text=tool_output)], + ).with_recipient("assistant") + if last_message.channel: + message = message.with_channel(last_message.channel) + + result = [message] + messages += result + else: + raise ValueError(f"Unknown tool or function call: {last_message.recipient}") + # Print the tool or function call result + if raw: + rendered_result = encoding.render_conversation(Conversation.from_messages(result)) + print(encoding.decode(rendered_result), flush=True, end="") + else: + print( + termcolor.colored(f"{tool_name} output:".ljust(MESSAGE_PADDING), "magenta"), + flush=True, + ) + show_browser_results = kwargs.get("show_browser_results", None) + if tool_name == "Search" and not show_browser_results: + print("[Search results fed to the model]") + else: + print(result[0].content[0].text) + + conversation = 
+        conversation = Conversation.from_messages(messages)
+        tokens = encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)
+
+        if raw:
+            # Print the last two tokens, which are the start of the assistant message
+            print(encoding.decode(tokens[-2:]), flush=True, end="")
+
+        parser = StreamableParser(encoding, role=Role.ASSISTANT)
+        field_created = False
+        current_output_text = ""
+        output_text_delta_buffer = ""
+        for predicted_token in generator.generate(
+            tokens, encoding.stop_tokens_for_assistant_actions()
+        ):
+            parser.process(predicted_token)
+            if raw:
+                print(encoding.decode([predicted_token]), end="", flush=True)
+                continue
+
+            if parser.state == StreamState.EXPECT_START:
+                print("")  # new line
+                field_created = False
+
+            if not parser.last_content_delta:
+                continue
+
+            if not field_created:
+                field_created = True
+                if parser.current_channel == "final":
+                    print(termcolor.colored("Assistant:", "green"), flush=True)
+                elif parser.current_recipient is not None:
+                    print(
+                        termcolor.colored(f"Tool call to {parser.current_recipient}:", "cyan"),
+                        flush=True,
+                    )
+                else:
+                    print(termcolor.colored("CoT:", "yellow"), flush=True)
+
+            should_send_output_text_delta = True
+            output_text_delta_buffer += parser.last_content_delta
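+            # Citation markers can arrive split across deltas; the browser branch
+            # below buffers output while normalize_citations reports a partial one.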
+            if build_in_tool == "browser":
+                updated_output_text, _annotations, has_partial_citations = (
+                    browser_tool.normalize_citations(current_output_text + output_text_delta_buffer)
+                )
+                output_text_delta_buffer = updated_output_text[len(current_output_text) :]
+                if has_partial_citations:
+                    should_send_output_text_delta = False
+            if should_send_output_text_delta:
+                print(output_text_delta_buffer, end="", flush=True)
+                current_output_text += output_text_delta_buffer
+                output_text_delta_buffer = ""
+
+        messages += parser.messages
+
+
+"""
+# download hf transformers weights (safetensors) for vllm to load
+modal run src/download_models.py --repo-ids "openai/gpt-oss-20b"
+modal run src/download_models.py --repo-ids "openai/gpt-oss-120b"
+
+# see help
+modal run src/llm/vllm/openai_gpt_oss.py --help
+
+
+# tokenizer
+modal run src/llm/vllm/openai_gpt_oss.py --task tokenizer
+modal run src/llm/vllm/openai_gpt_oss.py --task harmony_chat_tokenizer
+
+# generate
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task generate
+
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task generate_stream
+
+# chat
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_stream
+
+# local input --- queue --> remote to loop chat
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --show-browser-results
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --is-apply-patch
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --raw
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python --is-apply-patch
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python --raw
+"""
+
+
+def run_async_in_thread(coroutine, **kwargs):
+    async def wrapper():
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            await coroutine(**kwargs)
+        finally:
+            loop.close()
+
+    thread = threading.Thread(target=asyncio.run, args=(wrapper(),))
+    thread.start()
+    return thread
+
+
+async def loop_input(q: modal.Queue):
+    while True:
+        text = input()  # blocking input() is acceptable here: this coroutine runs on its own thread
+        print(f"local input: {text}")
+        await q.put.aio(text, partition="text")
+
+
+@app.local_entrypoint()
+def main(
+    task: str = "tokenizer",
+    reasoning: str = "medium",  # low medium high
+    model_identity: str = "You are ChatGPT, a large language model trained by OpenAI.",
+    prompt: str = "什么是快乐星球?",
+    temperature: float = 1.0,
+    top_p: float = 1.0,
+    max_tokens: int = 128,
+    # for chat_tool_stream
+    build_in_tool: str = "browser",  # build-in tools: browser,python
+    is_apply_patch: bool = False,  # Make apply_patch tool available to the model (default: False)
+    show_browser_results: bool = False,  # Show browser results (default: False)
+    developer_message: str = "",  # Developer message (default: )
+    raw: bool = False,  # Raw mode (does not render Harmony encoding) (default: False)
+):
+    print(task)
+    tasks = {
+        "tokenizer": tokenizer,
+        "harmony_chat_tokenizer": harmony_chat_tokenizer,
+        "generate": generate,
+        "generate_stream": generate_stream,
+        "chat_stream": chat_stream,
+        "chat_tool_stream": chat_tool_stream,
+    }
+    if task not in tasks:
+        raise ValueError(f"task {task} not found")
+    print(f"running task {task}")
+    with modal.Queue.ephemeral() as q:
+        if task == "chat_tool_stream":
+            run_async_in_thread(loop_input, q=q)
+
+        run.remote(
+            tasks[task],
+            reasoning=reasoning.lower(),
+            model_identity=model_identity,
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            build_in_tool=build_in_tool,
+            is_apply_patch=is_apply_patch,
+            show_browser_results=show_browser_results,
+            developer_message=developer_message,
+            raw=raw,
+            q=q,
+        )

From 790b0e71c81c55eccdb52f9d4fc427bd167164d2 Mon Sep 17 00:00:00 2001
From: weedge
Date: Thu, 7 Aug 2025 19:06:16 +0800
Subject: [PATCH 2/3] local input --- queue --> remote to loop chat

Signed-off-by: weedge
---
 .env.example                                |  1 +
 deploy/modal/src/llm/vllm/openai_gpt_oss.py | 41 ++++++++++++++++-----
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/.env.example b/.env.example
index 44faab7b..22386f4e 100644
--- a/.env.example
+++ b/.env.example
@@ -56,6 +56,7 @@ SEARCH1_API_KEY=
 SERPER_API_KEY=
 OPENWEATHERMAP_API_KEY=
 AMAP_MAPS_API_KEY=
+EXA_API_KEY=
 
 
 # turn server
diff --git a/deploy/modal/src/llm/vllm/openai_gpt_oss.py b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
index 9f70fad5..2c89fb2a 100644
--- a/deploy/modal/src/llm/vllm/openai_gpt_oss.py
+++ b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
@@ -120,6 +120,7 @@ def generate(
             prompt_tokens: list[int],
             stop_tokens: list[int] | None = None,
             temperature: float = 1.0,
+            top_p: float = 1.0,
             max_tokens: int = 0,
             return_logprobs: bool = False,
         ):
@@ -129,6 +130,7 @@
             self.request_id += 1
             sampling_params = SamplingParams(
                 temperature=temperature,
+                top_p=top_p,
                 max_tokens=max_tokens,
                 stop_token_ids=stop_tokens,
                 logprobs=0 if return_logprobs else None,
             )
@@ -169,6 +171,7 @@
         HF_MODEL_DIR: hf_model_vol,
         VLLM_CACHE_DIR: vllm_cache_vol,
     },
+    secrets=[modal.Secret.from_name("achatbot")],
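+    # the "achatbot" Modal secret is assumed to provide EXA_API_KEY (see
+    # .env.example) so the browser tool's Exa backend can authenticate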
     timeout=1200,  # default 300s
     scaledown_window=1200,
     max_containers=1,
 )
@@ -341,8 +344,9 @@ def generate_stream(**kwargs):
     )
     tokenizer = get_tokenizer()
 
-    temperature = kwargs.get("temperature", 0.6)
+    temperature = kwargs.get("temperature", 1.0)
     max_tokens = kwargs.get("max_tokens", 128)
+    top_p = kwargs.get("top_p", 1.0)
 
     for i in range(3):  # first runs warm up compiled kernels
         prompt = kwargs.get("prompt", "什么是快乐星球?")
@@ -356,6 +360,7 @@
             token_ids,
             stop_tokens=[tokenizer.eot_token],
            temperature=temperature,
+            top_p=top_p,
            max_tokens=max_tokens,
            return_logprobs=True,
        ):
@@ -417,7 +422,13 @@ def chat_stream(**kwargs):
     field_created = False
     current_output_text = ""
     output_text_delta_buffer = ""
-    for predicted_token in generator.generate(prompt_token_ids, stop_tokens=stop_token_ids):
+    for predicted_token in generator.generate(
+        prompt_token_ids,
+        stop_tokens=stop_token_ids,
+        temperature=kwargs.get("temperature", 1.0),
+        top_p=kwargs.get("top_p", 1.0),
+        max_tokens=kwargs.get("max_tokens", 128),
+    ):
         parser.process(predicted_token)
 
         if parser.state == StreamState.EXPECT_START:
@@ -649,7 +660,10 @@ async def run_tool():
                 result = [message]
                 messages += result
             else:
-                raise ValueError(f"Unknown tool or function call: {last_message.recipient}")
+                msg = f"Unknown tool or function call: {last_message=}"
+                print(msg)
+                # raise ValueError(f"Unknown tool or function call: {last_message.recipient}")
+                break  # result/tool_name are unset on this path; exit rather than fall through
+
             # Print the tool or function call result
             if raw:
                 rendered_result = encoding.render_conversation(Conversation.from_messages(result))
@@ -677,7 +691,11 @@ async def run_tool():
         current_output_text = ""
         output_text_delta_buffer = ""
         for predicted_token in generator.generate(
-            tokens, encoding.stop_tokens_for_assistant_actions()
+            tokens,
+            stop_tokens=encoding.stop_tokens_for_assistant_actions(),
+            temperature=kwargs.get("temperature", 1.0),
+            top_p=kwargs.get("top_p", 1.0),
+            max_tokens=kwargs.get("max_tokens", 2048),
         ):
             parser.process(predicted_token)
             if raw:
@@ -742,14 +760,19 @@ async def run_tool():
 IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_stream
 
 # local input --- queue --> remote to loop chat
+
+## use the browser tool (find, open, search); needs the EXA_API_KEY env var from https://exa.ai
 IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser
-IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --show-browser-results
-IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --is-apply-patch
-IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --raw
-IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream \
+  --max-tokens 2048 --temperature=1.0 --top-p=1.0 \
+  --build-in-tool browser --show-browser-results --model-identity "你是一名聊天助手,请用中文回复。"
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --is-apply-patch --show-browser-results
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool browser --raw --is-apply-patch --show-browser-results
+
+## the python tool runs scripts in a Docker container; swap PythonTool for a local-env or serverless backend if Docker is unavailable
 IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python
 IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python --is-apply-patch
-IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python --raw
+IMAGE_GPU=H100 modal run src/llm/vllm/openai_gpt_oss.py --task chat_tool_stream --build-in-tool python --raw --is-apply-patch
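+
+## tip: --raw prints the raw Harmony-formatted stream (channel/start/end markers included) instead of the parsed CoT/Assistant view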
 """
 
 

From 5c5d90b83a3077f1c7148b82aac9a9cfacb345d6 Mon Sep 17 00:00:00 2001
From: weedge
Date: Thu, 7 Aug 2025 21:23:06 +0800
Subject: [PATCH 3/3] change review code

Signed-off-by: weedge
---
 deploy/modal/src/llm/vllm/openai_gpt_oss.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/deploy/modal/src/llm/vllm/openai_gpt_oss.py b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
index 2c89fb2a..fb20c8c3 100644
--- a/deploy/modal/src/llm/vllm/openai_gpt_oss.py
+++ b/deploy/modal/src/llm/vllm/openai_gpt_oss.py
@@ -636,8 +636,6 @@ async def run_tool():
 
                 if text.startswith("{"):
                     # this is json, try to extract the patch from it
-                    import json
-
                     try:
                         some_dict = json.loads(text)
                         _, text = some_dict.popitem()
@@ -777,15 +775,10 @@ async def run_tool():
 
 
 def run_async_in_thread(coroutine, **kwargs):
-    async def wrapper():
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            await coroutine(**kwargs)
-        finally:
-            loop.close()
-
-    thread = threading.Thread(target=asyncio.run, args=(wrapper(),))
+    def run_coro_in_thread():
+        # asyncio.run spins up (and tears down) a fresh event loop inside this worker thread
+        asyncio.run(coroutine(**kwargs))
+
+    thread = threading.Thread(target=run_coro_in_thread)
     thread.start()
     return thread
 