1 change: 1 addition & 0 deletions lightllm/models/__init__.py
@@ -27,6 +27,7 @@
InternVLDeepSeek2TpPartModel,
)
from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
from lightllm.models.interns1.model import InternS1Qwen3MOETpPartModel
from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
from lightllm.models.gemma3.model import Gemma3TpPartModel
639 changes: 639 additions & 0 deletions lightllm/models/interns1/interns1_visual.py

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions lightllm/models/interns1/layer_weights/pre_and_post_layer_weight.py
@@ -0,0 +1,28 @@
import torch
import numpy as np
from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight



# add alias keys: model.language_model.xxx -> model.xxx
# only rename keys when loading PreAndPostLayerWeight; TransformerLayerWeight keys are already correct
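# e.g. "model.language_model.embed_tokens.weight" -> "model.embed_tokens.weight"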
def rename_weight_keys(weights):
prefix = "model.language_model."
keys = list(weights.keys())
for k in keys:
        if k.startswith(prefix):
weights["model." + k[len(prefix) :]] = weights[k]


class InternS1PreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
def __init__(self, data_type, network_config, mode):
super().__init__(data_type, network_config, mode)
return

def load_hf_weights(self, weights):
rename_weight_keys(weights)
super().load_hf_weights(weights)
return



127 changes: 127 additions & 0 deletions lightllm/models/interns1/model.py
@@ -0,0 +1,127 @@
import os
import json
from lightllm.models.registry import ModelRegistry, llm_model_type_is
from lightllm.common.basemodel.multimodal_tokenizer import BaseMultiModalTokenizer
from lightllm.common.build_utils import repair_config
from lightllm.server.core.objs import SamplingParams
from lightllm.server.multimodal_params import AudioItem, MultimodalParams, ImageItem
from lightllm.models.qwen3_moe.model import Qwen3MOEModel
from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
from lightllm.models.interns1.layer_weights.pre_and_post_layer_weight import (
InternS1PreAndPostLayerWeight,
)


IMG_START_TOKEN = "<img>"
IMG_END_TOKEN = "</img>"
IMG_TOKEN = "<IMG_CONTEXT>"



# Wrapper around the original tokenizer
class InternS1Tokenizer(BaseMultiModalTokenizer):
def __init__(self, tokenizer, model_cfg, **kwargs):
super().__init__(tokenizer)
self.llm_model_type = model_cfg.get("text_config").get("model_type")
self.image_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))

self.image_start_tag = IMG_START_TOKEN
self.image_start_id = tokenizer.convert_tokens_to_ids(self.image_start_tag)

self.image_end_tag = IMG_END_TOKEN
self.image_end_id = tokenizer.convert_tokens_to_ids(self.image_end_tag)


def init_imageitem_extral_params(
self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
):
        img.extra_params["image_patch_max_num"] = 12  # ugly hard-coded value; change later

Severity: medium

The value 12 for image_patch_max_num is hardcoded. The comment # 好丑的写法,后面改动 (ugly way of writing, change later) indicates this is a temporary solution. This value should be made configurable, for example by reading it from the model configuration in the __init__ method and storing it as an instance attribute.
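
A minimal sketch of that suggestion, assuming the patch limit lives in the model config under a hypothetical `max_dynamic_patch` key inside `vision_config` (the real key name may differ; only the relevant parts of the class are shown):

```python
class InternS1Tokenizer(BaseMultiModalTokenizer):
    def __init__(self, tokenizer, model_cfg, **kwargs):
        super().__init__(tokenizer)
        # read the patch limit from the config instead of hardcoding it;
        # "max_dynamic_patch" is an assumed key, 12 remains the fallback
        vision_cfg = model_cfg.get("vision_config") or {}
        self.image_patch_max_num = int(vision_cfg.get("max_dynamic_patch", 12))

    def init_imageitem_extral_params(
        self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
    ):
        img.extra_params["image_patch_max_num"] = self.image_patch_max_num
        return
```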

return

def init_audioitem_extral_params(
self, audio: AudioItem, multi_params: MultimodalParams, sampling_params: SamplingParams
):
return

def get_image_token_length(self, img: ImageItem):
return self.image_length

def get_audio_token_length(self, audio: AudioItem):
return

# only change the impl of the encode func:
def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
# TEXT<IMG_CONTEXT>TEXT<IMG_CONTEXT>TEXT --> TEXT<img></img>TEXT<img></img>TEXT
image_tokens = IMG_START_TOKEN + IMG_END_TOKEN
if multimodal_params is None:
add_special_tokens = kwargs.get("add_special_tokens", True)
return self.tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
image_count = len(multimodal_params.images)
prompt = prompt.replace(IMG_TOKEN, image_tokens, image_count)
        origin_ids = self.tokenizer.encode(prompt, add_special_tokens=kwargs.get("add_special_tokens", True))

# print("[debug] prompt: ", prompt)
# print("[debug] origin_ids: ", origin_ids)
# import copy
# origin_ids_ = copy.deepcopy(origin_ids)

Comment on lines +63 to +67

Severity: medium

Leftover debug code (e.g., print statements, import copy) should be removed before merging to keep the codebase clean.

# <img></img> --> <img>id,id+1...id+num</img>
input_ids = []
image_id = 0
start_idx = 0
while True:
try:
start_idx = origin_ids.index(self.image_start_id, start_idx)
if start_idx + 1 >= len(origin_ids):
break
if origin_ids[start_idx + 1] == self.image_end_id:
input_ids.extend(origin_ids[: start_idx + 1])
token_id = multimodal_params.images[image_id].token_id
token_num = multimodal_params.images[image_id].token_num
input_ids.extend(range(token_id, token_id + token_num))
input_ids.append(self.image_end_id)
origin_ids = origin_ids[start_idx + 2 :]
start_idx = 0
image_id += 1
else:
raise ValueError("image token error")
except ValueError:
break
input_ids.extend(origin_ids[start_idx:])

# print("[debug] input_ids: ", input_ids)
# data = {
# "origin_ids": origin_ids_,
# "input_ids": input_ids
# }
# with open("input_ids_lightllm.json", "w") as f:
# json.dump(data, f)
Comment on lines +92 to +98

Severity: medium

Leftover debug code (e.g., print statements, json.dump) should be removed before merging to keep the codebase clean.


return input_ids
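
For illustration, a standalone sketch of the expansion the loop above performs, using made-up token ids (the real special-token ids and the reserved image-token range come from the tokenizer and the corresponding ImageItem):

```python
# hypothetical ids for <img>, </img>, and the range reserved for one image's embeddings
image_start_id, image_end_id = 151665, 151666
token_id, token_num = 9000, 4

# "TEXT <img></img> TEXT" after tokenization
origin_ids = [100, 200, image_start_id, image_end_id, 300]

start = origin_ids.index(image_start_id)
expanded = (
    origin_ids[: start + 1]
    + list(range(token_id, token_id + token_num))
    + origin_ids[start + 1 :]
)
print(expanded)  # [100, 200, 151665, 9000, 9001, 9002, 9003, 151666, 300]
```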



@ModelRegistry(["interns1"], is_multimodal=True, condition=llm_model_type_is("qwen3_moe"))
class InternS1Qwen3MOETpPartModel(Qwen3MOEModel):
# weight class
pre_and_post_weight_class = InternS1PreAndPostLayerWeight

# infer class
pre_layer_infer_class = LlamaMultimodalPreLayerInfer

def __init__(self, kvargs):
super().__init__(kvargs)
return

def _init_config(self):
with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
self.config = json.load(json_file)["text_config"]
# rename keys
repair_config(self.config, same_names=["num_attention_heads", "n_head"])
repair_config(self.config, same_names=["hidden_size", "n_embd", "n_embed"])
repair_config(self.config, same_names=["num_hidden_layers", "n_layer"])
if self.finetune_config:
self.config["vocab_size"] = self.finetune_config.vocab_size
return


3 changes: 3 additions & 0 deletions lightllm/server/tokenizer.py
@@ -29,6 +29,7 @@
from ..models.qwen_vl.model import QWenVLTokenizer
from ..models.qwen2_vl.model import QWen2VLTokenizer
from ..models.internvl.model import InternvlTokenizer
from ..models.interns1.model import InternS1Tokenizer
from ..models.gemma3.model import Gemma3Tokenizer

# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
@@ -94,6 +95,8 @@ def get_tokenizer(
)
elif model_type == "internvl_chat":
tokenizer = InternvlTokenizer(tokenizer, model_cfg, weight_dir=tokenizer_name)
elif model_type == "interns1":
tokenizer = InternS1Tokenizer(tokenizer, model_cfg, weight_dir=tokenizer_name)
elif model_type == "gemma3":
tokenizer = Gemma3Tokenizer(tokenizer, model_cfg)

4 changes: 3 additions & 1 deletion lightllm/server/visualserver/model_infer/model_rpc.py
@@ -8,9 +8,9 @@
from transformers.configuration_utils import PretrainedConfig
from rpyc.utils.classic import obtain
from rpyc.utils.server import ThreadedServer
from lightllm.models.interns1.interns1_visual import InternS1VisionTransformer
from lightllm.models.qwen_vl.qwen_visual import QWenVisionTransformer
from lightllm.models.llava.llava_visual import LlavaVisionModel
from lightllm.models.internvl.internvl_visual import InternVLVisionModel
from lightllm.models.gemma3.gemma3_visual import Gemma3VisionModel
from lightllm.models.vit.model import VisionTransformer
from lightllm.server.multimodal_params import MultimodalParams, ImageItem
@@ -72,6 +72,8 @@ def exposed_init_model(self, kvargs):
# self.model = InternVLVisionModel()
elif self.model_type == "gemma3":
self.model = Gemma3VisionModel()
elif self.model_type == "interns1":
self.model = InternS1VisionTransformer()
else:
raise Exception(f"can not support {self.model_type} now")

3 changes: 3 additions & 0 deletions lightllm/utils/config_utils.py
@@ -47,6 +47,9 @@ def get_vocab_size(model_path: str):
if "llm_config" in config_json:
vocab_size = int(config_json["llm_config"]["vocab_size"])
return vocab_size
elif "text_config" in config_json:
vocab_size = int(config_json["text_config"]["vocab_size"])
return vocab_size
vocab_size = config_json["vocab_size"]
if not isinstance(vocab_size, int):
vocab_size = int(vocab_size)
9 changes: 9 additions & 0 deletions test/test_api/test_multimodal_api.py
@@ -54,6 +54,15 @@
"Please describe it.\n"
"<|im_end|><|im_start|>assistant\n"
),
"interns1": (
"<|im_start|>system\n"
"You are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.<|im_end|>"
"<|im_start|>user\n"
"<IMG_CONTEXT>\n"
"Please describe the image explicitly.<|im_end|>"
"<|im_start|>assistant\n"
"<think>\n"
),
"qwen_vl": (
"<|im_start|>system\n"
"You are a helpful assistant.\n"