Commit de8d7e5

nemonameless committed Dec 18, 2024
2 parents a71f200 + 45d4bc2
Showing 17 changed files with 149 additions and 273 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -49,6 +49,10 @@

<!-- 📚 "PaddleMIX 2.1, PaddlePaddle's multimodal large-model development suite, is officially released": full coverage of image, text, audio, and video scenarios, with multimodality efficiently powering industry innovation. Supports ultra-large-scale training, covering image-text pretraining, text-to-image generation, and cross-modal vision tasks, spanning industry scenarios such as finance, education, e-commerce, and healthcare. On August 8 (Thursday) at 20:00, a livestream will cover the latest multimodal large-model architectures, give a deep dive into the high-performance PaddleMIX model library, and demonstrate the full LLaVA training and inference workflow step by step. [Registration link](https://www.wjx.top/vm/wKqysjx.aspx?udsid=449688) -->

**🎉 2024.12.17 Added support for [GOT-OCR2_0](./paddlemix/examples/GOT_OCR_2_0) inference and training**

**🎉 2024.12.17 Added support for [InternVL2_5 (1B, 2B, 4B, 8B)](./paddlemix/examples/internvl2) inference**

**🎉 2024.11.27 Added support for [Janus/JanusFlow](./paddlemix/examples/janus) inference**

**🎉 2024.11.21 Added support for [MiniCPM-V-2_6](./paddlemix/examples/minicpm-v-2_6) inference**
1 change: 1 addition & 0 deletions README_EN.md
@@ -48,6 +48,7 @@ Welcome your submissions!

## 📣 Latest Developments

**🎉 2024.12.17 Support for [InternVL2_5 (1B, 2B, 4B, 8B)](./paddlemix/examples/internvl2) inference**

**🎉 2024.11.27 Added support for [Janus/JanusFlow](./paddlemix/examples/janus) inference**

38 changes: 37 additions & 1 deletion paddlemix/examples/GOT_OCR_2_0/README.md
@@ -2,7 +2,15 @@

## 1. Model Introduction

[GOT-OCR2.0](https://arxiv.org/abs/2409.01704) is a groundbreaking general-purpose OCR model designed to overcome the limitations of traditional OCR systems (OCR-1.0) and of current large vision-language models (LVLMs) on OCR tasks. This repository provides the Paddle version of the `GOT-OCR2.0` model.
[GOT-OCR2.0](https://arxiv.org/abs/2409.01704) is a 0.6B-parameter multimodal large model for general OCR tasks, developed by StepFun and the University of Chinese Academy of Sciences. It is a groundbreaking general-purpose multimodal OCR model designed to overcome the limitations of traditional OCR systems (OCR-1.0) and of current large vision-language models (LVLMs) on OCR tasks.

**Model weights supported by this repository:**

| Model |
|--------------------|
| stepfun-ai/GOT-OCR2_0 |

Note: the name matches the Hugging Face weights, but the weights themselves are Paddle-framework tensors. Calling `xxx.from_pretrained("stepfun-ai/GOT-OCR2_0")` automatically downloads the weight folder to the cache directory.
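
A minimal loading sketch in Python, using the classes imported by this repository's inference script (the `dtype` value here is an illustrative assumption):

```python
from paddlenlp.transformers import QWenTokenizer

from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM

# The first call downloads the Paddle-format weight folder to the local cache.
tokenizer = QWenTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0")
model = GOTQwenForCausalLM.from_pretrained("stepfun-ai/GOT-OCR2_0", dtype="bfloat16")
```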


## 2. Environment Requirements
@@ -36,11 +44,39 @@ python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
--ocr_type format \
```

### 3.3. multi_crop plain texts OCR:
```bash
python paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py \
--model_name_or_path stepfun-ai/GOT-OCR2_0 \
--image_file paddlemix/demo_images/hospital.jpeg \
--ocr_type ocr \
    --multi_crop
```

## 4. Training

As in the [official GitHub repository](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/?tab=readme-ov-file#train), only post-training (stage-2/stage-3) on top of the GOT weights is currently supported: stage-2 is full-parameter fine-tuning, while stage-3 fine-tunes with the vision encoder frozen. The default training mode is stage-2 full-parameter fine-tuning, which takes about 10 GB of GPU memory per card.

### Dataset Download
The PaddleMIX team provides a modified SynthDoG-EN dataset whose original questions have been uniformly changed to ```<image>\nOCR:```. Download link:
```
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/synthdog_en.tar # 2.4G
```
synthdog_en.tar contains the images folder and the annotation JSON file; download and extract it (or create a symlink) under the PaddleMIX/ directory.
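
After extraction, PaddleMIX/synthdog_en/ should contain the images alongside the annotation file synthdog_en_29765_ocr.json, matching the paths registered in `paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json` (updated in this commit, see below).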

### Dataset Format

See the [official example](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/blob/main/assets/train_sample.jpg): the question is uniformly ```<image>\nOCR:``` and the answer is the corresponding OCR result.
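
As a purely hypothetical sketch of what one annotation entry might look like (assuming the LLaVA-style conversation schema commonly used for such datasets; all field names here are illustrative and not confirmed by this repository):

```python
# Hypothetical annotation entry; field names are illustrative only.
sample = {
    "image": "images/00000001.jpg",  # path relative to the images folder
    "conversations": [
        {"from": "human", "value": "<image>\nOCR:"},  # the unified question
        {"from": "gpt", "value": "recognized text of the page..."},  # the OCR answer
    ],
}
```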


### Training Command

```bash
sh paddlemix/examples/GOT_OCR_2_0/run_train.sh
```

Note: the default training mode is stage-2 full-parameter fine-tuning, which takes about 10 GB of GPU memory per card. You can also set ```--freeze_vision_tower True``` to fine-tune with the vision encoder frozen (stage-3).
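
Because `run_train.sh` reads `GPUS`, `BATCH_SIZE`, and `PER_DEVICE_BATCH_SIZE` from the environment (defaulting to 8, 32, and 1, with gradient accumulation derived as `BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS`), these can be overridden inline, e.g. `GPUS=4 BATCH_SIZE=16 sh paddlemix/examples/GOT_OCR_2_0/run_train.sh`.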


## References
```BibTeX
2 changes: 1 addition & 1 deletion paddlemix/examples/GOT_OCR_2_0/configs/demo_dataset.json
@@ -1,6 +1,6 @@
{
"synthdog_en": {
"images": "synthdog_en/",
"annotations": "synthdog_en/synthdog_en_29765_ocr_1k.json"
"annotations": "synthdog_en/synthdog_en_29765_ocr.json"
}
}
44 changes: 8 additions & 36 deletions paddlemix/examples/GOT_OCR_2_0/got_ocr2_0_infer.py
@@ -13,12 +13,16 @@
# limitations under the License.

import argparse

import paddle
from paddlenlp.transformers import QWenTokenizer

from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM

parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default="stepfun-ai/GOT-OCR2_0", help="pretrained ckpt and tokenizer")
parser.add_argument(
"--model_name_or_path", type=str, default="stepfun-ai/GOT-OCR2_0", help="pretrained ckpt and tokenizer"
)
parser.add_argument("--image_file", type=str, default="paddlemix/demo_images/hospital.jpeg")
parser.add_argument("--multi_crop", action="store_true")
parser.add_argument("--ocr_type", type=str, default="plain", choices=["ocr", "format"])
@@ -38,41 +42,9 @@
with paddle.no_grad():
if args.multi_crop:
# multi-crop OCR:
res = model.chat_crop(
tokenizer, image_file, ocr_type=args.ocr_type, render=args.render, save_render_file="./demo.html"
)
res = model.chat_crop(tokenizer, image_file, ocr_type=args.ocr_type)
else:
# plain texts OCR
# format texts OCR
# fine-grained OCR
# render the formatted OCR results
res = model.chat(
tokenizer,
image_file,
ocr_type=args.ocr_type,
ocr_box=args.box,
ocr_color=args.color,
render=args.render,
save_render_file="./demo.html",
)

# plain texts OCR
# res = model.chat(tokenizer, image_file, ocr_type='ocr')

# format texts OCR:
# res = model.chat(tokenizer, image_file, ocr_type='format')

# fine-grained OCR:
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_color='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_color='')

# multi-crop OCR:
# res = model.chat_crop(tokenizer, image_file, ocr_type='ocr')
# res = model.chat_crop(tokenizer, image_file, ocr_type='format')

# render the formatted OCR results:
# res = model.chat(tokenizer, image_file, ocr_type='format', render=True, save_render_file = './demo.html')

print(res)
res = model.chat(tokenizer, image_file, ocr_type=args.ocr_type)
print("output:\n", res)
6 changes: 3 additions & 3 deletions paddlemix/examples/GOT_OCR_2_0/run_train.sh
@@ -15,7 +15,7 @@
set -x

GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-8}
BATCH_SIZE=${BATCH_SIZE:-32}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}

GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
@@ -28,8 +28,6 @@ export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/got_ocr_20'

# meta='pdf-ocr+scence'

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi
@@ -38,6 +36,8 @@ TRAINING_MODEL_RESUME="None"
TRAINER_INSTANCES='127.0.0.1'
MASTER='127.0.0.1:8080'

# --freeze_vision_tower False \ # True for stage3

TRAINING_PYTHON="python -m paddle.distributed.launch --master ${MASTER} --nnodes 1 --nproc_per_node ${GPUS} --rank 0 --ips ${TRAINER_INSTANCES} --run_mode=collective"
${TRAINING_PYTHON} --log_dir ${OUTPUT_DIR}/paddle_distributed_logs \
paddlemix/examples/GOT_OCR_2_0/train_GOT.py \
77 changes: 32 additions & 45 deletions paddlemix/examples/GOT_OCR_2_0/train_GOT.py
@@ -14,22 +14,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
import paddle.distributed as dist
import paddle
import paddlenlp
from paddlemix.datasets.got_dataset import make_supervised_data_module
from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM
from paddlenlp.trainer.trainer_utils import get_last_checkpoint
from dataclasses import dataclass, field
from typing import Optional

from paddlemix.models.GOT.utils.utils import smart_tokenizer_and_embedding_resize
import paddle
import paddle.distributed as dist
from paddlenlp.trainer import PdArgumentParser, TrainingArguments, set_seed
from paddlenlp.trainer.trainer import Trainer
from dataclasses import dataclass, field
from typing import Dict, Optional
from paddlenlp.trainer.trainer_utils import get_last_checkpoint
from paddlenlp.transformers import QWenTokenizer
import logging

from paddlemix.datasets.got_dataset import make_supervised_data_module
from paddlemix.models.GOT.GOT_ocr_2_0 import GOTQwenForCausalLM
from paddlemix.models.GOT.utils.utils import smart_tokenizer_and_embedding_resize

logger = logging.getLogger(__name__)


@@ -57,8 +58,8 @@ class ModelArguments:
vision_tower: Optional[str] = field(default="openai/clip-vit-large-patch14")
freeze_vision_tower: bool = field(default=False)
freeze_lm_model: bool = field(default=False)
pretrained_stage1_model: Optional[str] = field(default=None) # mlp &/ vision tower
vision_select_layer: Optional[int] = field(default=-1) # default to the last layer
pretrained_stage1_model: Optional[str] = field(default=None) # mlp &/ vision tower
vision_select_layer: Optional[int] = field(default=-1) # default to the last layer
use_im_start_end: bool = field(default=False)


@@ -71,14 +72,14 @@ class DataArguments:
)
sep_image_conv_front: bool = False
image_token_len: int = 256
image_aspect_ratio: str = 'square'
conversation_version: str = 'mpt'
image_aspect_ratio: str = "square"
conversation_version: str = "mpt"
box_limit: int = 0
max_seq_length: int = 8192


@dataclass
class TrainingArguments(paddlenlp.trainer.TrainingArguments):
class GOTTrainingArguments(TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
remove_unused_columns: bool = field(default=False)
@@ -87,10 +88,7 @@ class TrainingArguments(paddlenlp.trainer.TrainingArguments):
with_box: bool = field(default=False)
model_max_length: int = field(
default=512,
metadata={
"help":
"Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
)
lora_enable: bool = False
lora_r: int = 8
@@ -101,9 +99,7 @@ class TrainingArguments(paddlenlp.trainer.TrainingArguments):


def train():
# parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
# model_args, data_args, training_args = parser.parse_args_into_dataclasses()
parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments))
parser = PdArgumentParser((ModelArguments, DataArguments, GOTTrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script, and it's the path to a json file,
# let's parse it to get our arguments.
@@ -147,19 +143,17 @@ def train():
print(f"Loading Tokenizer: {tokenizer_path}")

tokenizer = QWenTokenizer.from_pretrained(
model_args.model_name_or_path,
padding_side="right",
model_max_length=training_args.model_max_length)
model_args.model_name_or_path, padding_side="right", model_max_length=training_args.model_max_length
)
print("tokenizer", tokenizer)
print("len(tokenizer)", len(tokenizer))
print("tokenizer.added_tokens_encoder", tokenizer.added_tokens_encoder)
print("tokenizer.added_tokens_decoder", tokenizer.added_tokens_decoder)
# print("len(tokenizer)", len(tokenizer))
# print("tokenizer.added_tokens_encoder", tokenizer.added_tokens_encoder)
# print("tokenizer.added_tokens_decoder", tokenizer.added_tokens_decoder)

model = GOTQwenForCausalLM.from_pretrained(
model_args.model_name_or_path, dtype=dtype)
model = GOTQwenForCausalLM.from_pretrained(model_args.model_name_or_path, dtype=dtype)

smart_tokenizer_and_embedding_resize(
special_tokens_dict=dict(pad_token='<|endoftext|>'),
special_tokens_dict=dict(pad_token="<|endoftext|>"),
tokenizer=tokenizer,
model=model,
)
@@ -174,16 +168,15 @@
)

model.initialize_vision_tokenizer(
tokenizer=tokenizer,
freeze_lm_model=model_args.freeze_lm_model,
tokenizer=tokenizer,
freeze_lm_model=model_args.freeze_lm_model,
pretrained_stage1_model=model_args.pretrained_stage1_model,
)

# 'image_processor_high
# data_args.image_token_len = vision_tower_dict['image_token_len']
data_args.image_token_len = 256
data_args.image_processor = vision_tower_dict['image_processor']
data_args.image_processor_high = vision_tower_dict['image_processor_high']
data_args.image_processor = vision_tower_dict["image_processor"]
data_args.image_processor_high = vision_tower_dict["image_processor_high"]
data_args.use_im_start_end = model_args.use_im_start_end

def _freeze_params(module):
@@ -199,11 +192,9 @@ def _freeze_params(module):
if model_args.freeze_vision_tower:
_freeze_params(model.qwen2.vision_tower_high)

# params_grad = [p.numel() for n, p in model.named_parameters() if p.requires_grad]
# print(f"Number of Mapping Trainable Parameters: {sum(params_grad) / (1 << 20):.2f} M")
print_trainable_params(model)
# trainable params: 464959488 || all params: 560528640 || trainable%: 82.9502

# trainable params: 464959488 || all params: 560528640 || trainable%: 82.9502 # stage3
# trainable params: 560528640 || all params: 560528640 || trainable%: 100 # stage2
params_grad = [p.numel() for n, p in model.named_parameters() if not p.stop_gradient]
print(f"Number of Mapping Trainable Parameters: {sum(params_grad) / (1 << 20):.2f} M")

@@ -217,13 +208,9 @@ def _freeze_params(module):
set_seed(training_args.seed)

data_module = make_supervised_data_module(
interleave=training_args.interleave,
with_box=training_args.with_box,
tokenizer=tokenizer,
data_args=data_args
interleave=training_args.interleave, with_box=training_args.with_box, tokenizer=tokenizer, data_args=data_args
)

#trainer = GOTTrainer(
trainer = Trainer(
model=model,
args=training_args,
4 changes: 4 additions & 0 deletions paddlemix/examples/internvl2/README.md
@@ -26,8 +26,12 @@
| Model |
|--------------------|
| OpenGVLab/InternVL2-1B |
| OpenGVLab/InternVL2_5-1B |
| OpenGVLab/InternVL2-2B |
| OpenGVLab/InternVL2_5-2B |
| OpenGVLab/InternVL2_5-4B |
| OpenGVLab/InternVL2-8B |
| OpenGVLab/InternVL2_5-8B |
| OpenGVLab/InternVL2-26B |
| OpenGVLab/InternVL2-40B |
| OpenGVLab/InternVL2-8B-MPO |
9 changes: 5 additions & 4 deletions paddlemix/examples/internvl2/chat_demo.py
@@ -99,12 +99,14 @@ def load_tokenizer(model_path):
import re

match = re.search(r"\d+B", model_path)
model2_5 = "InternVL2_5" in model_path
if match:
model_size = match.group()
else:
model_size = "2B"

if model_size in ["1B"]:
if model2_5 and model_size in ["1B", "4B"]:
tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
elif model_size in ["1B"]:
tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
elif model_size in ["2B", "8B", "26B"]:
tokenizer = InternLM2Tokenizer.from_pretrained(model_path)
@@ -135,8 +137,7 @@ def main(args):
print("len(tokenizer): ", len(tokenizer))

model = InternVLChatModel.from_pretrained(MODEL_PATH, dtype=args.dtype).eval()

generation_config = dict(max_new_tokens=1024, do_sample=False)
generation_config = dict(max_new_tokens=1024, do_sample=False, top_p=0.01)

with paddle.no_grad():
response, history = model.chat(