PaddlePaddle · nemonameless · Dec 18, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 8, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
diff --git a/0.sh b/0.sh
@@ -0,0 +1,2 @@
+# Run the mPLUG-Owl3 image inference example; GPU 7 is only the default when CUDA_VISIBLE_DEVICES is unset.
+CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES-7} python paddlemix/examples/mPLUG_Owl3/run_inference.py
diff --git a/build_env.sh b/build_env.sh
@@ -21,19 +21,19 @@ echo "开始安装 PaddleMIX 及其依赖..."
 
 # 安装 PaddleMIX
 echo "安装 PaddleMIX..."
-pip install -e .
+pip install -e . -i https://mirrors.aliyun.com/pypi/simple/
 
 # 安装 ppdiffusers
 echo "安装 ppdiffusers..."
 cd ppdiffusers
-pip install -e .
+pip install -e . -i https://mirrors.aliyun.com/pypi/simple/
 cd ..
 #注：ppdiffusers部分模型需要依赖 CUDA 11.2 及以上版本，如果本地机器不符合要求，建议前往 [AI Studio](https://aistudio.baidu.com/index) 进行模型训练、推理任务。
 #如果希望使用**bf16**训练推理，请使用支持**bf16**的GPU，如A100。
 
 # 安装依赖包
 echo "安装依赖包..."
-pip install -r requirements.txt
+pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
 
 # 安装自定义算子，非CUDA环境（例如昇腾环境）则跳过
 if command -v nvcc > /dev/null 2>&1; then

diff --git a/paddlemix/examples/mPLUG_Owl3/README.md b/paddlemix/examples/mPLUG_Owl3/README.md
@@ -0,0 +1,50 @@
+# mPLUG-Owl3
+
+## 1. 模型介绍
+
+**本仓库支持的模型权重:**
+
+| Model              |
+|--------------------|
+| mPLUG/mPLUG-Owl3-7B-241101  |
+<!-- | mPLUG/mPLUG-Owl3-1B-241014  |
+| mPLUG/mPLUG-Owl3-2B-241014  | -->
+
+注意：与huggingface权重同名，但权重为paddle框架的Tensor，使用`xxx.from_pretrained("mPLUG/mPLUG-Owl3-7B-241101")`即可自动下载该权重文件夹到缓存目录。
+
+
+## 2. 环境准备
+
+1）[安装 PaddleMIX 环境依赖包](https://github.com/PaddlePaddle/PaddleMIX/tree/develop?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
+
+2）pip install pillow tqdm paddlenlp==3.0.0b2
+
+注意：Python版本最好为3.10及以上版本。
+
+## 3. 快速开始
+
+### 推理
+```bash
+# 图片理解
+python paddlemix/examples/mPLUG_Owl3/run_inference.py
+
+# 视频理解
+python paddlemix/examples/mPLUG_Owl3/run_inference_video.py
+```
+
+### 效果展示
+
+
+
+### 参考文献
+```BibTeX
+@misc{ye2024mplugowl3longimagesequenceunderstanding,
+      title={mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models},
+      author={Jiabo Ye and Haiyang Xu and Haowei Liu and Anwen Hu and Ming Yan and Qi Qian and Ji Zhang and Fei Huang and Jingren Zhou},
+      year={2024},
+      eprint={2408.04840},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2408.04840},
+}
+```
diff --git a/paddlemix/examples/mPLUG_Owl3/requirement.txt b/paddlemix/examples/mPLUG_Owl3/requirement.txt
@@ -0,0 +1,3 @@
+pillow
+tqdm
+paddlenlp==3.0.0b2
diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference.py b/paddlemix/examples/mPLUG_Owl3/run_inference.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run single-image inference with mPLUG-Owl3 and print the generated text."""
+
+from PIL import Image
+import paddle
+from paddlenlp.transformers import Qwen2Tokenizer
+from paddlemix.models.mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config
+from paddlemix.models.mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model
+
+# Name of the Paddle-format weights; the README states that from_pretrained
+# downloads this folder to the cache directory automatically.
+model_path = 'mPLUG/mPLUG-Owl3-7B-241101'
+
+config = mPLUGOwl3Config.from_pretrained(model_path)
+model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval()
+tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
+processor = model.init_processor(tokenizer)
+
+image = Image.open("paddlemix/demo_images/examples_image1.jpg").convert("RGB")
+
+# The user turn carries an <|image|> tag followed by the question; the assistant turn is left empty.
+messages = [
+    {"role": "user", "content": """<|image|>Describe this image."""},
+    {"role": "assistant", "content": ""}
+]
+
+inputs = processor(messages, images=[image], videos=None)
+# Cast the image tensor to the dtype the model was loaded with (bfloat16).
+inputs['pixel_values'] = inputs['pixel_values'].cast(paddle.bfloat16)
+
+# generate() receives the processor outputs plus the decoding options below.
+inputs.update({
+    'tokenizer': tokenizer,
+    'max_new_tokens':100,
+    'decode_text':True,
+})
+
+g = model.generate(**inputs)
+print(g)
diff --git a/paddlemix/examples/mPLUG_Owl3/run_inference_video.py b/paddlemix/examples/mPLUG_Owl3/run_inference_video.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run video inference with mPLUG-Owl3 and print the generated text."""
+
+import paddle
+from decord import VideoReader, cpu    # pip install decord
+from paddlenlp.transformers import Qwen2Tokenizer
+from PIL import Image
+
+from paddlemix.models.mPLUGOwl3.configuration_mplugowl3 import mPLUGOwl3Config
+from paddlemix.models.mPLUGOwl3.modeling_mplugowl3 import mPLUGOwl3Model
+
+model_path = 'mPLUG/mPLUG-Owl3-7B-241101'
+
+# Maximum number of frames kept per video.
+MAX_NUM_FRAMES = 16
+
+
+def encode_video(video_path):
+    """Decode a video file into a list of PIL images.
+
+    Frames are first taken at a step of the rounded average FPS (about one
+    frame per second). If more than MAX_NUM_FRAMES remain, they are thinned
+    to MAX_NUM_FRAMES evenly spaced frames.
+
+    Args:
+        video_path: Path of the video file, opened with decord on the CPU.
+
+    Returns:
+        A list of PIL images, one per selected frame.
+    """
+    def uniform_sample(seq, n):
+        # Split seq into n equal bins and take the element at each bin centre.
+        gap = len(seq) / n
+        return [seq[int(i * gap + gap / 2)] for i in range(n)]
+
+    vr = VideoReader(video_path, ctx=cpu(0))
+    # Clamp to 1: a rounded FPS of 0 would make range() raise ValueError.
+    sample_fps = max(1, round(vr.get_avg_fps()))
+    frame_idx = list(range(0, len(vr), sample_fps))
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+    print('num frames:', len(frames))
+    return frames
+
+
+config = mPLUGOwl3Config.from_pretrained(model_path)
+print(config)
+model = mPLUGOwl3Model.from_pretrained(model_path, dtype=paddle.bfloat16).eval()
+tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
+processor = model.init_processor(tokenizer)
+
+messages = [
+    {"role": "user", "content": """<|video|>
+Describe this video."""},
+    {"role": "assistant", "content": ""}
+]
+
+# NOTE: absolute path outside the repository; replace it with a local video
+# file before running.
+videos = ['/nas-mmu-data/examples/car_room.mp4']
+
+video_frames = [encode_video(path) for path in videos]
+inputs = processor(messages, images=None, videos=video_frames)
+# Same cast as run_inference.py: match the bfloat16 dtype the model was loaded with.
+inputs['pixel_values'] = inputs['pixel_values'].cast(paddle.bfloat16)
+
+inputs.update({
+    'tokenizer': tokenizer,
+    'max_new_tokens':100,
+    'decode_text':True,
+})
+
+g = model.generate(**inputs)
+print(g)
diff --git a/paddlemix/models/mPLUGOwl3/__init__.py b/paddlemix/models/mPLUGOwl3/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_hyper_qwen2 import *
+from .configuration_mplugowl3 import *
+from .image_processing_mplugowl3 import *
+from .modeling_hyper_qwen2 import *
+from .modeling_mplugowl3 import *
+from .modeling_navit_siglip import *
+from .processing_mplugowl3 import *
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		CUDA_VISIBLE_DEVICES=7 python paddlemix/examples/mPLUG_Owl3/run_inference.py