From d5b7511a3c51937abf7b21402b826e28de58aabd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cihan=20Yal=C3=A7=C4=B1n?= <113353248+g-hano@users.noreply.github.com>
Date: Tue, 8 Oct 2024 03:21:35 +0300
Subject: [PATCH] MultiModal.HuggingFaceMultiModal: fix errors and README, add
 stream_complete (#16376)

fix imports
---
 .../README.md | 41 ++++++++++++++++---
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md
index 4b698974d5ea3..10dcaea5bee98 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md
@@ -35,7 +35,7 @@ Here's a basic example of how to use the Hugging Face multimodal integration:
 
 ```python
 from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModal
-from llama_index.schema import ImageDocument
+from llama_index.core.schema import ImageDocument
 
 # Initialize the model
 model = HuggingFaceMultiModal.from_model_name("Qwen/Qwen2-VL-2B-Instruct")
@@ -50,14 +50,45 @@ response = model.complete(prompt, image_documents=[image_document])
 print(response.text)
 ```
 
+### Streaming
+
+```python
+from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModal
+from llama_index.core.schema import ImageDocument
+
+# Initialize the model
+model = HuggingFaceMultiModal.from_model_name("Qwen/Qwen2-VL-2B-Instruct")
+
+# Prepare your image and prompt
+image_document = ImageDocument(image_path="downloaded_image.jpg")
+prompt = "Describe this image in detail."
+
+import nest_asyncio
+import asyncio
+
+nest_asyncio.apply()
+
+
+async def stream_output():
+    for chunk in model.stream_complete(
+        prompt, image_documents=[image_document]
+    ):
+        print(chunk.delta, end="", flush=True)
+        await asyncio.sleep(0)
+
+
+asyncio.run(stream_output())
+```
+
 You can also refer to this [Colab notebook](examples\huggingface_multimodal.ipynb)
 
 ## Supported Models
 
-1. Qwen2VisionMultiModal
-2. Florence2MultiModal
-3. Phi35VisionMultiModal
-4. PaliGemmaMultiModal
+1. Qwen2 Vision
+2. Florence2
+3. Phi3.5 Vision
+4. PaliGemma
+5. Mllama
 
 Each model has its unique capabilities and can be selected based on your specific use case.
 
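
A side note on the streaming example this patch adds: it consumes `stream_complete` with a plain `for` loop inside the async wrapper, which implies `stream_complete` is a synchronous generator; the `asyncio`/`nest_asyncio` scaffolding is only needed when interleaving the stream with other coroutines (e.g., in a notebook event loop). A minimal sketch of direct synchronous consumption under that assumption, reusing the same model, image path, and `chunk.delta` attribute as the example in the patch — illustrative only, not part of the committed README:

```python
from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModal
from llama_index.core.schema import ImageDocument

# Same setup as the README example in the patch above
model = HuggingFaceMultiModal.from_model_name("Qwen/Qwen2-VL-2B-Instruct")
image_document = ImageDocument(image_path="downloaded_image.jpg")

# Assumption: stream_complete is a synchronous generator, as implied by the
# plain `for` loop in the patch; each chunk's .delta holds only the new text.
for chunk in model.stream_complete(
    "Describe this image in detail.", image_documents=[image_document]
):
    print(chunk.delta, end="", flush=True)
print()  # final newline after the streamed output
```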