4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -94,13 +94,13 @@ steps:
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py
- pytest -v -s models -m \"not llava\"

- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- pytest -v -s models -m llava

- label: Prefix Caching Test
mirror_hardwares: [amd]
28 changes: 28 additions & 0 deletions docs/source/dev/input_processing/model_inputs_index.rst
@@ -0,0 +1,28 @@
Input Processing
================

.. currentmodule:: vllm.inputs

vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
in :class:`~vllm.LLMEngine` before they are passed to model executors.
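
As a rough illustration, an engine-level input can be pictured as a token-ID prompt plus optional extras.
This is a sketch only: the authoritative field set is the :class:`LLMInputs` class documented below, and the values shown here are placeholders.

.. code-block:: python

# Illustrative sketch: the field names follow the LLMInputs class
# documented below; the values are placeholders.
from vllm.inputs import LLMInputs

inputs = LLMInputs(
    prompt_token_ids=[1, 2, 3, 4],  # token IDs passed to the model
    prompt="Hello, world!",         # original text prompt, if any
    multi_modal_data=None,          # optional multi-modal payload
)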

.. contents::
:local:
:backlinks: none

Module Contents
+++++++++++++++

LLM Engine Inputs
-----------------

.. autoclass:: vllm.inputs.LLMInputs
:members:
:show-inheritance:

Registry
--------

.. automodule:: vllm.inputs.registry
:members:
:show-inheritance:
94 changes: 94 additions & 0 deletions docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -0,0 +1,94 @@
.. _adding_a_new_multimodal_model:

Adding a New Multimodal Model
=============================

This document provides a high-level guide on integrating a :ref:`multimodal model <multi_modality>` into vLLM.

.. note::
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

.. tip::
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
We will be happy to help you out!


0. Set up a base vLLM model
---------------------------

Follow :ref:`these steps <adding_a_new_model>` to first implement the model in vLLM.
While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
for each input tensor that corresponds to a multi-modal input, as shown in the following example:

.. code-block:: diff

  def forward(
      self,
      input_ids: torch.Tensor,
      positions: torch.Tensor,
      kv_caches: List[torch.Tensor],
      attn_metadata: AttentionMetadata,
+     pixel_values: torch.Tensor,
  ) -> SamplerOutput:

.. note::
The model class does not have to be named :code:`*ForCausalLM`.
Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.


1. Register input mappers
-------------------------

For each modality type to support, decorate the model class with :meth:`vllm.INPUT_REGISTRY.MULTIMODAL.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.

.. code-block:: diff

+ from vllm.inputs import INPUT_REGISTRY

+ @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+ @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
class YourModelForImage2Seq(nn.Module):

A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
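
As a concrete sketch, a hand-rolled pixel-value mapper might look like the following.
The mapper signature and return convention shown here are assumptions for illustration only; refer to :class:`~vllm.multimodal.MultiModalRegistry` for the actual interface.

.. code-block:: python

# Hypothetical sketch: the mapper signature and return convention are
# assumptions, not the definitive vLLM interface.
import torch
import torch.nn as nn

from vllm.inputs import INPUT_REGISTRY

def map_pixel_input(data):
    """Convert raw image data into the ``pixel_values`` kwarg of ``forward``."""
    return {"pixel_values": torch.as_tensor(data, dtype=torch.float32)}

@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(map_pixel_input)
class YourModelForImage2Seq(nn.Module):
    ...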


2. (Optional) Register dummy data
---------------------------------

During startup, dummy data is passed to the vLLM model to allocate memory. By default, this dummy data consists only of text input, which may not be applicable to multi-modal models.
In such cases, you can define your own dummy data by registering a factory method via :meth:`vllm.inputs.INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.

.. code-block:: diff

from vllm.inputs import INPUT_REGISTRY

@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+ @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
class YourModelForImage2Seq(nn.Module):

Refer to :class:`vllm.multimodal.image.DummyImageDataFactories` for some examples of dummy data factories.
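
As an illustrative sketch, a dummy data factory might pair a placeholder token sequence with an all-zero image tensor.
The factory signature and return value below are assumptions; see the factories referenced above for the real interface.

.. code-block:: python

# Hypothetical sketch: the factory signature and return value are
# assumptions; refer to DummyImageDataFactories for working examples.
import torch

def my_dummy_data_factory(seq_len: int):
    dummy_token_ids = [0] * seq_len                # placeholder text tokens
    dummy_pixel_values = torch.zeros(3, 336, 336)  # placeholder image tensor
    return dummy_token_ids, dummy_pixel_values

The factory is then passed to :code:`@INPUT_REGISTRY.register_dummy_data(my_dummy_data_factory)` in place of :code:`<your_dummy_data_factory>` above.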


3. (Optional) Register input processor
--------------------------------------

Sometimes, inputs need to be processed at the :class:`~vllm.LLMEngine` level before they are passed to the model executor.
You can register input processors via :meth:`vllm.inputs.INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.

.. code-block:: diff

from vllm.inputs import INPUT_REGISTRY

@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
@INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+ @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
class YourModelForImage2Seq(nn.Module):

A common use case of input processors is inserting extra image tokens to leverage the vLLM framework for attention mask generation.
More details can be found in :class:`vllm.multimodal.image.ImageInputProcessors`.
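
To make the image-token use case concrete, a processor could expand each image placeholder token into one token per image patch, roughly as sketched below.
The processor signature and the input fields it touches are assumptions for illustration.

.. code-block:: python

# Hypothetical sketch: the processor signature and input fields are
# assumptions, not the definitive vLLM interface.
IMAGE_TOKEN_ID = 32000       # assumed placeholder token ID for "<image>"
IMAGE_FEATURE_SIZE = 576     # assumed number of image patches per image

def insert_image_tokens(inputs):
    """Repeat the image placeholder so one token exists per image patch."""
    expanded = []
    for token_id in inputs["prompt_token_ids"]:
        if token_id == IMAGE_TOKEN_ID:
            expanded.extend([IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE)
        else:
            expanded.append(token_id)
    inputs["prompt_token_ids"] = expanded
    return inputs

The processor is then passed to :code:`@INPUT_REGISTRY.register_input_processor(insert_image_tokens)` in place of :code:`<your_input_processor>` above.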

18 changes: 12 additions & 6 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -1,3 +1,5 @@
.. _multi_modality:

Multi-Modality
==============

@@ -8,9 +10,15 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
which allows you to pass in multi-modal input alongside text and token prompts.
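
For example, an image could be attached to a prompt roughly as follows. This is a sketch only: the model name and prompt are placeholders, and any vision-specific engine arguments are omitted.

.. code-block:: python

# Illustrative sketch: the model name is a placeholder and the
# vision-specific engine arguments are omitted for brevity.
from PIL import Image

from vllm import LLM
from vllm.multimodal.image import ImagePixelData

llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # plus the model's vision configuration
image = Image.open("stop_sign.jpg")

outputs = llm.generate({
    "prompt": "<image>\nUSER: What's the content of the image?\nASSISTANT:",
    "multi_modal_data": ImagePixelData(image),
})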

By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`this guide <adding_a_new_multimodal_model>`.

Guides
++++++

.. toctree::
:maxdepth: 1

adding_multimodal_model

.. contents::
:local:
@@ -24,9 +32,7 @@ Module Contents
Registry
--------

.. data:: vllm.multimodal.MULTIMODAL_REGISTRY

The global :class:`MultiModalRegistry` which is used by model runners.
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY

.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -107,6 +107,7 @@ Documentation
dev/offline_inference/offline_index
dev/engine/engine_index
dev/kernel/paged_attention
dev/input_processing/model_inputs_index
dev/multimodal/multimodal_index
dev/dockerfile/dockerfile

4 changes: 2 additions & 2 deletions docs/source/models/adding_model.rst
@@ -37,7 +37,7 @@ For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/
2. Rewrite the :code:`forward` methods
--------------------------------------

Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps:

1. Remove any unnecessary code, such as the code only used for training.
2. Change the input parameters:
@@ -75,7 +75,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following

If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:

* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -71,4 +71,5 @@ markers = [
"skip_global_cleanup",
"llm: run tests for vLLM API only",
"openai: run tests for OpenAI API only",
"llava: run tests for LLaVA models only",
]
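
With the ``llava`` marker registered, LLaVA tests can be selected or deselected from the command line, as the pipeline change above does with ``-m llava`` and ``-m "not llava"``. A minimal sketch of a test module opting in (the test itself is hypothetical):

.. code-block:: python

# Hypothetical test: shown only to illustrate the new marker.
import pytest

@pytest.mark.llava
def test_llava_smoke():
    ...  # selected by `pytest -m llava`, deselected by `pytest -m "not llava"`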
35 changes: 8 additions & 27 deletions tests/conftest.py
@@ -29,24 +29,19 @@

# Multi modal related
# You can use `.buildkite/download-images.sh` to download the assets
_PIXEL_VALUES_FILES = [
PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
_IMAGE_FEATURES_FILES = [
IMAGE_FEATURES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
_IMAGE_FILES = [
IMAGE_FILES = [
os.path.join(_TEST_DIR, "images", filename)
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
_IMAGE_PROMPTS = [
"<image>\nUSER: What's the content of the image?\nASSISTANT:",
"<image>\nUSER: What is the season?\nASSISTANT:"
]
assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
_IMAGE_FILES) == len(_IMAGE_PROMPTS)
assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)


def _read_prompts(filename: str) -> List[str]:
@@ -84,14 +79,9 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup()


@pytest.fixture(scope="session")
def hf_image_prompts() -> List[str]:
return _IMAGE_PROMPTS


@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
return [Image.open(filename) for filename in _IMAGE_FILES]
return [Image.open(filename) for filename in IMAGE_FILES]


@pytest.fixture()
@@ -101,26 +91,17 @@ def vllm_images(request) -> List[MultiModalData]:
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
return [
ImageFeatureData(torch.load(filename))
for filename in _IMAGE_FEATURES_FILES
for filename in IMAGE_FEATURES_FILES
]
else:
return [
ImagePixelData(Image.open(filename)) for filename in _IMAGE_FILES
ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
]


@pytest.fixture()
def vllm_image_tensors(request) -> List[torch.Tensor]:
return [torch.load(filename) for filename in _PIXEL_VALUES_FILES]


@pytest.fixture()
def vllm_image_prompts(request) -> List[str]:
vision_language_config = request.getfixturevalue("model_and_config")[1]
return [
"<image>" * (vision_language_config.image_feature_size - 1) + p
for p in _IMAGE_PROMPTS
]
return [torch.load(filename) for filename in PIXEL_VALUES_FILES]


@pytest.fixture