106 changes: 53 additions & 53 deletions docs/features/lora.md

@@ -279,64 +279,64 @@ Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-s

To this end, we allow registration of default multimodal LoRAs to handle this automatically: users can map each modality to a LoRA adapter, which is then applied whenever the corresponding inputs are present. Note that currently we only allow one LoRA per prompt; if several modalities are provided, each of which is registered to its own LoRA adapter, none of the adapters will be applied.

Review comment from a Contributor on lines +282 to +284 (medium):

> For a more compact presentation, remove the blank line between the admonition title and the code block.

The suggested change places the fenced code block directly beneath the admonition title, with no blank line in between.

??? code "Example usage for offline inference"

    ```python
    from transformers import AutoTokenizer
    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    model_id = "ibm-granite/granite-speech-3.3-2b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def get_prompt(question: str, has_audio: bool):
        """Build the input prompt to send to vLLM."""
        if has_audio:
            question = f"<|audio|>{question}"
        chat = [
            {
                "role": "user",
                "content": question
            }
        ]
        return tokenizer.apply_chat_template(chat, tokenize=False)


    model = LLM(
        model=model_id,
        enable_lora=True,
        max_lora_rank=64,
        max_model_len=2048,
        limit_mm_per_prompt={"audio": 1},
        # Will always pass a `LoRARequest` with the `model_id`
        # whenever audio is contained in the request data.
        default_mm_loras={"audio": model_id},
        enforce_eager=True,
    )

    question = "can you transcribe the speech into a written format?"
    prompt_with_audio = get_prompt(
        question=question,
        has_audio=True,
    )
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    inputs = {
        "prompt": prompt_with_audio,
        "multi_modal_data": {
            "audio": audio,
        }
    }

    outputs = model.generate(
        inputs,
        sampling_params=SamplingParams(
            temperature=0.2,
            max_tokens=64,
        ),
    )
    ```

You can also pass a JSON dictionary to `--default-mm-loras` that maps modalities to LoRA model IDs. For example, when starting the server:
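A minimal sketch of such a server invocation, assuming the same Granite Speech model as the offline example and that the CLI flags mirror its engine arguments (`--enable-lora`, `--max-lora-rank`, `--max-model-len`); only `--default-mm-loras` is named explicitly above, so treat the other flags as assumptions:

```bash
# Hedged sketch: register a default LoRA for the "audio" modality at startup.
# Flags other than --default-mm-loras are assumed from the offline example's
# engine arguments and may differ from the command in the collapsed diff.
vllm serve ibm-granite/granite-speech-3.3-2b \
    --enable-lora \
    --max-lora-rank 64 \
    --max-model-len 2048 \
    --default-mm-loras '{"audio": "ibm-granite/granite-speech-3.3-2b"}'
```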
