From 781c27319c90e3069a7f21b8529ff8ca19865d91 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 12:20:01 -0800
Subject: [PATCH 01/25] Initial model support

---
 examples/python/qwen2_5_vl_inference.py | 495 ++++++++++++++++++++++++
 src/config.cpp                          | 137 ++++++-
 src/config.h                            |  20 +
 src/generators.cpp                      |  15 +-
 src/models/decoder_only.cpp             |  17 +-
 src/models/decoder_only.h               |   2 +-
 src/models/decoder_only_pipeline.cpp    |   4 +-
 src/models/decoder_only_pipeline.h      |  15 +-
 src/models/kv_cache.cpp                 |  20 +-
 src/models/model.cpp                    |   3 +
 src/models/model_type.h                 |   2 +-
 src/models/qwen_vl_model.cpp            | 314 +++++++++++++++
 src/models/qwen_vl_model.h              |  43 ++
 src/models/qwen_vl_vision.cpp           | 303 +++++++++++++++
 src/models/qwen_vl_vision.h             |  78 ++++
 15 files changed, 1422 insertions(+), 46 deletions(-)
 create mode 100644 examples/python/qwen2_5_vl_inference.py
 create mode 100644 src/models/qwen_vl_model.cpp
 create mode 100644 src/models/qwen_vl_model.h
 create mode 100644 src/models/qwen_vl_vision.cpp
 create mode 100644 src/models/qwen_vl_vision.h

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
new file mode 100644
index 0000000000..a801978815
--- /dev/null
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -0,0 +1,495 @@
+import argparse
+import json
+import sys
+import numpy as np
+from pathlib import Path
+from PIL import Image
+from transformers import AutoTokenizer
+
+import onnxruntime_genai as og  # Requires built/installed onnxruntime-genai Python package
+
+# ----------------------------------------------------------------------------
+# Helper: build expanded image token sequence matching vision embeddings count
+# ----------------------------------------------------------------------------
+IMAGE_PAD_TOKEN = "<|image_pad|>"
+VISION_START = "<|vision_start|>"
+VISION_END = "<|vision_end|>"
+IM_START = "<|im_start|>"
+IM_END = "<|im_end|>"
+SYSTEM_PROMPT = "You are a helpful assistant."
+
+# Image preprocessing constants (from Qwen2.5-VL config)
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+PATCH_SIZE = 14
+MERGE_SIZE = 2
+TEMPORAL_PATCH_SIZE = 2
+# Largest long-side / short-side ratio that smart_resize() accepts.
+MAX_RATIO = 200
+
+def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS):
+    """Pick a (height, width) whose sides are multiples of `factor`.
+
+    Each side is rounded to the nearest multiple (never below `factor`); the
+    pair is then scaled down (floor) if its area exceeds `max_pixels`, or up
+    (ceil) if it is below `min_pixels`. Raises ValueError when the input's
+    long-side / short-side ratio exceeds MAX_RATIO.
+    """
+    import math
+
+    long_side, short_side = max(height, width), min(height, width)
+    if long_side / short_side > MAX_RATIO:
+        raise ValueError(f"Aspect ratio must be smaller than {MAX_RATIO}")
+
+    new_h = max(factor, round(height / factor) * factor)
+    new_w = max(factor, round(width / factor) * factor)
+    area = new_h * new_w
+
+    if area > max_pixels:
+        # Too large: derive the scale from the *input* area, then floor each side.
+        scale = math.sqrt((height * width) / max_pixels)
+        new_h = math.floor(height / scale / factor) * factor
+        new_w = math.floor(width / scale / factor) * factor
+    elif area < min_pixels:
+        scale = math.sqrt(min_pixels / (height * width))
+        new_h = math.ceil(height * scale / factor) * factor
+        new_w = math.ceil(width * scale / factor) * factor
+
+    return new_h, new_w
+
+def load_prepatched_embeddings(image_path: Path, resize_width=800, resize_height=480):
+    """Load an image and flatten it into the pre-patched layout the vision model expects.
+
+    This matches the baseline's approach: manually patch the image in Python
+    before passing to the ONNX vision pipeline. The patch_embed model expects
+    pre-patched data (1, num_patches, patch_dim), NOT raw pixels (B, C, H, W).
+
+    Args:
+        image_path: Path to image file
+        resize_width: Target width for first resize (default 800)
+        resize_height: Target height for first resize (default 480)
+
+    Returns:
+        pixel_values: np.ndarray of shape (1, num_patches, patch_dim)
+        grid_thw: np.ndarray of shape (1, 3) holding the (t, h, w) patch grid
+    """
+    # Load and convert to RGB; the context manager closes the file handle.
+    with Image.open(image_path) as src:
+        img = src.convert("RGB")
+    orig_w, orig_h = img.size
+
+    # Two-stage resize matching baseline:
+    # 1. Resize to the requested target size (factor=28); source aspect ratio is not kept
+    patch_merge_size = PATCH_SIZE * MERGE_SIZE  # 14 * 2 = 28
+    h1, w1 = smart_resize(resize_height, resize_width, factor=patch_merge_size,
+                         min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+    img = img.resize((w1, h1), Image.BICUBIC)
+
+    # 2. Second smart_resize with same constraints (matches baseline fetch_image_data)
+    h2, w2 = smart_resize(h1, w1, factor=patch_merge_size,
+                         min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+    img = img.resize((w2, h2), Image.BICUBIC)
+
+    print(f"[INFO] Resized image: {orig_w}x{orig_h} -> {w1}x{h1} -> {w2}x{h2}")
+
+    # Convert to numpy array (H, W, C) and normalize to [0, 1]
+    pixel_array = np.array(img).astype(np.float32) / 255.0
+
+    # Per-channel mean/std normalization (these are the OpenAI CLIP statistics, not ImageNet's)
+    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
+    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
+    pixel_array = (pixel_array - mean) / std
+
+    # --- Patching logic from baseline image_utils.patch_image ---
+    # Start with (H, W, C) format, add batch dimension
+    patches = np.array([pixel_array])  # shape: (1, H, W, C)
+
+    # Convert to (B, C, H, W) format
+    patches = patches.transpose(0, 3, 1, 2)  # shape: (1, C, H, W)
+
+    # Pad the frame axis by repeating the last frame (a single image becomes 2 identical frames)
+    if patches.shape[0] % TEMPORAL_PATCH_SIZE != 0:
+        repeats = np.repeat(patches[-1][np.newaxis], TEMPORAL_PATCH_SIZE - 1, axis=0)
+        patches = np.concatenate([patches, repeats], axis=0)
+
+    channel = patches.shape[1]
+    grid_t = patches.shape[0] // TEMPORAL_PATCH_SIZE
+    grid_h = h2 // PATCH_SIZE
+    grid_w = w2 // PATCH_SIZE
+
+    # Reshape into patches with spatial merging
+    patches = patches.reshape(
+        grid_t,
+        TEMPORAL_PATCH_SIZE,
+        channel,
+        grid_h // MERGE_SIZE,
+        MERGE_SIZE,
+        PATCH_SIZE,
+        grid_w // MERGE_SIZE,
+        MERGE_SIZE,
+        PATCH_SIZE,
+    )
+
+    # Transpose to group patches spatially
+    patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
+
+    # Flatten to (num_patches, patch_dim)
+    flatten_patches = patches.reshape(
+        grid_t * grid_h * grid_w,
+        channel * TEMPORAL_PATCH_SIZE * PATCH_SIZE * PATCH_SIZE
+    )
+
+    # Add batch dimension: (1, num_patches, patch_dim); grid_thw is int64 with shape (1, 3)
+    pixel_values = flatten_patches[np.newaxis, :]
+    grid_thw = np.array([[grid_t, grid_h, grid_w]], dtype=np.int64)
+
+    return pixel_values, grid_thw
+
+TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
+
+The functions at your disposal are:
+<tools>
+{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* `key`: Press keyboard keys, like \"Enter\", \"Alt\", \"Shift\", \"Tab\", \"Control\", \"Backspace\", \"Delete\", \"Escape\", etc. 
Keys are pressed down in the order given, then released in reverse order.\n* `type`: Type a string of text on the keyboard.\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* `left_click`: Click the left mouse button.\n* `scroll`: Performs a scroll of the mouse scroll wheel.\n* `visit_url`: Visit a specified URL.\n* `web_search`: Perform a web search with a specified query.\n* `history_back`: Go back to the previous page in the browser history.\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\n* `wait`: Wait specified seconds for the change to happen.\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. 
Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}}
+</tools>
+
+To make a function call, you should output a json object inside <tool_call></tool_call> XML tags. The json object must contain the function name and its arguments, like this:
+<tool_call>
+{\"name\": <function-name>, \"arguments\": <args-json-object>}
+</tool_call>
+"""
+
+def expand_image_tokens(grid_thw, merge_size):
+    """Return how many image pad tokens a patch grid needs after spatial merging.
+    One token is produced per merge_size x merge_size group of patches, i.e.
+    (t * h * w) // merge_size**2. Raises ValueError if h * w is not divisible."""
+    grid_t, grid_h, grid_w = grid_thw
+    cells_per_token = merge_size ** 2
+    if (grid_h * grid_w) % cells_per_token != 0:
+        raise ValueError(f"Grid (h={grid_h}, w={grid_w}) not divisible by merge_size^2={cells_per_token}")
+    return (grid_t * grid_h * grid_w) // cells_per_token
+
+
+def build_prompt(user_text, num_image_tokens, use_tool_call_prompt=False):
+    """Assemble a chat-style prompt: system turn, user turn (image block + text), assistant header."""
+    # The pad token is repeated num_image_tokens times inside a single vision
+    # start/end pair, and that block is placed directly before the user's text.
+    pad_run = IMAGE_PAD_TOKEN * num_image_tokens
+    system_text = TOOL_CALL_SYSTEM_PROMPT if use_tool_call_prompt else SYSTEM_PROMPT
+    turns = [
+        f"{IM_START}system\n{system_text}{IM_END}\n",
+        f"{IM_START}user\n{VISION_START}{pad_run}{VISION_END}{user_text}{IM_END}\n",
+        f"{IM_START}assistant\n",
+    ]
+    return "".join(turns)
+
+def build_prompt_from_sample(sample_json_path: Path, use_tool_call_prompt=False, tokenizer=None):
+    """Build the un-expanded prompt for the conversation stored in sample.json.
+
+    Returns (prompt, image_count); image_count is the number of image entries in
+    user messages, and ValueError is raised when there are none. With a
+    tokenizer the prompt is rendered by its apply_chat_template(); otherwise it
+    is assembled by hand with a single <|image_pad|> per image.
+    NOTE: use_tool_call_prompt is accepted but never read in this function.
+    """
+    def is_image(entry):
+        return isinstance(entry, dict) and (entry.get("type") == "image" or "image" in entry)
+
+    with open(sample_json_path, "r", encoding="utf-8") as f:
+        convo = json.load(f).get("conversation", [])
+
+    image_count = sum(
+        is_image(entry)
+        for msg in convo if msg.get("role") == "user"
+        for entry in msg.get("content", [])
+    )
+    if image_count == 0:
+        raise ValueError("Sample JSON contained no image entries; cannot build vision prompt.")
+
+    # Preferred path: render with the tokenizer's chat template to match the baseline.
+    if tokenizer is not None:
+        return tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True), image_count
+
+    # Fallback: manual construction (only reached when no tokenizer is passed).
+    system_text = ""
+    user_parts = []
+    for msg in convo:
+        role, entries = msg.get("role"), msg.get("content", [])
+        if role == "system":
+            system_text = "\n".join(e.get("text", "") for e in entries if isinstance(e, dict))
+        elif role == "user":
+            for e in entries:
+                if is_image(e):
+                    user_parts.append(f"{VISION_START}{IMAGE_PAD_TOKEN}{VISION_END}")
+                elif isinstance(e, dict) and e.get("type") == "text":
+                    user_parts.append(e.get("text", ""))
+    user_text = "".join(user_parts)
+    return (
+        f"{IM_START}system\n{system_text}{IM_END}\n"
+        f"{IM_START}user\n{user_text}{IM_END}\n"
+        f"{IM_START}assistant\n"
+    ), image_count
+
+def expand_image_tokens_in_prompt(base_prompt: str, image_grid_thw, merge_size: int):
+    """Swap the first <|image_pad|> in the prompt for one pad token per merged patch.
+
+    Mirrors the baseline's get_image_padding_from_text: with
+    num_tokens = (t * h * w) // merge_size**2, only the first placeholder is
+    replaced by num_tokens consecutive copies; later placeholders are untouched.
+
+    Returns:
+        (expanded_prompt, num_tokens)
+    """
+    grid_t, grid_h, grid_w = image_grid_thw
+    num_tokens = (grid_t * grid_h * grid_w) // (merge_size ** 2)
+    head, sep, tail = base_prompt.partition(IMAGE_PAD_TOKEN)
+    # `sep` is empty when the placeholder is absent, leaving the prompt unchanged.
+    expanded_prompt = head + sep * num_tokens + tail
+    return expanded_prompt, num_tokens
+
+
+def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float, sample_dir: Path | None = None,
+                  enable_qnn: bool = False, qnn_backend_path: str | None = None, qnn_provider_library: str | None = None,
+                  do_sample: bool = False, min_length: int = 0, repetition_penalty: float = 1.0, tool_call_prompt: bool = False):
+    """Run one image + text generation pass through an onnxruntime-genai model.
+
+    Pre-patches the image, builds the prompt (from sample_dir/sample.json when
+    sample_dir is given, otherwise from prompt_text), tokenizes it with the
+    HuggingFace tokenizer, feeds pixel_values / image_grid_thw and the token ids
+    to the generator, and streams decoded text to stdout.
+
+    Generation stops when the generator is done, at an EOS id read from
+    genai_config.json (once min_length steps have run), after max_new_tokens
+    steps, or - in tool-call mode - once </tool_call> is seen. Returns the decoded text.
+
+    Raises FileNotFoundError if config_dir is not a directory or image_path is not a file.
+    """
+    if not config_dir.is_dir():
+        raise FileNotFoundError(f"Config directory not found: {config_dir}")
+    if not image_path.is_file():
+        raise FileNotFoundError(f"Image file not found: {image_path}")
+
+    # 1. Load the image as pre-patched pixel values (1, num_patches, patch_dim)
+    pixel_values, grid_thw_array = load_prepatched_embeddings(image_path)
+    grid_thw = grid_thw_array[0]  # Extract (t, h, w) tuple
+    merge_size = MERGE_SIZE
+
+    # NOTE: GenAI will automatically run the vision pipeline (patch_embed -> vision_attn -> patch_merger)
+    # when pixel_values are provided. No need to run it manually.
+
+    # 2. Load model & tokenizer FIRST (needed for apply_chat_template)
+    # Optionally register and prioritize QNN EP for vision attention acceleration.
+    if enable_qnn:
+        # Dynamically register QNN EP provider library if supplied (needed when ORT not built with QNN statically)
+        if qnn_provider_library:
+            og.register_execution_provider_library("QNN", qnn_provider_library)
+        # og.Config expects the model folder and locates genai_config.json inside it itself.
+        cfg = og.Config(str(config_dir))
+        cfg.append_provider("QNN")
+        cfg.append_provider("CPUExecutionProvider")  # fallback for non-attention stages
+        if qnn_backend_path:
+            cfg.set_provider_option("QNN", "backend_path", qnn_backend_path)
+        # Burst performance mode (3) if not overridden
+        cfg.set_provider_option("QNN", "performance_mode", "3")
+        model = og.Model(cfg)
+    else:
+        model = og.Model(str(config_dir))  # expects genai_config.json inside
+
+    # Use HuggingFace tokenizer instead of ORT tokenizer to match baseline behavior
+    tokenizer_hf = AutoTokenizer.from_pretrained(str(config_dir), trust_remote_code=True)
+
+    # Also create ORT tokenizer for streaming decode during generation
+    tokenizer_ort = og.Tokenizer(model)
+
+    # 3. Build prompt AFTER tokenizer is loaded (needed for apply_chat_template)
+    if sample_dir is not None:
+        base_prompt, image_count_in_sample = build_prompt_from_sample(sample_dir / "sample.json", use_tool_call_prompt=tool_call_prompt, tokenizer=tokenizer_hf)
+        prompt, num_image_tokens = expand_image_tokens_in_prompt(base_prompt, grid_thw, merge_size)
+    else:
+        num_image_tokens = expand_image_tokens(grid_thw, merge_size)
+        prompt = build_prompt(prompt_text, num_image_tokens, use_tool_call_prompt=tool_call_prompt)
+
+    # Verify image token id exists
+    image_token_id = tokenizer_hf.convert_tokens_to_ids(IMAGE_PAD_TOKEN)
+    if image_token_id is None:
+        raise RuntimeError(f"Image token {IMAGE_PAD_TOKEN} not found in tokenizer")
+
+    # Encode using HuggingFace tokenizer
+    input_ids_list = tokenizer_hf.encode(prompt)
+    input_ids_np = np.array(input_ids_list, dtype=np.int32)
+
+    # Sanity check: count occurrences
+    occurrences = int(np.sum(input_ids_np == image_token_id))
+    if occurrences != num_image_tokens:
+        print(f"[WARN] Token count mismatch: expected {num_image_tokens}, tokenizer found {occurrences}")
+
+    # 4. Prepare generation params (respect model context_length)
+    params = og.GeneratorParams(model)
+    # Fetch context_length from config if available; else default to max_new_tokens
+    context_len = None
+    try:
+        cfg_file2 = config_dir / "genai_config.json" if config_dir.is_dir() else config_dir
+        with open(cfg_file2, "r", encoding="utf-8") as f2:
+            cfgj = json.load(f2)
+            mdl = cfgj.get("model", {}) if isinstance(cfgj, dict) else {}
+            if isinstance(mdl, dict) and "context_length" in mdl:
+                context_len = int(mdl["context_length"])
+    except Exception:
+        context_len = None
+    # Align with reference semantics:
+    # - Use model context capacity for total max_length (prefill + generation)
+    # - Cap number of generated tokens separately via loop counter
+    total_capacity = int(context_len) if context_len else int(input_ids_np.shape[0] + max_new_tokens)
+    gen_cap = int(max_new_tokens)
+    params.set_search_options(max_length=total_capacity, temperature=temperature, top_k=top_k, top_p=top_p,
+                              do_sample=bool(do_sample), min_length=int(min_length), repetition_penalty=float(repetition_penalty))
+
+    generator = og.Generator(model, params)
+
+    # 5. Set pixel_values as input - GenAI will automatically run vision pipeline
+    # Note: pixel_values is pre-patched format (1, num_patches, patch_dim), matching baseline
+    pixel_values_f32 = np.ascontiguousarray(pixel_values.astype(np.float32))
+    generator.set_model_input("pixel_values", pixel_values_f32)
+    generator.set_model_input("image_grid_thw", np.ascontiguousarray(grid_thw_array.astype(np.int64)))
+
+    # 6. Append textual tokens (input_ids_np is already int32)
+    # Do not truncate the prompt; the runtime will process it in windows
+    # according to the configured sliding window and chunk size.
+    # Append the full prompt once to avoid QNN continuous decoding constraints
+    generator.append_tokens(input_ids_np)
+
+    # 7. Stream generation
+    stream = tokenizer_ort.create_stream()
+    output_tokens = []
+    print("\n=== Generating ===")
+    # Stream generation; rely on runtime-managed sequence lengths
+    step_idx = 0
+    # Read EOS token id(s) from config for early stop
+    eos_ids = []
+    try:
+        cfg_file_eos = config_dir / "genai_config.json" if config_dir.is_dir() else config_dir
+        with open(cfg_file_eos, "r", encoding="utf-8") as f_eos:
+            cfgj_eos = json.load(f_eos)
+            mdl = cfgj_eos.get("model", {}) if isinstance(cfgj_eos, dict) else {}
+            if isinstance(mdl, dict):
+                eos_val = mdl.get("eos_token_id")
+                if isinstance(eos_val, list):
+                    eos_ids = [int(x) for x in eos_val]
+                elif isinstance(eos_val, (int, float)):
+                    eos_ids = [int(eos_val)]
+    except Exception:
+        eos_ids = []
+
+    # Tool-call mode state tracking
+    accum_text = ""
+    started_toolcall = False
+    closed_toolcall = False
+
+    while not generator.is_done():
+        try:
+            generator.generate_next_token()
+        except Exception as gen_err:
+            print(f"[ERROR] generate_next_token failed at step {step_idx}: {gen_err}")
+            raise
+        new_tok = generator.get_next_tokens()[0]
+        output_tokens.append(new_tok)
+        # Stop on EOS after min_length tokens
+        if eos_ids and int(new_tok) in eos_ids and step_idx >= int(min_length):
+            break
+        decoded_piece = stream.decode(new_tok)
+
+        if tool_call_prompt:
+            # In tool-call mode: buffer text and only print from <tool_call> onwards
+            accum_text += decoded_piece
+            if not started_toolcall:
+                if "<tool_call>" in accum_text:
+                    started_toolcall = True
+                    idx = accum_text.index("<tool_call>")
+                    sys.stdout.write(accum_text[idx:])
+                    sys.stdout.flush()
+            else:
+                # Already started printing tool_call region
+                sys.stdout.write(decoded_piece)
+                sys.stdout.flush()
+                if "</tool_call>" in accum_text:
+                    closed_toolcall = True
+                    print(f"\n[DEBUG] tool_call closed at step {step_idx}")
+                    break
+        else:
+            # Normal mode: print everything
+            if decoded_piece:
+                sys.stdout.write(decoded_piece)
+                sys.stdout.flush()
+
+        step_idx += 1
+        if step_idx >= gen_cap:
+            break
+
+    print("\n=== Generation Complete ===")
+
+    full_output = tokenizer_hf.decode(np.array(output_tokens, dtype=np.int32))
+
+    # Report whether tool_call was successfully emitted
+    if tool_call_prompt and not closed_toolcall:
+        print("\n[WARNING] Model did not emit complete <tool_call>...</tool_call> structure.")
+        print("[WARNING] Consider adjusting prompt, temperature, or sampling parameters.")
+
+    print("\n FINAL OUTPUT: ", full_output)
+    return full_output
+
+
+def main():
+    """Parse the command-line arguments and run a single inference with them."""
+    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai pipeline")
+    # Support both --config_dir (current) and legacy --model_path name.
+    parser.add_argument("--config_dir", type=Path, help="Directory containing genai_config.json for qwen2_5_vl (or use --model_path)")
+    parser.add_argument("--model_path", type=Path, help="Alias for --config_dir (legacy)")
+    parser.add_argument("--image", type=Path, required=True, help="Path to input image")
+    parser.add_argument("--prompt", type=str, help="User text prompt; ignored when --sample_dir is provided (the sample.json conversation is used instead)")
+    parser.add_argument("--max_new_tokens", type=int, default=4096)
+    parser.add_argument("--temperature", type=float, default=0.7)
+    parser.add_argument("--top_k", type=int, default=50)
+    parser.add_argument("--top_p", type=float, default=0.9)
+    parser.add_argument("--do_sample", action="store_true", help="Enable sampling to reduce repetition")
+    parser.add_argument("--min_length", type=int, default=0, help="Minimum generated tokens before allowing EOS")
+    parser.add_argument("--repetition_penalty", type=float, default=1.0, help=">1.0 discourages repetition")
+    parser.add_argument("--sample_dir", type=Path, help="Optional dataset sample directory (contains sample.json & image)")
+    parser.add_argument("--enable_qnn", action="store_true", help="Enable QNN execution provider (vision attention acceleration)")
+    parser.add_argument("--qnn_backend_path", type=str, default="QnnHtp.dll", help="Path to QNN backend (e.g., QnnHtp.dll)")
+    parser.add_argument("--qnn_provider_library", type=str, help="Path to onnxruntime QNN EP shared library (e.g., onnxruntime_providers_qnn.dll)")
+    parser.add_argument("--tool_call_prompt", action="store_true", help="Enable tool-call mode: use baseline tools schema and emit <tool_call> XML")
+    args = parser.parse_args()
+
+    # Resolve config directory
+    config_dir = args.config_dir or args.model_path
+    if not config_dir:
+        parser.error("One of --config_dir or --model_path is required.")
+
+    # Determine prompt text (run_inference only uses it when no sample_dir is given)
+    if args.prompt:
+        prompt_text = args.prompt
+    elif args.sample_dir is not None:
+        prompt_text = ""  # placeholder; not used when sample_dir provided
+    else:
+        prompt_text = "Describe the image."  # default fallback
+
+    run_inference(
+        config_dir=config_dir,
+        image_path=args.image,
+        prompt_text=prompt_text,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        do_sample=args.do_sample,
+        min_length=args.min_length,
+        repetition_penalty=args.repetition_penalty,
+        sample_dir=args.sample_dir,
+        enable_qnn=args.enable_qnn,
+        qnn_backend_path=args.qnn_backend_path if args.enable_qnn else None,
+        qnn_provider_library=args.qnn_provider_library if args.enable_qnn else None,
+        tool_call_prompt=args.tool_call_prompt,
+    )
+
+if __name__ == "__main__":
+    main()
diff --git a/src/config.cpp b/src/config.cpp
index 7087819d86..85706eac6b 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -281,8 +281,6 @@ struct DecoderInputs_Element : JSON::Element {
       v_.embeddings = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
-    } else if (name == "position_ids") {
-      v_.position_ids = JSON::Get<std::string_view>(value);
     } else if (name == "past_key_names") {
       v_.past_key_names = JSON::Get<std::string_view>(value);
     } else if (name == "past_value_names") {
@@ -320,7 +318,7 @@ struct DecoderInputs_Element : JSON::Element {
     }
   }
 
- private:
+private:
   Config::Model::Decoder::Inputs& v_;
 };
 
@@ -587,6 +585,11 @@ struct Decoder_Element : JSON::Element {
       v_.sliding_window = Config::Model::Decoder::SlidingWindow{};
       return sliding_window_;
     }
+    // Support object-style pipeline: "pipeline": { "embeddings": { ... }, ... }
+    if (name == "pipeline") {
+      pipeline_object_ = std::make_unique<PipelineModelObject_Element>(v_.pipeline);
+      return *pipeline_object_;
+    }
     throw JSON::unknown_value_error{};
   }
 
@@ -605,6 +608,7 @@ struct Decoder_Element : JSON::Element {
   DecoderOutputs_Element outputs_{v_.outputs};
   Pipeline_Element pipeline_{v_.pipeline};
   SlidingWindow_Element sliding_window_{v_.sliding_window};
+  std::unique_ptr<PipelineModelObject_Element> pipeline_object_; // object-style pipeline support
 };
 
 struct VisionInputs_Element : JSON::Element {
@@ -615,6 +619,8 @@ struct VisionInputs_Element : JSON::Element {
       v_.pixel_values = JSON::Get<std::string_view>(value);
     } else if (name == "image_sizes") {
       v_.image_sizes = JSON::Get<std::string_view>(value);
+    } else if (name == "image_grid_thw") { // accept alternate naming, map to image_sizes
+      v_.image_sizes = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
     } else {
@@ -641,6 +647,94 @@ struct VisionOutputs_Element : JSON::Element {
   Config::Model::Vision::Outputs& v_;
 };
 
+// Parses one vision pipeline stage object into a Config::Model::Vision::PipelineModel.
+struct VisionPipelineModel_Element : JSON::Element {
+  explicit VisionPipelineModel_Element(Config::Model::Vision::PipelineModel& v) : v_{v} {}
+  // Scalar keys: "filename" (string) and "run_on_cpu" (bool); any other key throws.
+  void OnValue(std::string_view name, JSON::Value value) override {
+    if (name == "filename") {
+      v_.filename = JSON::Get<std::string_view>(value);
+    } else if (name == "run_on_cpu") {
+      v_.run_on_cpu = JSON::Get<bool>(value);
+    } else {
+      throw JSON::unknown_value_error{};
+    }
+  }
+  // Nested objects: "session_options" / "run_options" are reset to defaults, then filled by a new child parser.
+  Element& OnObject(std::string_view name) override {
+    if (name == "session_options") {
+      v_.session_options = Config::SessionOptions{};
+      session_options_ = std::make_unique<SessionOptions_Element>(*v_.session_options);
+      return *session_options_;
+    }
+    if (name == "run_options") {
+      v_.run_options = Config::RunOptions{};
+      run_options_ = std::make_unique<RunOptions_Element>(*v_.run_options);
+      return *run_options_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+  // Arrays: "inputs" and "outputs" are collected into the stage's input/output name lists.
+  Element& OnArray(std::string_view name) override {
+    if (name == "inputs") {
+      return inputs_;
+    }
+    if (name == "outputs") {
+      return outputs_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
+ private:
+  Config::Model::Vision::PipelineModel& v_;  // Stage being populated (owned by the Config)
+  std::unique_ptr<SessionOptions_Element> session_options_;  // Created only when the key is present
+  std::unique_ptr<RunOptions_Element> run_options_;          // Created only when the key is present
+  StringArray_Element inputs_{v_.inputs};
+  StringArray_Element outputs_{v_.outputs};
+};
+
+struct VisionPipelineModelObject_Element : JSON::Element {  // Parses { "<model_id>": { ...stage... }, ... }
+  explicit VisionPipelineModelObject_Element(std::vector<Config::Model::Vision::PipelineModel>& v) : v_{v} {}
+  // Each nested object appends a PipelineModel whose model_id is the object's key.
+  Element& OnObject(std::string_view name) override {
+    auto& model = v_.emplace_back();
+    model.model_id = name;
+    elements_.emplace_back(model);  // NOTE(review): keeps a reference into v_; a later emplace_back may reallocate v_ - confirm earlier parsers are never reused
+    return elements_.back();
+  }
+
+ private:
+  std::vector<Config::Model::Vision::PipelineModel>& v_;  // Destination list, in encounter order
+  std::vector<VisionPipelineModel_Element> elements_;     // One stage parser per appended model
+};
+
+struct VisionPipeline_Element : JSON::Element {  // Array-style vision "pipeline": [ { "<model_id>": {...} }, ... ]
+  explicit VisionPipeline_Element(std::vector<Config::Model::Vision::PipelineModel>& v) : v_{v} {}
+  // Every object in the array is delegated to the same object_ parser; the name argument is ignored.
+  Element& OnObject(std::string_view name) override { return object_; }
+
+ private:
+  std::vector<Config::Model::Vision::PipelineModel>& v_;
+  VisionPipelineModelObject_Element object_{v_};  // Appends the parsed stages to v_
+};
+
+struct WindowIndexing_Element : JSON::Element {  // Parses the vision "window_indexing" object
+  explicit WindowIndexing_Element(Config::Model::Vision::WindowIndexing& v) : v_{v} {}
+  // Keys: "filename" (string) and "spatial_merge_size" (number, truncated to int); any other key throws.
+  void OnValue(std::string_view name, JSON::Value value) override {
+    if (name == "filename") {
+      v_.filename = JSON::Get<std::string_view>(value);
+    } else if (name == "spatial_merge_size") {
+      v_.spatial_merge_size = static_cast<int>(JSON::Get<double>(value));  // JSON numbers arrive as double
+    } else {
+      throw JSON::unknown_value_error{};
+    }
+  }
+
+ private:
+  Config::Model::Vision::WindowIndexing& v_;  // Destination settings (owned by the Config)
+};
+
 struct Vision_Element : JSON::Element {
   explicit Vision_Element(Config::Model::Vision& v) : v_{v} {}
 
@@ -673,6 +767,23 @@ struct Vision_Element : JSON::Element {
     if (name == "outputs") {
       return outputs_;
     }
+    if (name == "window_indexing") {
+      v_.window_indexing = Config::Model::Vision::WindowIndexing{};
+      window_indexing_element_ = std::make_unique<WindowIndexing_Element>(*v_.window_indexing);
+      return *window_indexing_element_;
+    }
+    // Support object-style pipeline for vision: "pipeline": { "patch_embed": { ... }, ... }
+    if (name == "pipeline") {
+      vision_pipeline_object_ = std::make_unique<VisionPipelineModelObject_Element>(v_.pipeline);
+      return *vision_pipeline_object_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
+  Element& OnArray(std::string_view name) override {
+    if (name == "pipeline") {
+      return pipeline_element_;
+    }
     throw JSON::unknown_value_error{};
   }
 
@@ -682,6 +793,9 @@ struct Vision_Element : JSON::Element {
   std::unique_ptr<RunOptions_Element> run_options_;
   VisionInputs_Element inputs_{v_.inputs};
   VisionOutputs_Element outputs_{v_.outputs};
+  std::unique_ptr<WindowIndexing_Element> window_indexing_element_;
+  VisionPipeline_Element pipeline_element_{v_.pipeline};
+  std::unique_ptr<VisionPipelineModelObject_Element> vision_pipeline_object_; // object-style pipeline support
 };
 
 struct SpeechInputs_Element : JSON::Element {
@@ -856,6 +970,8 @@ struct Model_Element : JSON::Element {
       v_.decoder_start_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "sep_token_id") {
       v_.sep_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "image_token_id") {
+      v_.image_token_id = static_cast<int>(JSON::Get<double>(value));
     } else {
       throw JSON::unknown_value_error{};
     }
@@ -1212,19 +1328,14 @@ void ClearDecoderProviderOptionsHardwareVendorId(Config& config, std::string_vie
 struct Root_Element : JSON::Element {
   explicit Root_Element(Config& config) : config_{config} {}
 
-  void OnValue(std::string_view name, JSON::Value value) override {
+  void OnValue(std::string_view /*name*/, JSON::Value /*value*/) override {
+    // No top-level scalar values currently supported
   }
 
   Element& OnObject(std::string_view name) override {
-    if (name == "model") {
-      return model_element_;
-    }
-    if (name == "search") {
-      return search_element_;
-    }
-    if (name == "engine") {
-      return engine_element_;
-    }
+    if (name == "model") return model_element_;
+    if (name == "search") return search_element_;
+    if (name == "engine") return engine_element_;
     throw JSON::unknown_value_error{};
   }
 
diff --git a/src/config.h b/src/config.h
index 507d7c80c1..4163156eb5 100644
--- a/src/config.h
+++ b/src/config.h
@@ -159,6 +159,24 @@ struct Config {
       std::string config_filename{"processor_config.json"};
       std::optional<std::string> adapter_filename{};
 
+      // Vision pipeline support (patch embed -> vision attn -> patch merger)
+      struct PipelineModel {
+        std::string filename;               // Stage model file; the Qwen2.5-VL loader resolves it against the config directory
+        std::optional<SessionOptions> session_options;  // Set only when the stage declares "session_options"
+        std::optional<RunOptions> run_options;          // Set only when the stage declares "run_options"
+        std::string model_id;               // JSON key of the stage (e.g. patch_embed, vision_attn, patch_merger)
+        std::vector<std::string> inputs;    // Graph input names
+        std::vector<std::string> outputs;   // Graph output names
+        bool run_on_cpu{true};              // Defaults to true; the Qwen2.5-VL loader enables QNN for vision_attn when false
+      };
+      std::vector<PipelineModel> pipeline;  // Pipeline models in the order they appear in the config
+
+      struct WindowIndexing {
+        std::string filename;       // Path to wnd_idx.npy, resolved against the config directory by the loader
+        int spatial_merge_size{};   // Spatial merge size used for window expansion (0 until configured)
+      };
+      std::optional<WindowIndexing> window_indexing; // Set only when the config has a "window_indexing" object
+
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
@@ -263,6 +281,8 @@ struct Config {
       std::vector<PipelineModel> pipeline;
 
     } decoder;
+    // Multi-modal token ids
+    int image_token_id{};  // Image pad token id used for embedding injection
 
   } model;
 
diff --git a/src/generators.cpp b/src/generators.cpp
index 85cd5cd26a..c16cbfdd50 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -318,14 +318,15 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
 
   auto input_ids_device = state_->params_->p_device->Allocate<int32_t>(padded_input_ids_size);
   auto cpu_span = input_ids_device.CpuSpan();
-  auto padding_begin = cpu_span.begin();
-  auto data_end = cpu_span.end();
-  if (model_->config_->model.decoder.sliding_window.has_value() && model_->config_->model.decoder.sliding_window->alignment == "left") {
-    padding_begin = cpu_span.begin() + input_ids.size();
-    data_end = padding_begin;
+  
+  // For sliding windows during prompt processing:
+  // - Copy actual tokens starting at position 0
+  // - Fill remaining positions with padding
+  // The alignment setting affects KV cache behavior, not token placement
+  std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
+  if (padded_input_ids_size > input_ids.size()) {
+    std::fill(cpu_span.begin() + input_ids.size(), cpu_span.end(), model_->config_->model.pad_token_id);
   }
-  std::fill_n(padding_begin, padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id);
-  std::copy_backward(input_ids.begin(), input_ids.end(), data_end);
   input_ids_device.CopyCpuToDevice();
   return input_ids_device;
 }
diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp
index ac12572a0c..4aa0cf533d 100644
--- a/src/models/decoder_only.cpp
+++ b/src/models/decoder_only.cpp
@@ -16,9 +16,9 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan<
     : State{params, model},
       model_{model},
       kv_cache_(CreateKeyValueCache(*this)),
-      position_inputs_{model, *this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask} {
+      position_inputs_{CreatePositionInputs(*this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask)} {
   input_ids_.Add();
-  position_inputs_.Add();
+  position_inputs_->Add();
   logits_.Add();
   kv_cache_->Add();
 }
@@ -79,15 +79,22 @@ DeviceSpan<float> DecoderOnly_State::RunWithChunking(int total_length, DeviceSpa
 }
 
 void DecoderOnly_State::RewindTo(size_t index) {
-  position_inputs_.RewindTo(index);
+  position_inputs_->RewindTo(index);
   kv_cache_->RewindTo(index);
 }
 
 void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> beam_indices, int total_length) {
   input_ids_.Update(next_tokens);
   size_t new_length = static_cast<size_t>(input_ids_.GetShape()[1]);
-  position_inputs_.Update(next_tokens, total_length, static_cast<int>(new_length));
-  kv_cache_->Update(beam_indices, total_length);
+  // Clamp KV cache length to sliding window size if configured
+  int effective_total_length = total_length;
+  if (model_.config_->model.decoder.sliding_window.has_value() &&
+      model_.config_->model.decoder.sliding_window->window_size > 0) {
+    effective_total_length = std::min(effective_total_length, model_.config_->model.decoder.sliding_window->window_size);
+  }
+
+  position_inputs_->Update(next_tokens, effective_total_length, static_cast<int>(new_length));
+  kv_cache_->Update(beam_indices, effective_total_length);
   logits_.Update(next_tokens, new_length);
 }
 
diff --git a/src/models/decoder_only.h b/src/models/decoder_only.h
index a61fb2b8be..0869756ae2 100644
--- a/src/models/decoder_only.h
+++ b/src/models/decoder_only.h
@@ -36,7 +36,7 @@ struct DecoderOnly_State : State {
   DefaultInputIDs input_ids_{*this};
   Logits logits_{*this};
   std::unique_ptr<KeyValueCache> kv_cache_;
-  DefaultPositionInputs position_inputs_;
+  std::unique_ptr<PositionInputs> position_inputs_;
   ExtraInputs extra_inputs_{*this};
 };
 
diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp
index 6e996192b9..15d5c89638 100644
--- a/src/models/decoder_only_pipeline.cpp
+++ b/src/models/decoder_only_pipeline.cpp
@@ -112,8 +112,8 @@ DecoderOnlyPipelineState::DecoderOnlyPipelineState(const DecoderOnlyPipelineMode
                                                    DeviceSpan<int32_t> sequence_lengths,
                                                    const GeneratorParams& params)
     : State{params, model},
-      model_{model},
       input_ids_{CreateInputIDs(*this)},
+      model_{model},
       key_value_cache_{CreateKeyValueCache(*this)},
       do_key_value_cache_partial_update_{key_value_cache_ && key_value_cache_->IsPartialUpdateSupported()},
       position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {
@@ -205,7 +205,7 @@ void DecoderOnlyPipelineState::RunPipeline(int total_length, DeviceSpan<int32_t>
     } else if (!first_run_ && !model_.config_->model.decoder.pipeline[pipeline_state->id_].run_on_token_gen) {
       continue;
     }
-
+    
     DurationTrace trace{MakeString("DecoderOnlyPipelineState::RunPipeline[", pipeline_state->id_, "]")};
 
     if (model_.config_->model.decoder.pipeline[pipeline_state->id_].reset_session_idx > -1) {
diff --git a/src/models/decoder_only_pipeline.h b/src/models/decoder_only_pipeline.h
index 70160be432..17c173e711 100644
--- a/src/models/decoder_only_pipeline.h
+++ b/src/models/decoder_only_pipeline.h
@@ -69,6 +69,17 @@ struct DecoderOnlyPipelineState : State {
   void RunPipeline(int total_length, DeviceSpan<int32_t>& next_tokens,
                    DeviceSpan<int32_t> next_indices, bool is_last_chunk);
 
+ protected:
+  // Virtual hook called after each pipeline stage completes, before next stage starts.
+  // Allows derived classes to modify stage outputs (e.g., inject vision embeddings).
+  // stage_id: ID of the stage that just completed
+  // next_tokens: current input tokens for pipeline
+  virtual void OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {}
+
+  // Stores all the outputs from the previous pipeline state(s)
+  std::unordered_map<std::string, std::unique_ptr<OrtValue>> ortvalue_store_;
+  std::unique_ptr<InputIDs> input_ids_;  // Made protected for derived class access
+
  private:
   void UpdateKeyValueCache(DeviceSpan<int32_t> beam_indices, int total_length);
 
@@ -86,10 +97,6 @@ struct DecoderOnlyPipelineState : State {
   std::map<size_t, size_t> pipeline_state_id_to_partial_kv_cache_update_record_idx_;
   std::vector<PartialKeyValueCacheUpdateRecord> partial_kv_cache_update_records_;
 
-  // Stores all the outputs from the previous pipeline state(s)
-  std::unordered_map<std::string, std::unique_ptr<OrtValue>> ortvalue_store_;
-
-  std::unique_ptr<InputIDs> input_ids_;
   Logits logits_{*this};
 
   std::unique_ptr<KeyValueCache> key_value_cache_;
diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index dbdbd828d6..451a4c0f10 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -202,7 +202,8 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
       // Uniform sliding window allocation (backward compatibility)
       shape_[2] = std::min(max_length, sliding_window_size);
     }
-  } else if (past_present_share_buffer_) {
+  } else {
+    // Default capacity: use requested max_length regardless of buffer sharing
     shape_[2] = state_.params_->search.max_length;
   }
 
@@ -270,25 +271,18 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
   }
 
   if (!layer_shapes_.empty()) {
-    // Update per-layer shapes based on total_length, but respect max allocations
+    // Allocate present tensors to full per-layer capacity; runtime uses effective length internally
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
-      const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
-      const int actual_length = std::min(total_length, max_cache_length);
-
-      std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
-      current_shape[2] = actual_length;
-
+      const std::array<int64_t, 4> capacity_shape = layer_shapes_[layer_idx];
       // Key tensor
-      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), capacity_shape, type_);
       state_.outputs_[output_index_ + layer_idx * 2] = presents_[layer_idx * 2].get();
-
       // Value tensor
-      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
+      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), capacity_shape, type_);
       state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get();
     }
   } else {
-    // Uniform shape update (existing behavior)
-    shape_[2] = total_length;
+    // Uniform capacity allocation (shape_[2] set at construction to max_length)
     for (int i = 0; i < layer_count_ * 2; i++) {
       presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
       state_.outputs_[output_index_ + i] = presents_[i].get();
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 77e6c82657..59eef435ed 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -19,6 +19,7 @@
 #include "multi_modal.h"
 #include "marian.h"
 #include "decoder_only_pipeline.h"
+#include "qwen_vl_model.h"
 #include "../dml/interface.h"
 
 #if defined(_WIN32)
@@ -1193,6 +1194,8 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
 }
 
 std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
+  if (config->model.type == "qwen2_5_vl")
+    return std::make_shared<Qwen2_5_VL_PipelineModel>(std::move(config), ort_env);
   if (config->model.type == "gpt2")
     return std::make_shared<Gpt_Model>(std::move(config), ort_env);
   if (ModelType::IsLLM(config->model.type))
diff --git a/src/models/model_type.h b/src/models/model_type.h
index c7c4d2f691..8a71e0e105 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,7 +18,7 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 2> VLM = {"gemma3", "phi3v"};
+    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2_5_vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
new file mode 100644
index 0000000000..c2269036c3
--- /dev/null
+++ b/src/models/qwen_vl_model.cpp
@@ -0,0 +1,255 @@
+#include "qwen_vl_model.h"
+#include "model.h"
+#include "onnxruntime_api.h"
+#include "../logging.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <string>
+#include <vector>
+
+namespace Generators {
+
+namespace {
+
+// Routes vision-related diagnostics through the project logger instead of stdout, so the
+// library stays quiet unless the caller has enabled warning logging.
+void LogVisionWarning(const std::string& message) {
+  if (g_log.enabled && g_log.warning) {
+    Log("warning", "[GENAI VISION] " + message);
+  }
+}
+
+// Returns the number of elements described by `shape`, or 0 if any dimension is not positive.
+template <typename Shape>
+size_t ElementCount(const Shape& shape) {
+  size_t count = 1;
+  for (const int64_t dim : shape) {
+    if (dim <= 0) return 0;
+    count *= static_cast<size_t>(dim);
+  }
+  return count;
+}
+
+}  // namespace
+
+// Loads the decoder pipeline through the base class, then builds the vision pipeline when the
+// config provides the patch_embed, vision_attn and patch_merger stages plus window indexing.
+// When any of these is missing, vision_pipeline_ stays null and states skip the vision step.
+Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
+    : DecoderOnlyPipelineModel(std::move(config), ort_env) {
+  const auto& vision = config_->model.vision;
+  if (vision.pipeline.empty() || !vision.window_indexing.has_value()) {
+    LogVisionWarning("No vision pipeline config found");
+    return;
+  }
+
+  // Stage files are resolved relative to the config directory; stages are matched by model_id.
+  std::string patch_embed_path, vision_attn_path, patch_merger_path;
+  bool use_qnn_attn = false;
+  for (const auto& stage : vision.pipeline) {
+    std::string stage_path = (config_->config_path / fs::path(stage.filename)).string();
+    if (stage.model_id == "patch_embed") {
+      patch_embed_path = std::move(stage_path);
+    } else if (stage.model_id == "vision_attn") {
+      vision_attn_path = std::move(stage_path);
+      // For now, the run_on_cpu flag of the vision_attn stage decides QNN usage.
+      use_qnn_attn = use_qnn_attn || !stage.run_on_cpu;
+    } else if (stage.model_id == "patch_merger") {
+      patch_merger_path = std::move(stage_path);
+    }
+  }
+  if (patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty()) {
+    LogVisionWarning("Missing vision model paths!");
+    return;
+  }
+
+  std::string wnd_idx_path = (config_->config_path / fs::path(vision.window_indexing->filename)).string();
+  int spatial_merge = vision.window_indexing->spatial_merge_size;
+  vision_pipeline_ = std::make_unique<QwenVisionPipeline>(ort_env,
+                                                          patch_embed_path,
+                                                          vision_attn_path,
+                                                          patch_merger_path,
+                                                          spatial_merge,
+                                                          wnd_idx_path,
+                                                          use_qnn_attn);
+}
+
+std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
+                                                             const GeneratorParams& params) const {
+  return std::make_unique<Qwen2_5_VL_PipelineState>(*this, sequence_lengths, params);
+}
+
+Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+                                                   DeviceSpan<int32_t> sequence_lengths,
+                                                   const GeneratorParams& params)
+    : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} {
+}
+
+// Registers the extra inputs with the base class, then runs the vision pipeline once on the
+// pixel_values input (if present) and caches the resulting image features for injection.
+// Failures are logged and leave vision_ran_ false, so no injection happens for this state.
+void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {
+  // Let base register provided extra inputs first
+  DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
+
+  if (vision_ran_ || !vl_model_.vision_pipeline_) {
+    return;
+  }
+
+  // Find pixel_values input among the extra inputs passed to this function
+  const std::string& pixel_name = vl_model_.config_->model.vision.inputs.pixel_values;
+  const auto pixel_input = std::find_if(extra_inputs.begin(), extra_inputs.end(),
+                                        [&pixel_name](const auto& input) { return input.name == pixel_name; });
+  if (pixel_input == extra_inputs.end() || !pixel_input->tensor) {
+    return;
+  }
+  OrtValue* pixel_values_val = pixel_input->tensor->GetOrtTensor();
+  if (!pixel_values_val) {
+    return;
+  }
+
+  std::vector<int64_t> out_shape;
+  try {
+    auto pixel_info = pixel_values_val->GetTensorTypeAndShapeInfo();
+    if (pixel_info->GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+      LogVisionWarning("pixel_values must be a float32 tensor; skipping vision pipeline");
+      return;
+    }
+    const auto pixel_shape = pixel_info->GetShape();
+    std::vector<int64_t> pixel_shape_vec(pixel_shape.begin(), pixel_shape.end());
+    const float* pixel_data = pixel_values_val->GetTensorData<float>();
+    if (!pixel_data) {
+      return;
+    }
+
+    image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec);
+    const auto last_shape = vl_model_.vision_pipeline_->GetLastOutputShape();  // [seq_len, hidden]
+    out_shape.assign(last_shape.begin(), last_shape.end());
+  } catch (const std::exception& ex) {
+    LogVisionWarning(std::string("Vision pipeline run failed: ") + ex.what());
+    return;
+  }
+
+  // The OrtValue below aliases image_features_buffer_, so the advertised shape must fit inside it.
+  const size_t count = ElementCount(out_shape);
+  if (out_shape.size() != 2 || count == 0 || count > image_features_buffer_.size()) {
+    LogVisionWarning("Vision pipeline produced an unexpected output shape; skipping image features");
+    return;
+  }
+
+  auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+  // API expects spans (data, shape)
+  std::span<float> data_span(image_features_buffer_.data(), count);
+  std::span<const int64_t> shape_span(out_shape.data(), out_shape.size());
+  image_features_value_ = OrtValue::CreateTensor<float>(*mem_info, data_span, shape_span);
+
+  vision_ran_ = true;
+}
+
+// Hook invoked after each decoder pipeline stage. Once the embeddings stage (stage 0) has run
+// and image features are available, overwrites the image-token rows of its first output.
+void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
+  if (stage_id != 0 || !vision_ran_) {
+    return;
+  }
+  const auto& pipeline = vl_model_.config_->model.decoder.pipeline;
+  if (pipeline.empty() || pipeline.front().outputs.empty()) {
+    return;
+  }
+  InjectVisionEmbeddings(pipeline.front().outputs.front(), next_tokens);
+}
+
+// Replaces the embedding row of every image token in the current input_ids with the next unused
+// vision embedding row. image_embed_consumed_ persists across calls, so image tokens split over
+// several prompt chunks continue from where the previous chunk stopped.
+// NOTE(review): reads and writes the tensors through host pointers, which presumably requires
+// the embeddings output and input_ids to live in CPU-addressable memory - confirm for device EPs.
+void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
+                                                     DeviceSpan<int32_t>& /*input_token_ids*/) {
+  if (!image_features_value_) {
+    return;
+  }
+
+  // Get embeddings output from ortvalue_store_
+  const auto stored = ortvalue_store_.find(embeddings_output_name);
+  if (stored == ortvalue_store_.end() || !stored->second) {
+    return;
+  }
+  OrtValue* embeddings_ortvalue = stored->second.get();
+
+  auto embeddings_info = embeddings_ortvalue->GetTensorTypeAndShapeInfo();
+  const auto shape = embeddings_info->GetShape();
+  if (embeddings_info->GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT || shape.size() < 2) {
+    LogVisionWarning("Embeddings output is not a float32 tensor of rank >= 2; skipping injection");
+    return;
+  }
+  // Treat the tensor as [rows, embedding_dim]; with batch size 1 a row index equals a token index.
+  const size_t embedding_elems = ElementCount(shape);
+  if (embedding_elems == 0) {
+    return;
+  }
+  const size_t embedding_dim = static_cast<size_t>(shape.back());
+  const size_t embedding_rows = embedding_elems / embedding_dim;
+
+  // Vision features are [num_vision_tokens, hidden] and must match the text embedding width.
+  const auto vision_shape = image_features_value_->GetTensorTypeAndShapeInfo()->GetShape();
+  if (vision_shape.size() != 2 || vision_shape[0] < 0 || vision_shape[1] != shape.back()) {
+    LogVisionWarning("Vision feature width does not match the embedding width; skipping injection");
+    return;
+  }
+  const size_t num_vision_tokens = static_cast<size_t>(vision_shape[0]);
+
+  if (!input_ids_ || !input_ids_->Get()) {
+    return;
+  }
+  OrtValue* input_ids_ortvalue = input_ids_->Get();
+  auto input_ids_info = input_ids_ortvalue->GetTensorTypeAndShapeInfo();
+  const size_t total_tokens = ElementCount(input_ids_info->GetShape());
+
+  // Accept int32 and int64 token ids instead of reinterpreting the buffer blindly.
+  const int32_t* ids32 = nullptr;
+  const int64_t* ids64 = nullptr;
+  switch (input_ids_info->GetElementType()) {
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
+      ids32 = input_ids_ortvalue->GetTensorData<int32_t>();
+      break;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
+      ids64 = input_ids_ortvalue->GetTensorData<int64_t>();
+      break;
+    default:
+      LogVisionWarning("input_ids has an unsupported element type; skipping injection");
+      return;
+  }
+
+  const int64_t image_token_id = vl_model_.config_->model.image_token_id;
+  const float* vision_data = image_features_value_->GetTensorData<float>();
+  float* embeddings_data = embeddings_ortvalue->GetTensorMutableData<float>();
+  const size_t row_bytes = embedding_dim * sizeof(float);
+  // Never index past the embeddings tensor even if it has fewer rows than input_ids has tokens.
+  const size_t scan_count = std::min(total_tokens, embedding_rows);
+
+  size_t unmatched_image_tokens = 0;
+  for (size_t i = 0; i < scan_count; ++i) {
+    const int64_t token_id = ids32 ? static_cast<int64_t>(ids32[i]) : ids64[i];
+    if (token_id != image_token_id) {
+      continue;
+    }
+    if (image_embed_consumed_ >= num_vision_tokens) {
+      ++unmatched_image_tokens;
+      continue;
+    }
+    std::memcpy(embeddings_data + i * embedding_dim,
+                vision_data + image_embed_consumed_ * embedding_dim,
+                row_bytes);
+    ++image_embed_consumed_;
+  }
+
+  if (unmatched_image_tokens > 0) {
+    LogVisionWarning("More image tokens than vision embeddings! " + std::to_string(unmatched_image_tokens) +
+                     " image token(s) kept their text embedding");
+  }
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen_vl_model.h b/src/models/qwen_vl_model.h
new file mode 100644
index 0000000000..27f12649c3
--- /dev/null
+++ b/src/models/qwen_vl_model.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "decoder_only_pipeline.h"
+#include "qwen_vl_vision.h"
+
+namespace Generators {
+
+// Qwen2.5-VL pipeline model: the decoder pipeline (sessions loaded by the base class) plus an
+// optional QwenVisionPipeline built from the vision stages listed in the config.
+// The matching state runs the vision pipeline on the first SetExtraInputs call that carries
+// pixel_values and injects the resulting features into the embeddings output in OnStageComplete.
+struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
+  Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
+
+  std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths,
+                                     const GeneratorParams& params) const override;
+
+  // Vision pipeline shared by all states; left null when the config lacks a complete vision pipeline.
+  std::unique_ptr<QwenVisionPipeline> vision_pipeline_;
+};
+
+struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
+  Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+                           DeviceSpan<int32_t> sequence_lengths,
+                           const GeneratorParams& params);
+  // Forwards to the base class, then runs the vision pipeline when a pixel_values input is present.
+  void SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) override;
+
+ protected:
+  void OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) override;  // no-op unless stage 0 and vision ran
+
+ private:
+  void InjectVisionEmbeddings(const std::string& embeddings_output_name,
+                             DeviceSpan<int32_t>& input_token_ids);
+  // Per-state vision data.
+  const Qwen2_5_VL_PipelineModel& vl_model_;
+  bool vision_ran_{false};  // once true, SetExtraInputs no longer runs the vision pipeline
+  std::unique_ptr<OrtValue> image_features_value_;  // tensor created over image_features_buffer_
+  std::vector<float> image_features_buffer_; // backing storage for OrtValue
+  size_t image_embed_consumed_{0}; // Track how many vision embeddings we've injected
+};
+
+} // namespace Generators
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
new file mode 100644
index 0000000000..b1c6968a03
--- /dev/null
+++ b/src/models/qwen_vl_vision.cpp
@@ -0,0 +1,303 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage.
+
+#include "qwen_vl_vision.h"
+
+#include <fstream>
+#include <stdexcept>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+
+namespace Generators {
+
+// Minimal .npy reader for 1D integer arrays.
+// Handles C-order, little-endian data of shape (N,) with N > 0 and dtype '<i4' or '<i8', stored in npy
+// format 1.x (2-byte header length) or 2.x/3.x (4-byte header length). Element data is read in host
+// byte order, so a little-endian host is assumed. Throws std::runtime_error on malformed input.
+std::vector<int64_t> Load1DNpyIndices(const std::string& file_path) {
+  std::ifstream fin(file_path, std::ios::binary);
+  if (!fin) throw std::runtime_error("Failed to open npy file: " + file_path);
+
+  // Preamble: 6 magic bytes followed by the major and minor format version.
+  unsigned char preamble[8] = {};
+  fin.read(reinterpret_cast<char*>(preamble), sizeof(preamble));
+  if (!fin || std::memcmp(preamble, "\x93NUMPY", 6) != 0) {
+    throw std::runtime_error("Invalid npy header (magic mismatch) for: " + file_path);
+  }
+  const unsigned ver_major = preamble[6];
+  if (ver_major < 1 || ver_major > 3) throw std::runtime_error("Unsupported npy version in: " + file_path);
+  // The header length is little-endian; assemble it bytewise so host endianness does not matter.
+  const size_t len_bytes = (ver_major == 1) ? 2 : 4;
+  unsigned char len_buf[4] = {};
+  fin.read(reinterpret_cast<char*>(len_buf), static_cast<std::streamsize>(len_bytes));
+  size_t header_len = 0;
+  for (size_t i = len_bytes; i-- > 0;) header_len = (header_len << 8) | len_buf[i];
+  std::string header(header_len, '\0');
+  fin.read(header.data(), static_cast<std::streamsize>(header_len));
+  if (!fin) throw std::runtime_error("Truncated npy header in: " + file_path);
+
+  auto descr_pos = header.find("'descr':");
+  if (descr_pos == std::string::npos) throw std::runtime_error("Missing 'descr' in npy header");
+  auto descr_start = header.find('\'', descr_pos + 8);
+  auto descr_end = (descr_start == std::string::npos) ? descr_start : header.find('\'', descr_start + 1);
+  if (descr_end == std::string::npos) throw std::runtime_error("Malformed 'descr' in npy header");
+  const std::string dtype = header.substr(descr_start + 1, descr_end - descr_start - 1);
+  const bool is_int32 = (dtype == "<i4");
+  const bool is_int64 = (dtype == "<i8");
+  if (!is_int32 && !is_int64) throw std::runtime_error("Unsupported dtype in npy (expected <i4 or <i8): " + dtype);
+
+  auto shape_pos = header.find("'shape':");
+  if (shape_pos == std::string::npos) throw std::runtime_error("Missing 'shape' in npy header");
+  auto paren_start = header.find('(', shape_pos);
+  auto paren_end = (paren_start == std::string::npos) ? paren_start : header.find(')', paren_start);
+  if (paren_end == std::string::npos) throw std::runtime_error("Malformed 'shape' in npy header");
+  std::string shape_str = header.substr(paren_start + 1, paren_end - paren_start - 1);
+  // shape like "1234," or "1234" depending on version
+  shape_str.erase(std::remove(shape_str.begin(), shape_str.end(), ' '), shape_str.end());
+  if (!shape_str.empty() && shape_str.back() == ',') shape_str.pop_back();
+  // Anything other than a short run of digits (e.g. "3,4" for a 2-D array) is not a usable 1-D shape.
+  if (shape_str.empty() || shape_str.size() > 18 || shape_str.find_first_not_of("0123456789") != std::string::npos) {
+    throw std::runtime_error("Expected a 1-D shape in npy header, got: (" + shape_str + ")");
+  }
+  int64_t N = std::stoll(shape_str);
+  if (N <= 0) throw std::runtime_error("Invalid shape size in npy header");
+
+  std::vector<int64_t> result(static_cast<size_t>(N));
+  if (is_int32) {
+    std::vector<int32_t> tmp(static_cast<size_t>(N));
+    fin.read(reinterpret_cast<char*>(tmp.data()), static_cast<std::streamsize>(tmp.size() * sizeof(int32_t)));
+    if (fin.gcount() != static_cast<std::streamsize>(tmp.size() * sizeof(int32_t))) throw std::runtime_error("Unexpected EOF reading npy data");
+    std::copy(tmp.begin(), tmp.end(), result.begin());  // widen int32 -> int64
+  } else {
+    fin.read(reinterpret_cast<char*>(result.data()), static_cast<std::streamsize>(result.size() * sizeof(int64_t)));
+    if (fin.gcount() != static_cast<std::streamsize>(result.size() * sizeof(int64_t))) throw std::runtime_error("Unexpected EOF reading npy data");
+  }
+  return result;
+}
+
+QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
+                                       const std::string& patch_embed_model,
+                                       const std::string& vision_attn_model,
+                                       const std::string& patch_merger_model,
+                                       int64_t spatial_merge_size,
+                                       const std::string& wnd_idx_path,
+                                       bool use_qnn_attn,
+                                       const std::string& qnn_backend_path)
+  // Initializer order mirrors the member declaration order (MSVC C5038 is treated as an error)
+  : use_qnn_attn_(use_qnn_attn),
+    qnn_backend_path_(qnn_backend_path),
+    spatial_merge_size_(spatial_merge_size),
+    env_(env) {
+
+  // ORT wants ORTCHAR_T paths (char or wchar_t); each input character is converted individually.
+  using OrtPath = std::basic_string<ORTCHAR_T>;
+  const OrtPath embed_file(patch_embed_model.begin(), patch_embed_model.end());
+  const OrtPath attn_file(vision_attn_model.begin(), vision_attn_model.end());
+  const OrtPath merger_file(patch_merger_model.begin(), patch_merger_model.end());
+
+  // Patch embed and patch merger sessions are created without custom session options.
+  patch_embed_session_ = OrtSession::Create(env_, embed_file.c_str(), nullptr);
+  patch_merger_session_ = OrtSession::Create(env_, merger_file.c_str(), nullptr);
+
+  if (!use_qnn_attn_) {
+    vision_attn_session_ = OrtSession::Create(env_, attn_file.c_str(), nullptr);
+  } else {
+    // The QNN provider has to be among the providers this ORT build reports as available.
+    const std::string qnn_name("QNNExecutionProvider");
+    const auto available = Ort::GetAvailableProviders();
+    const bool qnn_present = std::any_of(available.begin(), available.end(), [&](const auto& name) { return name == qnn_name; });
+    if (!qnn_present) {
+      throw std::runtime_error("QNNExecutionProvider requested for vision attention but not available in this build");
+    }
+    auto session_options = OrtSessionOptions::Create();
+    session_options->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
+    // QNN provider options, passed as parallel key/value arrays
+    const char* option_keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
+    const char* option_values[] = { qnn_backend_path_.c_str(), "burst", "3", "60" };
+    session_options->AppendExecutionProvider("QNNExecutionProvider", option_keys, option_values, 4);
+    vision_attn_session_ = OrtSession::Create(env_, attn_file.c_str(), session_options.get());
+  }
+
+  wnd_idx_ = Load1DNpyIndices(wnd_idx_path);
+  // rev_idx_ = argsort(wnd_idx_): order (value, position) pairs by value and keep the positions.
+  std::vector<std::pair<int64_t, size_t>> by_value;
+  by_value.reserve(wnd_idx_.size());
+  for (size_t pos = 0; pos < wnd_idx_.size(); ++pos) by_value.emplace_back(wnd_idx_[pos], pos);
+  std::sort(by_value.begin(), by_value.end(), [](auto& a, auto& b){ return a.first < b.first; });
+  rev_idx_.resize(by_value.size());
+  std::transform(by_value.begin(), by_value.end(), rev_idx_.begin(),
+                 [](const auto& entry) { return static_cast<int64_t>(entry.second); });
+}
+
+std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
+  // Builds a CPU float tensor over the caller's buffer; const is cast away only to form the mutable span.
+  const auto cpu_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+  const std::span<float> values(const_cast<float*>(data), count);
+  return OrtValue::CreateTensor<float>(*cpu_info, values, std::span<const int64_t>(shape.data(), shape.size()));
+}
+
+// Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
+
+// Runs the three vision models on one image and returns the merged image embeddings.
+//
+// Stages:
+//   1. patch embed      : 'pixel_values' -> [num_patches, 1280]
+//   2. window reorder   : blocks of spatial_merge_size^2 rows; output block i = input block wnd_idx_[i]
+//   3. vision attention : 'hidden' -> [num_patches, 1280]
+//   4. patch merger     : 'hidden' -> [num_patches / spatial_merge_size^2, 3584]
+//   5. reverse reorder  : merged rows permuted with rev_idx_ (argsort of wnd_idx_)
+//
+// pixel_data  : float32 values bound as the 'pixel_values' input; must stay valid for the whole call.
+// pixel_shape : shape of pixel_data; pixel_shape[1] is taken as num_patches.
+//               The batch dimension is not inspected; the pre-bound output shapes assume batch 1.
+// Returns the embeddings in row-major order and records their shape for GetLastOutputShape()
+// (shape [num_patches / spatial_merge_size^2, 3584]).
+// Throws std::runtime_error for uninitialized sessions, an unusable input shape or inconsistent
+// window indices. The hidden sizes 1280 and 3584 are hard-coded for Qwen2.5-VL.
+//
+// NOTE(review): this writes last_seq_len_/last_hidden_size_ on a pipeline object that the model
+// shares between states, so concurrent calls would race - confirm that callers serialize Run.
+std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
+  if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
+    throw std::runtime_error("Vision pipeline sessions not initialized");
+  }
+  // pixel_shape[1] is read below as the patch count, so at least two dimensions are required.
+  if (pixel_data == nullptr || pixel_shape.size() < 2 || pixel_shape[1] <= 0) {
+    throw std::runtime_error("Vision pipeline expects pixel_values shaped [1, num_patches, patch_dim]");
+  }
+
+  // ---- Stage 1: patch embed ----
+  // Create input tensor for patch embed
+  size_t pixel_count = 1;
+  for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
+
+  auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
+
+  const char* pe_input_names[] = {"pixel_values"};
+  OrtValue* pe_inputs[] = { pixel_tensor.get() };
+
+  // The output buffer is allocated here and bound to the session, so its shape is computed up front.
+  // Input: [batch=1, num_patches, patch_dim]
+  // Output: [num_patches, hidden_dim=1280]
+  int64_t num_patches = pixel_shape[1];
+  int64_t hidden_dim = 1280;  // Qwen2.5-VL hidden dimension
+  std::vector<int64_t> pe_out_shape_vec{num_patches, hidden_dim};
+  size_t pe_out_count = static_cast<size_t>(num_patches * hidden_dim);
+
+  std::vector<float> pe_out_buf(pe_out_count);
+  auto pe_out_tensor = CreateTensor(pe_out_buf.data(), pe_out_count, pe_out_shape_vec);
+
+  // Prepare output name
+  auto pe_out_name_str = patch_embed_session_->GetOutputName(0);
+  const char* pe_output_names[] = { pe_out_name_str.c_str() };
+  OrtValue* pe_outputs[] = { pe_out_tensor.get() };
+
+  patch_embed_session_->Run(nullptr, pe_input_names, pe_inputs, 1, pe_output_names, pe_outputs, 1);
+
+  // ---- Stage 2: window reorder ----
+  // hidden now in pe_out_buf with shape [seq_len, hidden_size]
+  int64_t seq_len = pe_out_shape_vec[0];
+  int64_t hidden_size = pe_out_shape_vec[1];
+  int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
+  if (window_area <= 0) {
+    throw std::runtime_error("spatial_merge_size must be non-zero in vision pipeline");
+  }
+  if (seq_len % window_area != 0) {
+    throw std::runtime_error("Sequence length not divisible by spatial_merge_size^2 in vision pipeline");
+  }
+  int64_t num_windows = seq_len / window_area;
+  // Reshape logically: [num_windows, window_area, hidden_size] then reorder by wnd_idx_
+  if (static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
+    throw std::runtime_error("wnd_idx size does not match number of windows");
+  }
+
+  // Temporary buffer for reordered hidden
+  std::vector<float> reordered(seq_len * hidden_size);
+  // wnd_idx_ has to be a permutation; a repeated index would drop a window and break the reverse step.
+  std::vector<bool> window_used(static_cast<size_t>(num_windows), false);
+  // For each window index w: copy its window_area * hidden_size block in order
+  for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+    int64_t src_w = wnd_idx_[dst_w];
+    if (src_w < 0 || src_w >= num_windows) {
+      throw std::runtime_error("wnd_idx value out of range");
+    }
+    if (window_used[static_cast<size_t>(src_w)]) {
+      throw std::runtime_error("wnd_idx contains a duplicate window index");
+    }
+    window_used[static_cast<size_t>(src_w)] = true;
+    // source offset in original flattened: src_w * window_area * hidden_size
+    size_t src_offset = static_cast<size_t>(src_w) * static_cast<size_t>(window_area) * static_cast<size_t>(hidden_size);
+    size_t dst_offset = static_cast<size_t>(dst_w) * static_cast<size_t>(window_area) * static_cast<size_t>(hidden_size);
+    std::memcpy(reordered.data() + dst_offset, pe_out_buf.data() + src_offset,
+                window_area * static_cast<size_t>(hidden_size) * sizeof(float));
+  }
+
+  // ---- Stage 3: vision attention ----
+  // Flatten reordered is still [seq_len, hidden_size]
+  std::vector<int64_t> attn_in_shape{seq_len, hidden_size};
+  auto attn_in_tensor = CreateTensor(reordered.data(), reordered.size(), attn_in_shape);
+  const char* attn_input_names[] = {"hidden"};
+  OrtValue* attn_inputs[] = { attn_in_tensor.get() };
+
+  // Prepare attention output - shape should be same as input
+  std::vector<int64_t> attn_out_shape_vec{seq_len, hidden_size};
+  size_t attn_out_count = static_cast<size_t>(seq_len * hidden_size);
+  std::vector<float> attn_out_buf(attn_out_count);
+  auto attn_out_tensor = CreateTensor(attn_out_buf.data(), attn_out_count, attn_out_shape_vec);
+  auto attn_out_name_str = vision_attn_session_->GetOutputName(0);
+  const char* attn_output_names[] = { attn_out_name_str.c_str() };
+  OrtValue* attn_outputs[] = { attn_out_tensor.get() };
+
+  vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
+
+  // ---- Stage 4: patch merger ----
+  // Merger input (attention output)
+  auto merger_in_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_out_shape_vec);
+  const char* merger_input_names[] = {"hidden"};
+  OrtValue* merger_inputs[] = { merger_in_tensor.get() };
+
+  // Patch merger output shape: [seq_len / spatial_merge_size^2, 3584]
+  // The merger reduces spatial dimensions and projects to final vision hidden size
+  int64_t merged_seq_len = seq_len / (spatial_merge_size_ * spatial_merge_size_);
+  int64_t merged_hidden_size = 3584;  // Qwen2.5-VL final vision embedding dimension
+  std::vector<int64_t> merger_out_shape_vec{merged_seq_len, merged_hidden_size};
+  size_t merger_out_count = static_cast<size_t>(merged_seq_len * merged_hidden_size);
+  std::vector<float> merger_out_buf(merger_out_count);
+  auto merger_out_tensor = CreateTensor(merger_out_buf.data(), merger_out_count, merger_out_shape_vec);
+  auto merger_out_name_str = patch_merger_session_->GetOutputName(0);
+  const char* merger_output_names[] = { merger_out_name_str.c_str() };
+  OrtValue* merger_outputs[] = { merger_out_tensor.get() };
+
+  patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1);
+
+  // ---- Stage 5: reverse reorder ----
+  // rev_idx_ is the argsort of wnd_idx_. The merger emits one row per window, so the rows are
+  // permuted at window granularity and merged_seq_len equals num_windows.
+  int64_t final_seq_len = merged_seq_len;
+  int64_t final_hidden = merged_hidden_size;
+  if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
+    // Each window maps back; reorder at window granularity.
+    throw std::runtime_error("rev_idx size does not match number of windows");
+  }
+
+  // Apply reverse indexing at merged window granularity
+  // After merging, we have merged_seq_len tokens, one per original window
+  std::vector<float> final_embeddings(merger_out_buf.size());
+  for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+    int64_t src_w = rev_idx_[dst_w];
+    // Each "window" in merged output is now just 1 token with merged_hidden_size features
+    size_t src_offset = static_cast<size_t>(src_w) * static_cast<size_t>(final_hidden);
+    size_t dst_offset = static_cast<size_t>(dst_w) * static_cast<size_t>(final_hidden);
+    std::memcpy(final_embeddings.data() + dst_offset, merger_out_buf.data() + src_offset,
+                static_cast<size_t>(final_hidden) * sizeof(float));
+  }
+
+  // Save final shape
+  last_seq_len_ = final_seq_len;
+  last_hidden_size_ = final_hidden;
+  return final_embeddings; // shape: [final_seq_len, final_hidden]
+}
+
+} // namespace Generators
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
new file mode 100644
index 0000000000..64d2171096
--- /dev/null
+++ b/src/models/qwen_vl_vision.h
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// Qwen VL Vision pipeline support (initial skeleton).
+// Executes three ONNX models in sequence:
+//   1) Patch Embedding  : pixel_values -> hidden
+//   2) Vision Attention : hidden -> hidden
+//   3) Patch Merger      : hidden -> merged embeddings
+// Performs window expansion/reordering using wnd_idx, then final reverse ordering.
+//
+// This is a minimal starting point to integrate Qwen2.5-VL vision processing
+// into onnxruntime-genai. Further work will: (a) connect to Config parsing,
+// (b) expose via MultiModal pipeline, (c) add EP selection, (d) reuse buffers.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cstdint>
+
+#include "onnxruntime_api.h"
+
+namespace Generators {
+
+// Simple loader for a 1D numpy .npy file containing integer indices.
+// Supports little-endian int32/int64 arrays of shape (N,).
+std::vector<int64_t> Load1DNpyIndices(const std::string& file_path);
+
+// Internal vision pipeline (no external DLL interface required after Python binding removal).
+struct QwenVisionPipeline {
+  QwenVisionPipeline(OrtEnv& env,
+                     const std::string& patch_embed_model,
+                     const std::string& vision_attn_model,
+                     const std::string& patch_merger_model,
+                     int64_t spatial_merge_size,
+                     const std::string& wnd_idx_path,
+                     bool use_qnn_attn = false,
+                     const std::string& qnn_backend_path = "QnnHtp.dll");
+  bool use_qnn_attn_{};             // true: the vision attention session is created with QNNExecutionProvider
+  std::string qnn_backend_path_{};  // handed to QNN as its "backend_path" provider option
+
+  QwenVisionPipeline(const QwenVisionPipeline&) = delete;
+  QwenVisionPipeline& operator=(const QwenVisionPipeline&) = delete;
+
+  // Run vision pipeline (patch embed -> window reorder -> attention -> merger -> reverse reorder).
+  // pixel_data/pixel_shape: float32 data fed unchanged as the 'pixel_values' input. The implementation
+  // reads pixel_shape[1] as the patch count, i.e. it expects [1, num_patches, patch_dim].
+  // Returns final merged embeddings, row-major [num_patches / spatial_merge_size^2, hidden_size].
+  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape);
+
+  // Shape info from last Run (seq_len, hidden_size). Returns empty vector if Run not called yet.
+  std::vector<int64_t> GetLastOutputShape() const {
+    if (last_seq_len_ <= 0 || last_hidden_size_ <= 0) return {};
+    return {last_seq_len_, last_hidden_size_};
+  }
+
+  // Accessors
+  const std::vector<int64_t>& GetWndIdx() const { return wnd_idx_; }
+  int64_t GetSpatialMergeSize() const { return spatial_merge_size_; }
+
+ private:
+  // Builds a CPU float tensor over caller-provided data with the given shape (count = element count).
+  std::unique_ptr<OrtValue> CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const;
+
+  std::unique_ptr<OrtSession> patch_embed_session_;
+  std::unique_ptr<OrtSession> vision_attn_session_;
+  std::unique_ptr<OrtSession> patch_merger_session_;
+
+  std::vector<int64_t> wnd_idx_;  // window reordering indices
+  std::vector<int64_t> rev_idx_;  // reverse ordering indices (argsort of wnd_idx)
+  int64_t spatial_merge_size_{};  // windows hold spatial_merge_size_^2 patches
+  OrtEnv& env_;                   // used to create the sessions; not owned
+  int64_t last_seq_len_{0};       // row count of the last Run() result (0 before the first Run)
+  int64_t last_hidden_size_{0};   // column count of the last Run() result (0 before the first Run)
+};
+
+} // namespace Generators

From 0baab8d6950432fc72dfe8f3c0725b100d967e5d Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 13:20:50 -0800
Subject: [PATCH 02/25] Cleanup

---
 src/models/qwen_vl_model.cpp  | 292 +++++++---------------------------
 src/models/qwen_vl_vision.cpp | 154 +++++-------------
 2 files changed, 100 insertions(+), 346 deletions(-)

diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index c2269036c3..673a484171 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -9,38 +9,37 @@ namespace Generators {
 
 Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
   : DecoderOnlyPipelineModel(std::move(config), ort_env) {  
-  // Build vision pipeline if config provides vision pipeline stages
-  if (!config_->model.vision.pipeline.empty() && config_->model.vision.window_indexing.has_value()) {
-    // Expect identifiers patch_embed, vision_attn, patch_merger
-    std::string patch_embed_path, vision_attn_path, patch_merger_path;
+  if (config_->model.vision.pipeline.empty() || !config_->model.vision.window_indexing.has_value()) return;
+
+  // Find vision pipeline stage paths
+  auto find_stage = [&](const std::string& id) -> std::string {
     for (const auto& stage : config_->model.vision.pipeline) {
-      if (stage.model_id == "patch_embed") patch_embed_path = (config_->config_path / fs::path(stage.filename)).string();
-      else if (stage.model_id == "vision_attn") vision_attn_path = (config_->config_path / fs::path(stage.filename)).string();
-      else if (stage.model_id == "patch_merger") patch_merger_path = (config_->config_path / fs::path(stage.filename)).string();
+      if (stage.model_id == id) return (config_->config_path / fs::path(stage.filename)).string();
     }
-    if (!patch_embed_path.empty() && !vision_attn_path.empty() && !patch_merger_path.empty()) {
-      auto wnd_idx_path = (config_->config_path / fs::path(config_->model.vision.window_indexing->filename)).string();
-      int spatial_merge = config_->model.vision.window_indexing->spatial_merge_size;
-      // For now, rely on run_on_cpu flag of vision_attn stage to decide QNN usage
-      bool use_qnn_attn = false;
-      for (const auto& stage : config_->model.vision.pipeline) {
-        if (stage.model_id == "vision_attn" && !stage.run_on_cpu) {
-          use_qnn_attn = true; break;
-        }
-      }      
-      vision_pipeline_ = std::make_unique<QwenVisionPipeline>(ort_env,
-                                                              patch_embed_path,
-                                                              vision_attn_path,
-                                                              patch_merger_path,
-                                                              spatial_merge,
-                                                              wnd_idx_path,
-                                                              use_qnn_attn);
-    } else {
-      std::cout << "[GENAI VISION] WARNING: Missing vision model paths!" << std::endl;
+    return "";
+  };
+  
+  auto patch_embed_path = find_stage("patch_embed");
+  auto vision_attn_path = find_stage("vision_attn");
+  auto patch_merger_path = find_stage("patch_merger");
+  
+  if (patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty()) return;
+
+  // Check if QNN should be used for vision attention
+  bool use_qnn_attn = false;
+  for (const auto& stage : config_->model.vision.pipeline) {
+    if (stage.model_id == "vision_attn" && !stage.run_on_cpu) {
+      use_qnn_attn = true;
+      break;
     }
-  } else {
-    std::cout << "[GENAI VISION] No vision pipeline config found" << std::endl;
   }
+
+  auto wnd_idx_path = (config_->config_path / fs::path(config_->model.vision.window_indexing->filename)).string();
+  int spatial_merge = config_->model.vision.window_indexing->spatial_merge_size;
+  
+  vision_pipeline_ = std::make_unique<QwenVisionPipeline>(
+    ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
+    spatial_merge, wnd_idx_path, use_qnn_attn);
 }
 
 std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
@@ -55,81 +54,37 @@ Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineMode
 }
 
 void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
-  // Let base register provided extra inputs first
   DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
   
-  if (vision_ran_) {
-    return;
-  }
-  
-  if (!vl_model_.vision_pipeline_) {
-    return;
-  }
+  if (vision_ran_ || !vl_model_.vision_pipeline_) return;
 
-  // Find pixel_values input among the extra inputs passed to this function
   OrtValue* pixel_values_val = nullptr;
-  const std::string pixel_name = vl_model_.config_->model.vision.inputs.pixel_values;
+  const auto& pixel_name = vl_model_.config_->model.vision.inputs.pixel_values;
   
-  for (size_t i = 0; i < extra_inputs.size(); ++i) {
-    if (extra_inputs[i].name == pixel_name) {
-      pixel_values_val = extra_inputs[i].tensor->GetOrtTensor();
+  for (const auto& input : extra_inputs) {
+    if (input.name == pixel_name) {
+      pixel_values_val = input.tensor->GetOrtTensor();
       break;
     }
   }
-  
-  if (!pixel_values_val) {
-    return;
-  }
+  if (!pixel_values_val) return;
 
-  auto pixel_info = pixel_values_val->GetTensorTypeAndShapeInfo();
-  auto pixel_shape = pixel_info->GetShape();
+  auto pixel_shape = pixel_values_val->GetTensorTypeAndShapeInfo()->GetShape();
   std::vector<int64_t> pixel_shape_vec(pixel_shape.begin(), pixel_shape.end());
-  
-  size_t pixel_count = 1;
-  for (auto d : pixel_shape_vec) pixel_count *= static_cast<size_t>(d);
-  // Get pointer - but don't access it yet to avoid crash
-  const float* pixel_data = nullptr;
-  try {
-    pixel_data = pixel_values_val->GetTensorMutableData<float>();
-  } catch (const std::exception& ex) {
-    std::cout << "[GENAI VISION] ERROR: Failed to get pixel data pointer: " << ex.what() << std::endl;
-    return;
-  }
-  
-  if (!pixel_data) {
-    return;
-  }
+  const float* pixel_data = pixel_values_val->GetTensorMutableData<float>();
+  if (!pixel_data) return;
 
-  // Run vision pipeline
   try {
     image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec);
-  } catch (const std::exception& ex) {
-    std::cout << "[GENAI VISION] ERROR: Vision pipeline run failed: " << ex.what() << std::endl;
-    return;
+  } catch (const std::exception&) {
+    return;  // NOTE(review): the exception is swallowed here without any logging; image features stay unset
   }
 
-  auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape(); // [seq_len, hidden]
-  if (out_shape.size() != 2) {
-    return;
-  }
+  auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape();
+  if (out_shape.size() != 2) return;
   
-  // Debug: Log vision embeddings statistics
-  float min_feat = image_features_buffer_[0], max_feat = image_features_buffer_[0], sum_feat = 0.0f;
-  for (const auto& val : image_features_buffer_) {
-    min_feat = std::min(min_feat, val);
-    max_feat = std::max(max_feat, val);
-    sum_feat += val;
-  }
-  float mean_feat = sum_feat / image_features_buffer_.size();
-  float sum_sq_diff = 0.0f;
-  for (const auto& val : image_features_buffer_) {
-    float diff = val - mean_feat;
-    sum_sq_diff += diff * diff;
-  }
-  size_t count = static_cast<size_t>(out_shape[0]) * static_cast<size_t>(out_shape[1]);
   auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
-  // API expects spans (data, shape)
-  std::span<float> data_span(image_features_buffer_.data(), count);
+  std::span<float> data_span(image_features_buffer_.data(), image_features_buffer_.size());
   std::span<const int64_t> shape_span(out_shape.data(), out_shape.size());
   image_features_value_ = OrtValue::CreateTensor<float>(*mem_info, data_span, shape_span);
 
@@ -137,177 +92,50 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
 }
 
 void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
-  // After embeddings stage (stage 0) completes, inject vision embeddings at image token positions
-  if (stage_id == 0 && vision_ran_) {    
-    // Find embeddings output name from config
-    const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0];
-    if (!embeddings_config.outputs.empty()) {
-      const std::string& embeddings_output_name = embeddings_config.outputs[0];      
-      InjectVisionEmbeddings(embeddings_output_name, next_tokens);
-    }
+  if (stage_id != 0 || !vision_ran_) return;
+  
+  const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0];
+  if (!embeddings_config.outputs.empty()) {
+    InjectVisionEmbeddings(embeddings_config.outputs[0], next_tokens);
   }
 }
 
 void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
-                                                     DeviceSpan<int32_t>& input_token_ids) {  
-  // Get image_token_id from config
-  const int32_t image_token_id = vl_model_.config_->model.image_token_id;
-  
-  // Get embeddings output from ortvalue_store_
+                                                     DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
-  if (it == ortvalue_store_.end()) {
-    return;
-  }
+  if (it == ortvalue_store_.end() || !it->second) return;
   
   OrtValue* embeddings_ortvalue = it->second.get();
-  if (!embeddings_ortvalue) {
-    return;
-  }
-  
-  //Get tensor info
-  auto type_info = embeddings_ortvalue->GetTensorTypeAndShapeInfo();
-  auto shape = type_info->GetShape();
+  auto shape = embeddings_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   float* embeddings_data = embeddings_ortvalue->GetTensorMutableData<float>();
   
-  const int64_t embedding_dim = shape[2];
-  
-  // Get vision embeddings info
-  auto vision_type_info = image_features_value_->GetTensorTypeAndShapeInfo();
-  auto vision_shape = vision_type_info->GetShape();
+  auto vision_shape = image_features_value_->GetTensorTypeAndShapeInfo()->GetShape();
   const float* vision_data = image_features_value_->GetTensorData<float>();
   
+  const int64_t embedding_dim = shape[2];
   const int64_t num_vision_tokens = vision_shape[0];
   const int64_t vision_dim = vision_shape[1];
+  if (vision_dim != embedding_dim) return;
   
-  if (vision_dim != embedding_dim) {
-    return;
-  }
+  const int32_t image_token_id = vl_model_.config_->model.image_token_id;
   
-  // Get input_ids from the base class member
-  if (!input_ids_ || !input_ids_->Get()) {
-    return;
-  }
+  if (!input_ids_ || !input_ids_->Get()) return;
   
   OrtValue* input_ids_ortvalue = input_ids_->Get();
-  auto input_ids_type_info = input_ids_ortvalue->GetTensorTypeAndShapeInfo();
-  auto input_ids_shape = input_ids_type_info->GetShape();
+  auto input_ids_shape = input_ids_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   const int32_t* token_ids_cpu = input_ids_ortvalue->GetTensorData<int32_t>();
   
-  // Log input token IDs
   int64_t total_tokens = 1;
   for (auto dim : input_ids_shape) total_tokens *= dim;
-  // std::cout << "[GENAI EMB INPUT] Token IDs count: " << total_tokens << std::endl;
-  // std::cout << "[GENAI EMB INPUT] First 20 token IDs: ";
-  // for (int i = 0; i < std::min(20LL, total_tokens); ++i) {
-  //   std::cout << token_ids_cpu[i] << " ";
-  // }
-  // std::cout << std::endl;
-  // std::cout << "[GENAI EMB INPUT] Last 20 token IDs: ";
-  // int64_t start_idx = total_tokens > 20 ? total_tokens - 20 : 0;
-  // for (int64_t i = start_idx; i < total_tokens; ++i) {
-  //   std::cout << token_ids_cpu[i] << " ";
-  // }
-  // std::cout << std::endl;
-  
-//   std::cout << "[GENAI INJECT] input_ids shape: [";
-  // for (size_t i = 0; i < input_ids_shape.size(); ++i) {
-  //   std::cout << input_ids_shape[i];
-  //   if (i < input_ids_shape.size() - 1) std::cout << ", ";
-  // }
-//   std::cout << "]" << std::endl;
-  
-  // Print first few token IDs for debugging
-//   std::cout << "[GENAI INJECT] First 20 token IDs: ";
-  // for (int i = 0; i < std::min(20LL, total_tokens); ++i) {
-  //   std::cout << token_ids_cpu[i] << " ";
-  // }
-  // std::cout << std::endl;
-  
-  size_t num_image_tokens_in_chunk = 0;
-  
-  // // Iterate through input_ids to find image token positions
-  // size_t num_image_tokens_found = 0;
-  // for (int64_t i = 0; i < total_tokens; ++i) {
-  //   if (token_ids_cpu[i] == image_token_id) {
-  //     num_image_tokens_found++;
-  //   }
-  // }
   
   for (int64_t i = 0; i < total_tokens; ++i) {
-    if (token_ids_cpu[i] == image_token_id) {
-      // Found image token position - replace with vision embedding
-      if (image_embed_consumed_ < static_cast<size_t>(num_vision_tokens)) {
-        const float* src_vision_embedding = vision_data + (image_embed_consumed_ * vision_dim);
-        
-        // Map from input_ids position to embeddings position
-        // Embeddings shape is [batch, seq_len, embedding_dim]
-        // input_ids shape could be [batch, seq_len] or [seq_len]
-        int64_t embed_idx = i;
-        if (shape.size() == 3 && input_ids_shape.size() == 2) {
-          // If embeddings has batch dimension but we're in flattened input_ids
-          embed_idx = i;  // Assume batch=1, just use linear index
-        }
-        float* dst_text_embedding = embeddings_data + (embed_idx * embedding_dim);
-        
-        // Debug: Print first injection
-        // if (num_image_tokens_in_chunk == 0) {
-        // //   std::cout << "[GENAI INJECT] First injection: position " << i << " in input_ids, embedding index " << embed_idx << std::endl;
-        // //   std::cout << "[GENAI INJECT] Vision embedding [0-5]: ";
-        //   for (int k = 0; k < 5; ++k) std::cout << src_vision_embedding[k] << " ";
-        //   std::cout << std::endl;
-        // //   std::cout << "[GENAI INJECT] Original text embedding [0-5]: ";
-        //   for (int k = 0; k < 5; ++k) std::cout << dst_text_embedding[k] << " ";
-        //   std::cout << std::endl;
-        // }
-        
-        // Copy vision embedding to this position
-        std::memcpy(dst_text_embedding, src_vision_embedding, vision_dim * sizeof(float));
-        
-        // Verify the write
-        // if (num_image_tokens_in_chunk == 0) {
-        // //   std::cout << "[GENAI INJECT] After copy, embedding [0-5]: ";
-        //   for (int k = 0; k < 5; ++k) std::cout << dst_text_embedding[k] << " ";
-        //   std::cout << std::endl;
-        // }
-        
-        num_image_tokens_in_chunk++;
-        image_embed_consumed_++;
-      } else {
-        std::cout << "[GENAI INJECT] WARNING: More image tokens than vision embeddings!" << std::endl;
-      }
+    if (token_ids_cpu[i] == image_token_id && image_embed_consumed_ < static_cast<size_t>(num_vision_tokens)) {
+      std::memcpy(embeddings_data + (i * embedding_dim), 
+                  vision_data + (image_embed_consumed_ * vision_dim),
+                  vision_dim * sizeof(float));
+      image_embed_consumed_++;
     }
   }
-  
-//   std::cout << "[GENAI INJECT] Injected " << num_image_tokens_in_chunk << " vision embeddings at image token positions" << std::endl;
-//   std::cout << "[GENAI INJECT] Total consumed: " << image_embed_consumed_ << " / " << num_vision_tokens << std::endl;
-  
-  // Verify embeddings after injection
-  float min_after = std::numeric_limits<float>::max();
-  float max_after = std::numeric_limits<float>::lowest();
-  float sum_after = 0.0f;
-  int64_t total_elems = 1;
-  for (auto dim : shape) total_elems *= dim;
-  for (int64_t i = 0; i < total_elems; ++i) {
-    float val = embeddings_data[i];
-    if (val < min_after) min_after = val;
-    if (val > max_after) max_after = val;
-    sum_after += val;
-  }
-  std::cout << "[GENAI INJECT] Embeddings AFTER injection: min=" << min_after << ", max=" << max_after << ", mean=" << (sum_after / total_elems) << std::endl;
-  
-  // Log embeddings AFTER injection with first 10 values
-  std::cout << "[GENAI EMB AFTER INJECTION] Shape: [";
-  for (size_t i = 0; i < shape.size(); ++i) {
-    std::cout << shape[i];
-    if (i < shape.size() - 1) std::cout << ", ";
-  }
-  std::cout << "]" << std::endl;
-  std::cout << "[GENAI EMB AFTER INJECTION] Statistics: min=" << min_after << ", max=" << max_after << ", mean=" << (sum_after / total_elems) << std::endl;
-  std::cout << "[GENAI EMB AFTER INJECTION] First 10 values: ";
-  for (int i = 0; i < 10 && i < total_elems; ++i) {
-    std::cout << embeddings_data[i] << " ";
-  }
-  std::cout << std::endl;
 }
 
 } // namespace Generators
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index b1c6968a03..e3a5eef8c5 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -144,160 +144,86 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
     throw std::runtime_error("Vision pipeline sessions not initialized");
   }
-  // Create input tensor for patch embed
+  
   size_t pixel_count = 1;
   for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
-  
   auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
   
   const char* pe_input_names[] = {"pixel_values"};
   OrtValue* pe_inputs[] = { pixel_tensor.get() };
 
-  // Compute expected output shape based on input
-  // Input: [batch=1, num_patches, patch_dim]
-  // Output: [num_patches, hidden_dim=1280]
-  int64_t num_patches = pixel_shape[1];  // 1972
-  int64_t hidden_dim = 1280;  // Qwen2.5-VL hidden dimension
-  std::vector<int64_t> pe_out_shape_vec{num_patches, hidden_dim};
-  size_t pe_out_count = static_cast<size_t>(num_patches * hidden_dim);
+  const int64_t num_patches = pixel_shape[1];
+  const int64_t hidden_dim = 1280;
+  std::vector<int64_t> pe_out_shape{num_patches, hidden_dim};
+  std::vector<float> pe_out_buf(num_patches * hidden_dim);
+  auto pe_out_tensor = CreateTensor(pe_out_buf.data(), pe_out_buf.size(), pe_out_shape);
   
-  std::vector<float> pe_out_buf(pe_out_count);
-  auto pe_out_tensor = CreateTensor(pe_out_buf.data(), pe_out_count, pe_out_shape_vec);
-  
-  // Prepare output name
-  auto pe_out_name_str = patch_embed_session_->GetOutputName(0);
-  const char* pe_output_names[] = { pe_out_name_str.c_str() };
+  auto pe_out_name = patch_embed_session_->GetOutputName(0);
+  const char* pe_output_names[] = { pe_out_name.c_str() };
   OrtValue* pe_outputs[] = { pe_out_tensor.get() };
 
   patch_embed_session_->Run(nullptr, pe_input_names, pe_inputs, 1, pe_output_names, pe_outputs, 1);
 
-  // Debug: Log patch_embed output
-  float min_pe = pe_out_buf[0], max_pe = pe_out_buf[0], sum_pe = 0.0f;
-  for (const auto& val : pe_out_buf) {
-    min_pe = std::min(min_pe, val);
-    max_pe = std::max(max_pe, val);
-    sum_pe += val;
-  }
-
-  // hidden now in pe_out_buf with shape [seq_len, hidden_size]
-  int64_t seq_len = pe_out_shape_vec[0];
-  int64_t hidden_size = pe_out_shape_vec[1];
-  int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
-  if (seq_len % window_area != 0) {
-    throw std::runtime_error("Sequence length not divisible by spatial_merge_size^2 in vision pipeline");
-  }
-  int64_t num_windows = seq_len / window_area;
-  // Reshape logically: [num_windows, window_area, hidden_size] then reorder by wnd_idx_
-  if (static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
-    throw std::runtime_error("wnd_idx size does not match number of windows");
+  const int64_t seq_len = num_patches;
+  const int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
+  const int64_t num_windows = seq_len / window_area;
+  
+  if (seq_len % window_area != 0 || static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
+    throw std::runtime_error("Invalid window configuration for vision pipeline");
   }
 
-  // Temporary buffer for reordered hidden
-  std::vector<float> reordered(seq_len * hidden_size);
-  // For each window index w: copy its window_area * hidden_size block in order
+  std::vector<float> reordered(seq_len * hidden_dim);
   for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
     int64_t src_w = wnd_idx_[dst_w];
     if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
-    // source offset in original flattened: src_w * window_area * hidden_size
-    size_t src_offset = static_cast<size_t>(src_w) * static_cast<size_t>(window_area) * static_cast<size_t>(hidden_size);
-    size_t dst_offset = static_cast<size_t>(dst_w) * static_cast<size_t>(window_area) * static_cast<size_t>(hidden_size);
-    std::memcpy(reordered.data() + dst_offset, pe_out_buf.data() + src_offset,
-                window_area * static_cast<size_t>(hidden_size) * sizeof(float));
-  }
-
-  float min_wnd = reordered[0], max_wnd = reordered[0], sum_wnd = 0.0f;
-  for (const auto& val : reordered) {
-    min_wnd = std::min(min_wnd, val);
-    max_wnd = std::max(max_wnd, val);
-    sum_wnd += val;
+    size_t offset_size = window_area * hidden_dim;
+    std::memcpy(reordered.data() + dst_w * offset_size, 
+                pe_out_buf.data() + src_w * offset_size,
+                offset_size * sizeof(float));
   }
 
-  // Flatten reordered is still [seq_len, hidden_size]
-  std::vector<int64_t> attn_in_shape{seq_len, hidden_size};
-  auto attn_in_tensor = CreateTensor(reordered.data(), reordered.size(), attn_in_shape);
+  std::vector<int64_t> attn_shape{seq_len, hidden_dim};
+  auto attn_in_tensor = CreateTensor(reordered.data(), reordered.size(), attn_shape);
   const char* attn_input_names[] = {"hidden"};
   OrtValue* attn_inputs[] = { attn_in_tensor.get() };
 
-  // Prepare attention output - shape should be same as input
-  std::vector<int64_t> attn_out_shape_vec{seq_len, hidden_size};
-  size_t attn_out_count = static_cast<size_t>(seq_len * hidden_size);
-  std::vector<float> attn_out_buf(attn_out_count);
-  auto attn_out_tensor = CreateTensor(attn_out_buf.data(), attn_out_count, attn_out_shape_vec);
-  auto attn_out_name_str = vision_attn_session_->GetOutputName(0);
-  const char* attn_output_names[] = { attn_out_name_str.c_str() };
+  std::vector<float> attn_out_buf(seq_len * hidden_dim);
+  auto attn_out_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_shape);
+  auto attn_out_name = vision_attn_session_->GetOutputName(0);
+  const char* attn_output_names[] = { attn_out_name.c_str() };
   OrtValue* attn_outputs[] = { attn_out_tensor.get() };
   
   vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
 
-  float min_attn = attn_out_buf[0], max_attn = attn_out_buf[0], sum_attn = 0.0f;
-  for (const auto& val : attn_out_buf) {
-    min_attn = std::min(min_attn, val);
-    max_attn = std::max(max_attn, val);
-    sum_attn += val;
-  }
-  // Merger input (attention output)
-  auto merger_in_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_out_shape_vec);
+  auto merger_in_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_shape);
   const char* merger_input_names[] = {"hidden"};
   OrtValue* merger_inputs[] = { merger_in_tensor.get() };
   
-  // Patch merger output shape: [seq_len / 4, 3584] 
-  // The merger reduces spatial dimensions and projects to final vision hidden size
-  int64_t merged_seq_len = seq_len / (spatial_merge_size_ * spatial_merge_size_);
-  int64_t merged_hidden_size = 3584;  // Qwen2.5-VL final vision embedding dimension
-  std::vector<int64_t> merger_out_shape_vec{merged_seq_len, merged_hidden_size};
-  size_t merger_out_count = static_cast<size_t>(merged_seq_len * merged_hidden_size);
-  std::vector<float> merger_out_buf(merger_out_count);
-  auto merger_out_tensor = CreateTensor(merger_out_buf.data(), merger_out_count, merger_out_shape_vec);
-  auto merger_out_name_str = patch_merger_session_->GetOutputName(0);
-  const char* merger_output_names[] = { merger_out_name_str.c_str() };
+  const int64_t merged_seq_len = num_windows;  // One token per window after merging
+  const int64_t merged_hidden = 3584;
+  std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
+  std::vector<float> merger_out_buf(merged_seq_len * merged_hidden);
+  auto merger_out_tensor = CreateTensor(merger_out_buf.data(), merger_out_buf.size(), merger_shape);
+  auto merger_out_name = patch_merger_session_->GetOutputName(0);
+  const char* merger_output_names[] = { merger_out_name.c_str() };
   OrtValue* merger_outputs[] = { merger_out_tensor.get() };
   
   patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1);
 
-  float min_merger = merger_out_buf[0], max_merger = merger_out_buf[0], sum_merger = 0.0f;
-  for (const auto& val : merger_out_buf) {
-    min_merger = std::min(min_merger, val);
-    max_merger = std::max(max_merger, val);
-    sum_merger += val;
-  }
-
-  // Final reverse ordering using rev_idx_ (argsort of wnd_idx). Expect same number of windows mapping.
-  // Merger output shape assumed [num_windows * window_area, hidden_size] or potentially [num_windows, hidden_size].
-  // After merger, sequence length is reduced by spatial_merge_size^2
-  if (merger_out_shape_vec.size() != 2) {
-    throw std::runtime_error("Patch merger output must be rank-2");
-  }
-  int64_t final_seq_len = merger_out_shape_vec[0];  // 493 (merged)
-  int64_t final_hidden = merger_out_shape_vec[1];     // 3584 (merged)
-  
-  // Validate final dimensions match expected after merging
-  if (final_seq_len != merged_seq_len) {
-    throw std::runtime_error("Unexpected final sequence length after merger");
-  }
-  if (final_hidden != merged_hidden_size) {
-    throw std::runtime_error("Final hidden size mismatch after merger");
-  }
   if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
-    // Each window maps back; reorder at window granularity.
-    throw std::runtime_error("rev_idx size does not match number of windows");
+    throw std::runtime_error("Vision pipeline reverse index size mismatch");
   }
 
-  // Apply reverse indexing at merged window granularity
-  // After merging, we have merged_seq_len tokens, one per original window
   std::vector<float> final_embeddings(merger_out_buf.size());
   for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
-    int64_t src_w = rev_idx_[dst_w];
-    // Each "window" in merged output is now just 1 token with merged_hidden_size features
-    size_t src_offset = static_cast<size_t>(src_w) * static_cast<size_t>(final_hidden);
-    size_t dst_offset = static_cast<size_t>(dst_w) * static_cast<size_t>(final_hidden);
-    std::memcpy(final_embeddings.data() + dst_offset, merger_out_buf.data() + src_offset,
-                static_cast<size_t>(final_hidden) * sizeof(float));
+    std::memcpy(final_embeddings.data() + dst_w * merged_hidden,
+                merger_out_buf.data() + rev_idx_[dst_w] * merged_hidden,
+                merged_hidden * sizeof(float));
   }
 
-  // Save final shape
-  last_seq_len_ = final_seq_len;
-  last_hidden_size_ = final_hidden;
-  return final_embeddings; // shape: [final_seq_len=493, final_hidden=3584]
+  last_seq_len_ = merged_seq_len;
+  last_hidden_size_ = merged_hidden;
+  return final_embeddings;
 }
 
 } // namespace Generators

From 151c15d3f95c65cb5af6f679f182b9deb4299e85 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 13:32:27 -0800
Subject: [PATCH 03/25] Update inference script

---
 examples/python/qwen2_5_vl_inference.py | 325 ++++++------------------
 1 file changed, 80 insertions(+), 245 deletions(-)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index a801978815..ec2223053c 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -25,99 +25,58 @@
 PATCH_SIZE = 14
 MERGE_SIZE = 2
 TEMPORAL_PATCH_SIZE = 2
-MIN_PIXELS = 4 * 28 * 28
 MAX_RATIO = 200
 
 def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS):
-    """Baseline's smart_resize logic - rescales image maintaining aspect ratio."""
+    """Rescale image maintaining aspect ratio within pixel bounds."""
     import math
     
-    def ceil_by_factor(number, factor):
-        return math.ceil(number / factor) * factor
-    
-    def floor_by_factor(number, factor):
-        return math.floor(number / factor) * factor
-    
-    def round_by_factor(number, factor):
-        return round(number / factor) * factor
-    
     if max(height, width) / min(height, width) > MAX_RATIO:
         raise ValueError(f"Aspect ratio must be smaller than {MAX_RATIO}")
     
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
+    h_bar = max(factor, round(height / factor) * factor)
+    w_bar = max(factor, round(width / factor) * factor)
     
     if h_bar * w_bar > max_pixels:
         beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
+        h_bar = math.floor(height / beta / factor) * factor
+        w_bar = math.floor(width / beta / factor) * factor
     elif h_bar * w_bar < min_pixels:
         beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
     
     return h_bar, w_bar
 
 def load_prepatched_embeddings(image_path: Path, resize_width=800, resize_height=480):
-    """Load image and convert to pre-patched embeddings format matching baseline.
-    
-    This matches the baseline's approach: manually patch the image in Python
-    before passing to the ONNX vision pipeline. The patch_embed model expects
-    pre-patched data (1, num_patches, patch_dim), NOT raw pixels (B, C, H, W).
-    
-    Args:
-        image_path: Path to image file
-        resize_width: Target width for first resize (default 800)
-        resize_height: Target height for first resize (default 480)
-    
-    Returns:
-        pixel_values: np.ndarray of shape (1, num_patches, patch_dim)
-        grid_thw: (t, h, w) grid dimensions after patching
-    """
-    # Load and convert to RGB
+    """Load and preprocess image into pre-patched format for vision pipeline."""
     img = Image.open(image_path).convert("RGB")
-    orig_w, orig_h = img.size
+    patch_merge_size = PATCH_SIZE * MERGE_SIZE
     
-    # Two-stage resize matching baseline:
-    # 1. First resize to target dimensions with factor=28 constraint
-    patch_merge_size = PATCH_SIZE * MERGE_SIZE  # 14 * 2 = 28
-    h1, w1 = smart_resize(resize_height, resize_width, factor=patch_merge_size, 
-                         min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+    # Two-stage resize with factor constraint
+    h1, w1 = smart_resize(resize_height, resize_width, factor=patch_merge_size)
     img = img.resize((w1, h1), Image.BICUBIC)
-    
-    # 2. Second smart_resize with same constraints (matches baseline fetch_image_data)
-    h2, w2 = smart_resize(h1, w1, factor=patch_merge_size,
-                         min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+    h2, w2 = smart_resize(h1, w1, factor=patch_merge_size)
     img = img.resize((w2, h2), Image.BICUBIC)
     
-    print(f"[INFO] Resized image: {orig_w}x{orig_h} -> {w1}x{h1} -> {w2}x{h2}")
-    
-    # Convert to numpy array (H, W, C) and normalize to [0, 1]
+    # Normalize with ImageNet stats
     pixel_array = np.array(img).astype(np.float32) / 255.0
-    
-    # Apply ImageNet normalization (from Qwen2.5-VL processor config)
     mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
     std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
     pixel_array = (pixel_array - mean) / std
     
-    # --- Patching logic from baseline image_utils.patch_image ---
-    # Start with (H, W, C) format, add batch dimension
-    patches = np.array([pixel_array])  # shape: (1, H, W, C)
-    
     # Convert to (B, C, H, W) format
-    patches = patches.transpose(0, 3, 1, 2)  # shape: (1, C, H, W)
+    patches = np.array([pixel_array]).transpose(0, 3, 1, 2)
     
-    # Handle temporal dimension (for video, but we use single frame)
+    # Pad temporal dimension if needed
     if patches.shape[0] % TEMPORAL_PATCH_SIZE != 0:
-        repeats = np.repeat(patches[-1][np.newaxis], TEMPORAL_PATCH_SIZE - 1, axis=0)
-        patches = np.concatenate([patches, repeats], axis=0)
+        pad_frames = np.repeat(patches[-1:], TEMPORAL_PATCH_SIZE - 1, axis=0)
+        patches = np.concatenate([patches, pad_frames], axis=0)
     
-    channel = patches.shape[1]
-    grid_t = patches.shape[0] // TEMPORAL_PATCH_SIZE
-    grid_h = h2 // PATCH_SIZE
-    grid_w = w2 // PATCH_SIZE
+    channel, grid_t = patches.shape[1], patches.shape[0] // TEMPORAL_PATCH_SIZE
+    grid_h, grid_w = h2 // PATCH_SIZE, w2 // PATCH_SIZE
     
-    # Reshape into patches with spatial merging
+    # Reshape and flatten patches
     patches = patches.reshape(
         grid_t,
         TEMPORAL_PATCH_SIZE,
@@ -130,21 +89,11 @@ def load_prepatched_embeddings(image_path: Path, resize_width=800, resize_height
         PATCH_SIZE,
     )
     
-    # Transpose to group patches spatially
     patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
+    flatten_patches = patches.reshape(grid_t * grid_h * grid_w, 
+                                     channel * TEMPORAL_PATCH_SIZE * PATCH_SIZE * PATCH_SIZE)
     
-    # Flatten to (num_patches, patch_dim)
-    flatten_patches = patches.reshape(
-        grid_t * grid_h * grid_w, 
-        channel * TEMPORAL_PATCH_SIZE * PATCH_SIZE * PATCH_SIZE
-    )
-    
-    # Add batch dimension: (1, num_patches, patch_dim)
-    pixel_values = flatten_patches[np.newaxis, :]
-    # Calculate grid dimensions for image_grid_thw
-    grid_thw = np.array([[grid_t, grid_h, grid_w]], dtype=np.int64)
-    
-    return pixel_values, grid_thw
+    return flatten_patches[np.newaxis, :], np.array([[grid_t, grid_h, grid_w]], dtype=np.int64)
 
 TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
 
@@ -231,21 +180,10 @@ def build_prompt_from_sample(sample_json_path: Path, use_tool_call_prompt=False,
     return prompt, image_count
 
 def expand_image_tokens_in_prompt(base_prompt: str, image_grid_thw, merge_size: int):
-    """Expand single <|image_pad|> placeholder to multiple tokens based on actual image patches.
-    
-    This replicates the logic from baseline's get_image_padding_from_text:
-    - Calculates num_tokens = (t * h * w) / (merge_size^2)
-    - Replaces first occurrence of <|image_pad|> with that many <|image_pad|> tokens
-    """
+    """Expand single <|image_pad|> placeholder to multiple tokens."""
     t, h, w = image_grid_thw
-    merge_area = merge_size ** 2
-    num_tokens = (t * h * w) // merge_area
-    
-    # Replace first occurrence of IMAGE_PAD_TOKEN with num_tokens copies
-    # (matches baseline behavior: replace once per image)
-    expanded_prompt = base_prompt.replace(IMAGE_PAD_TOKEN, IMAGE_PAD_TOKEN * num_tokens, 1)
-    
-    return expanded_prompt, num_tokens
+    num_tokens = (t * h * w) // (merge_size ** 2)
+    return base_prompt.replace(IMAGE_PAD_TOKEN, IMAGE_PAD_TOKEN * num_tokens, 1), num_tokens
 
 
 def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float, sample_dir: Path | None = None,
@@ -256,227 +194,124 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     if not image_path.is_file():
         raise FileNotFoundError(f"Image file not found: {image_path}")
 
-    # 1. Load raw pixel values (B, C, H, W) for GenAI vision pipeline
     pixel_values, grid_thw_array = load_prepatched_embeddings(image_path)
-    grid_thw = grid_thw_array[0]  # Extract (t, h, w) tuple
-    merge_size = MERGE_SIZE
-    
-    # DEBUG: Run vision pipeline manually to compare embeddings with baseline
-    # run_vision_pipeline_debug(pixel_values, config_dir)
+    grid_thw = grid_thw_array[0]
     
-    # NOTE: GenAI will automatically run the vision pipeline (patch_embed -> vision_attn -> patch_merger)
-    # when pixel_values are provided. No need to run it manually.
-    
-    # 2. Load model & tokenizer FIRST (needed for apply_chat_template)
-    # Optionally register and prioritize QNN EP for vision attention acceleration.
+    # Load model with optional QNN acceleration
     if enable_qnn:
-        # Dynamically register QNN EP provider library if supplied (needed when ORT not built with QNN statically)
         if qnn_provider_library:
             og.register_execution_provider_library("QNN", qnn_provider_library)
-        cfg_path = (config_dir / "genai_config.json") if (config_dir / "genai_config.json").is_file() else config_dir
-        cfg = og.Config(str(cfg_path))
+        cfg = og.Config(str(config_dir / "genai_config.json" if (config_dir / "genai_config.json").is_file() else config_dir))
         cfg.append_provider("QNN")
-        cfg.append_provider("CPUExecutionProvider")  # fallback for non-attention stages
+        cfg.append_provider("CPUExecutionProvider")
         if qnn_backend_path:
             cfg.set_provider_option("QNN", "backend_path", qnn_backend_path)
-        # Burst performance mode (3) if not overridden
         cfg.set_provider_option("QNN", "performance_mode", "3")
         model = og.Model(cfg)
     else:
-        model = og.Model(str(config_dir))  # expects genai_config.json inside
+        model = og.Model(str(config_dir))
     
-    # Use HuggingFace tokenizer instead of ORT tokenizer to match baseline behavior
     tokenizer_hf = AutoTokenizer.from_pretrained(str(config_dir), trust_remote_code=True)
-    
-    # Also create ORT tokenizer for streaming decode during generation
     tokenizer_ort = og.Tokenizer(model)
 
-    # Build prompt AFTER tokenizer is loaded (needed for apply_chat_template)
-    if sample_dir is not None:
-        base_prompt, image_count_in_sample = build_prompt_from_sample(sample_dir / "sample.json", use_tool_call_prompt=tool_call_prompt, tokenizer=tokenizer_hf)
-        prompt, num_image_tokens = expand_image_tokens_in_prompt(base_prompt, grid_thw, merge_size)
+    if sample_dir:
+        base_prompt, _ = build_prompt_from_sample(sample_dir / "sample.json", tool_call_prompt, tokenizer_hf)
+        prompt, num_image_tokens = expand_image_tokens_in_prompt(base_prompt, grid_thw, MERGE_SIZE)
     else:
-        num_image_tokens = expand_image_tokens(grid_thw, merge_size)
-        prompt = build_prompt(prompt_text, num_image_tokens, use_tool_call_prompt=tool_call_prompt)
+        num_image_tokens = expand_image_tokens(grid_thw, MERGE_SIZE)
+        prompt = build_prompt(prompt_text, num_image_tokens, tool_call_prompt)
 
-    # Verify image token id exists
-    image_token_id = tokenizer_hf.convert_tokens_to_ids(IMAGE_PAD_TOKEN)
-    if image_token_id is None:
-        raise RuntimeError(f"Image token {IMAGE_PAD_TOKEN} not found in tokenizer")
-    
-    # Encode using HuggingFace tokenizer
-    input_ids_list = tokenizer_hf.encode(prompt)
-    input_ids_np = np.array(input_ids_list, dtype=np.int32)
+    input_ids_np = np.array(tokenizer_hf.encode(prompt), dtype=np.int32)
 
-    # Sanity check: count occurrences
-    occurrences = int(np.sum(input_ids_np == image_token_id))
-    if occurrences != num_image_tokens:
-        print(f"[WARN] Token count mismatch: expected {num_image_tokens}, tokenizer found {occurrences}")
-
-    # 4. Prepare generation params (respect model context_length)
     params = og.GeneratorParams(model)
-    # Fetch context_length from config if available; else default to max_new_tokens
-    context_len = None
     try:
-        cfg_file2 = config_dir / "genai_config.json" if config_dir.is_dir() else config_dir
-        with open(cfg_file2, "r", encoding="utf-8") as f2:
-            cfgj = json.load(f2)
-            mdl = cfgj.get("model", {}) if isinstance(cfgj, dict) else {}
-            if isinstance(mdl, dict) and "context_length" in mdl:
-                context_len = int(mdl["context_length"])
+        with open(config_dir / "genai_config.json", "r") as f:
+            context_len = json.load(f).get("model", {}).get("context_length", input_ids_np.shape[0] + max_new_tokens)
     except Exception:
-        context_len = None
-    # Align with reference semantics:
-    # - Use model context capacity for total max_length (prefill + generation)
-    # - Cap number of generated tokens separately via loop counter
-    total_capacity = int(context_len) if context_len else int(input_ids_np.shape[0] + max_new_tokens)
-    gen_cap = int(max_new_tokens)
-    params.set_search_options(max_length=total_capacity, temperature=temperature, top_k=top_k, top_p=top_p,
-                              do_sample=bool(do_sample), min_length=int(min_length), repetition_penalty=float(repetition_penalty))
+        context_len = input_ids_np.shape[0] + max_new_tokens
+    
+    params.set_search_options(max_length=context_len, temperature=temperature, top_k=top_k, top_p=top_p,
+                              do_sample=do_sample, min_length=min_length, repetition_penalty=repetition_penalty)
 
     generator = og.Generator(model, params)
-
-    # 5. Set pixel_values as input - GenAI will automatically run vision pipeline
-    # Note: pixel_values is pre-patched format (1, num_patches, patch_dim), matching baseline
-    pixel_values_f32 = np.ascontiguousarray(pixel_values.astype(np.float32))
-    generator.set_model_input("pixel_values", pixel_values_f32)
+    generator.set_model_input("pixel_values", np.ascontiguousarray(pixel_values.astype(np.float32)))
     generator.set_model_input("image_grid_thw", np.ascontiguousarray(grid_thw_array.astype(np.int64)))
+    generator.append_tokens(input_ids_np)
 
-    # 6. Append textual tokens (chunked to satisfy context model input length)
-    input_ids_i32 = input_ids_np.astype(np.int32)
-    # Do not truncate the prompt; the runtime will process it in windows
-    # according to the configured sliding window and chunk size.
-    # Log final prefill window preview
-    try:
-        preview_len = min(50, input_ids_i32.shape[0])
-    except Exception as e:
-        print(f"[DEBUG] Failed to decode preview: {e}")
-    # Append the full prompt once to avoid QNN continuous decoding constraints
-    generator.append_tokens(input_ids_i32)
-
-    # 7. Stream generation
     stream = tokenizer_ort.create_stream()
     output_tokens = []
     print("\n=== Generating ===")
-    # Stream generation; rely on runtime-managed sequence lengths
-    step_idx = 0
-    # Read EOS token id(s) from config for early stop
-    eos_ids = []
+    
     try:
-        cfg_file_eos = config_dir / "genai_config.json" if config_dir.is_dir() else config_dir
-        with open(cfg_file_eos, "r", encoding="utf-8") as f_eos:
-            cfgj_eos = json.load(f_eos)
-            mdl = cfgj_eos.get("model", {}) if isinstance(cfgj_eos, dict) else {}
-            if isinstance(mdl, dict):
-                eos_val = mdl.get("eos_token_id")
-                if isinstance(eos_val, list):
-                    eos_ids = [int(x) for x in eos_val]
-                elif isinstance(eos_val, (int, float)):
-                    eos_ids = [int(eos_val)]
+        with open(config_dir / "genai_config.json", "r") as f:
+            eos_val = json.load(f).get("model", {}).get("eos_token_id", [])
+            eos_ids = eos_val if isinstance(eos_val, list) else [eos_val] if eos_val else []
     except Exception:
         eos_ids = []
     
-    # Tool-call mode state tracking
-    accum_text = ""
-    started_toolcall = False
-    closed_toolcall = False
+    accum_text, started_toolcall, closed_toolcall, step = "", False, False, 0
     
-    while not generator.is_done():
-        try:
-            generator.generate_next_token()
-        except Exception as gen_err:
-            print(f"[ERROR] generate_next_token failed at step {step_idx}: {gen_err}")
-            raise
+    while not generator.is_done() and step < max_new_tokens:
+        generator.generate_next_token()
         new_tok = generator.get_next_tokens()[0]
         output_tokens.append(new_tok)
-        # Stop on EOS after min_length tokens
-        if eos_ids and int(new_tok) in eos_ids and step_idx >= int(min_length):
+        
+        if eos_ids and new_tok in eos_ids and step >= min_length:
             break
+        
         decoded_piece = stream.decode(new_tok)
         
         if tool_call_prompt:
-            # In tool-call mode: buffer text and only print from <tool_call> onwards
             accum_text += decoded_piece
-            if not started_toolcall:
-                if "<tool_call>" in accum_text:
-                    started_toolcall = True
-                    idx = accum_text.index("<tool_call>")
-                    sys.stdout.write(accum_text[idx:])
-                    sys.stdout.flush()
-            else:
-                # Already started printing tool_call region
+            if not started_toolcall and "<tool_call>" in accum_text:
+                started_toolcall = True
+                sys.stdout.write(accum_text[accum_text.index("<tool_call>"):])
+                sys.stdout.flush()
+            elif started_toolcall:
                 sys.stdout.write(decoded_piece)
                 sys.stdout.flush()
                 if "</tool_call>" in accum_text:
                     closed_toolcall = True
-                    print(f"\n[DEBUG] tool_call closed at step {step_idx}")
                     break
-        else:
-            # Normal mode: print everything
-            if decoded_piece:
-                sys.stdout.write(decoded_piece)
-                sys.stdout.flush()
+        elif decoded_piece:
+            sys.stdout.write(decoded_piece)
+            sys.stdout.flush()
         
-        # print(f"\n[DEBUG] Gen step {step_idx}: token_id={new_tok} decoded='{decoded_piece}'")
-        step_idx += 1
-        if step_idx >= gen_cap:
-            break
+        step += 1
     
     print("\n=== Generation Complete ===")
-
     full_output = tokenizer_hf.decode(np.array(output_tokens, dtype=np.int32))
     
-    # Report whether tool_call was successfully emitted
     if tool_call_prompt and not closed_toolcall:
-        print("\n[WARNING] Model did not emit complete <tool_call>...</tool_call> structure.")
-        print("[WARNING] Consider adjusting prompt, temperature, or sampling parameters.")
+        print("\n[WARNING] Model did not emit complete <tool_call> structure.")
     
-    # Write to file instead of stdout to handle Unicode characters
-    # with open("generation_output.txt", "w", encoding="utf-8") as f:
-    #     f.write("\n[FINAL OUTPUT]\n" + full_output)
-    print("\n FINAL OUTPUT: ", full_output)
+    print("\nFINAL OUTPUT:", full_output)
     return full_output
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai pipeline")
-    # Support both --config_dir (current) and legacy --model_path name.
-    parser.add_argument("--config_dir", type=Path, help="Directory containing genai_config.json for qwen2_5_vl (or use --model_path)")
-    parser.add_argument("--model_path", type=Path, help="Alias for --config_dir (legacy)")
+    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai")
+    parser.add_argument("--config_dir", "--model_path", type=Path, required=True, help="Directory with genai_config.json")
     parser.add_argument("--image", type=Path, required=True, help="Path to input image")
-    parser.add_argument("--prompt", type=str, help="User text prompt; if omitted and --sample_dir provided, sample conversation is used")
+    parser.add_argument("--prompt", type=str, default="Describe the image.", help="User text prompt")
     parser.add_argument("--max_new_tokens", type=int, default=4096)
     parser.add_argument("--temperature", type=float, default=0.7)
     parser.add_argument("--top_k", type=int, default=50)
     parser.add_argument("--top_p", type=float, default=0.9)
-    parser.add_argument("--do_sample", action="store_true", help="Enable sampling to reduce repetition")
-    parser.add_argument("--min_length", type=int, default=0, help="Minimum generated tokens before allowing EOS")
-    parser.add_argument("--repetition_penalty", type=float, default=1.0, help=">1.0 discourages repetition")
-    parser.add_argument("--sample_dir", type=Path, help="Optional dataset sample directory (contains sample.json & image)")
-    parser.add_argument("--enable_qnn", action="store_true", help="Enable QNN execution provider (vision attention acceleration)")
-    parser.add_argument("--qnn_backend_path", type=str, default="QnnHtp.dll", help="Path to QNN backend (e.g., QnnHtp.dll)")
-    parser.add_argument("--qnn_provider_library", type=str, help="Path to onnxruntime QNN EP shared library (e.g., onnxruntime_providers_qnn.dll)")
-    parser.add_argument("--tool_call_prompt", action="store_true", help="Enable tool-call mode: use baseline tools schema and emit <tool_call> XML")
+    parser.add_argument("--do_sample", action="store_true")
+    parser.add_argument("--min_length", type=int, default=0)
+    parser.add_argument("--repetition_penalty", type=float, default=1.0)
+    parser.add_argument("--sample_dir", type=Path, help="Sample directory with sample.json")
+    parser.add_argument("--enable_qnn", action="store_true")
+    parser.add_argument("--qnn_backend_path", type=str, default="QnnHtp.dll")
+    parser.add_argument("--qnn_provider_library", type=str)
+    parser.add_argument("--tool_call_prompt", action="store_true")
     args = parser.parse_args()
 
-    # Resolve config directory
-    config_dir = args.config_dir or args.model_path
-    if not config_dir:
-        parser.error("One of --config_dir or --model_path is required.")
-
-    # Determine prompt text
-    if args.prompt:
-        prompt_text = args.prompt
-    elif args.sample_dir is not None:
-        # Will be built from sample.json
-        prompt_text = ""  # placeholder; not used when sample_dir provided
-    else:
-        prompt_text = "Describe the image."  # default fallback
-
     run_inference(
-        config_dir=config_dir,
+        config_dir=args.config_dir,
         image_path=args.image,
-        prompt_text=prompt_text,
+        prompt_text=args.prompt if not args.sample_dir else "",
         max_new_tokens=args.max_new_tokens,
         temperature=args.temperature,
         top_k=args.top_k,
@@ -487,7 +322,7 @@ def main():
         sample_dir=args.sample_dir,
         enable_qnn=args.enable_qnn,
         qnn_backend_path=args.qnn_backend_path if args.enable_qnn else None,
-        qnn_provider_library=args.qnn_provider_library if args.enable_qnn else None,
+        qnn_provider_library=args.qnn_provider_library,
         tool_call_prompt=args.tool_call_prompt,
     )
 

From 014eed6a132439381e65892fd06db835170ec484 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 14:38:48 -0800
Subject: [PATCH 04/25] More cleanup

---
 examples/python/qwen2_5_vl_inference.py | 200 ++++++------------------
 src/generators.cpp                      |  21 ++-
 src/models/decoder_only.cpp             |  23 ++-
 src/models/decoder_only_pipeline.cpp    |   2 +-
 src/models/kv_cache.cpp                 |  28 +++-
 5 files changed, 103 insertions(+), 171 deletions(-)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index ec2223053c..752217dcea 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -16,7 +16,6 @@
 VISION_END = "<|vision_end|>"
 IM_START = "<|im_start|>"
 IM_END = "<|im_end|>"
-SYSTEM_PROMPT = "You are a helpful assistant."
 
 # Image preprocessing constants (from Qwen2.5-VL config)
 IMAGE_FACTOR = 28
@@ -108,87 +107,8 @@ def load_prepatched_embeddings(image_path: Path, resize_width=800, resize_height
 </tool_call>
 """
 
-def expand_image_tokens(grid_thw, merge_size):
-    """Compute number of image pad tokens after spatial merging.
-    Qwen2.5-VL vision pipeline produces one embedding per merged spatial cell.
-    Token count = (t * h * w) / (merge_size ** 2)."""
-    t, h, w = grid_thw
-    merge_area = merge_size ** 2
-    if (h * w) % merge_area != 0:
-        raise ValueError(f"Grid (h={h}, w={w}) not divisible by merge_size^2={merge_area}")
-    return (t * h * w) // merge_area
-
-
-def build_prompt(user_text, num_image_tokens, use_tool_call_prompt=False):
-    # Construct minimal chat-style prompt with expanded image pad tokens.
-    image_tokens = IMAGE_PAD_TOKEN * num_image_tokens
-    # Wrap in vision start/end markers once (matching template semantically) but repeated pad tokens inside.
-    vision_block = f"{VISION_START}{image_tokens}{VISION_END}"
-    system_text = TOOL_CALL_SYSTEM_PROMPT if use_tool_call_prompt else SYSTEM_PROMPT
-    prompt = (
-        f"{IM_START}system\n{system_text}{IM_END}\n"
-        f"{IM_START}user\n{vision_block}{user_text}{IM_END}\n"
-        f"{IM_START}assistant\n"
-    )
-    return prompt
-
-def build_prompt_from_sample(sample_json_path: Path, use_tool_call_prompt=False, tokenizer=None):
-    """Construct base prompt (single <|image_pad|>) from sample.json conversation using apply_chat_template."""
-    with open(sample_json_path, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    convo = data.get("conversation", [])
-    
-    # Count images in the conversation
-    image_count = 0
-    for msg in convo:
-        if msg.get("role") == "user":
-            content_list = msg.get("content", [])
-            for c in content_list:
-                if isinstance(c, dict) and (c.get("type") == "image" or "image" in c):
-                    image_count += 1
-    
-    if image_count == 0:
-        raise ValueError("Sample JSON contained no image entries; cannot build vision prompt.")
-    
-    # Use apply_chat_template to match baseline behavior exactly
-    if tokenizer is not None:
-        prompt = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    else:
-        # Fallback to manual construction (shouldn't happen if tokenizer is passed)
-        system_text = ""
-        user_parts = []
-        for msg in convo:
-            role = msg.get("role")
-            content_list = msg.get("content", [])
-            if role == "system":
-                system_parts = [c.get("text", "") for c in content_list if isinstance(c, dict)]
-                system_text = "\n".join(system_parts)
-            elif role == "user":
-                for c in content_list:
-                    if isinstance(c, dict):
-                        if c.get("type") == "image" or "image" in c:
-                            user_parts.append(f"{VISION_START}{IMAGE_PAD_TOKEN}{VISION_END}")
-                        elif c.get("type") == "text":
-                            user_parts.append(c.get("text", ""))
-        user_text = "".join(user_parts)
-        prompt = (
-            f"{IM_START}system\n{system_text}{IM_END}\n"
-            f"{IM_START}user\n{user_text}{IM_END}\n"
-            f"{IM_START}assistant\n"
-        )
-    
-    return prompt, image_count
-
-def expand_image_tokens_in_prompt(base_prompt: str, image_grid_thw, merge_size: int):
-    """Expand single <|image_pad|> placeholder to multiple tokens."""
-    t, h, w = image_grid_thw
-    num_tokens = (t * h * w) // (merge_size ** 2)
-    return base_prompt.replace(IMAGE_PAD_TOKEN, IMAGE_PAD_TOKEN * num_tokens, 1), num_tokens
-
-
-def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float, sample_dir: Path | None = None,
-                  enable_qnn: bool = False, qnn_backend_path: str | None = None, qnn_provider_library: str | None = None,
-                  do_sample: bool = False, min_length: int = 0, repetition_penalty: float = 1.0, tool_call_prompt: bool = False):
+def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float,
+                  do_sample: bool = False, min_length: int = 0, repetition_penalty: float = 1.0):
     if not config_dir.is_dir():
         raise FileNotFoundError(f"Config directory not found: {config_dir}")
     if not image_path.is_file():
@@ -197,93 +117,77 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     pixel_values, grid_thw_array = load_prepatched_embeddings(image_path)
     grid_thw = grid_thw_array[0]
     
-    # Load model with optional QNN acceleration
-    if enable_qnn:
-        if qnn_provider_library:
-            og.register_execution_provider_library("QNN", qnn_provider_library)
-        cfg = og.Config(str(config_dir / "genai_config.json" if (config_dir / "genai_config.json").is_file() else config_dir))
-        cfg.append_provider("QNN")
-        cfg.append_provider("CPUExecutionProvider")
-        if qnn_backend_path:
-            cfg.set_provider_option("QNN", "backend_path", qnn_backend_path)
-        cfg.set_provider_option("QNN", "performance_mode", "3")
-        model = og.Model(cfg)
-    else:
-        model = og.Model(str(config_dir))
-    
+    model = og.Model(str(config_dir))
     tokenizer_hf = AutoTokenizer.from_pretrained(str(config_dir), trust_remote_code=True)
     tokenizer_ort = og.Tokenizer(model)
 
-    if sample_dir:
-        base_prompt, _ = build_prompt_from_sample(sample_dir / "sample.json", tool_call_prompt, tokenizer_hf)
-        prompt, num_image_tokens = expand_image_tokens_in_prompt(base_prompt, grid_thw, MERGE_SIZE)
-    else:
-        num_image_tokens = expand_image_tokens(grid_thw, MERGE_SIZE)
-        prompt = build_prompt(prompt_text, num_image_tokens, tool_call_prompt)
-
-    input_ids_np = np.array(tokenizer_hf.encode(prompt), dtype=np.int32)
-
-    params = og.GeneratorParams(model)
+    # Build prompt with image tokens
+    num_image_tokens = (grid_thw[0] * grid_thw[1] * grid_thw[2]) // (MERGE_SIZE ** 2)
+    image_text = VISION_START + IMAGE_PAD_TOKEN * num_image_tokens + VISION_END
+    conversation = [
+        {"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT},
+        {"role": "user", "content": image_text + prompt_text},
+    ]
+    prompt = tokenizer_hf.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    prompt = prompt.replace(IMAGE_PAD_TOKEN, IMAGE_PAD_TOKEN * num_image_tokens, 1)
+    input_ids = np.array(tokenizer_hf.encode(prompt), dtype=np.int32)
+
+    # Setup generation parameters
     try:
         with open(config_dir / "genai_config.json", "r") as f:
-            context_len = json.load(f).get("model", {}).get("context_length", input_ids_np.shape[0] + max_new_tokens)
+            config = json.load(f)
+            context_len = config.get("model", {}).get("context_length", input_ids.shape[0] + max_new_tokens)
+            eos_val = config.get("model", {}).get("eos_token_id", [])
+            eos_ids = eos_val if isinstance(eos_val, list) else [eos_val] if eos_val else []
     except Exception:
-        context_len = input_ids_np.shape[0] + max_new_tokens
+        context_len = input_ids.shape[0] + max_new_tokens
+        eos_ids = []
     
+    params = og.GeneratorParams(model)
     params.set_search_options(max_length=context_len, temperature=temperature, top_k=top_k, top_p=top_p,
                               do_sample=do_sample, min_length=min_length, repetition_penalty=repetition_penalty)
 
     generator = og.Generator(model, params)
     generator.set_model_input("pixel_values", np.ascontiguousarray(pixel_values.astype(np.float32)))
     generator.set_model_input("image_grid_thw", np.ascontiguousarray(grid_thw_array.astype(np.int64)))
-    generator.append_tokens(input_ids_np)
-
+    generator.append_tokens(input_ids)
+    
+    # Generate and extract tool_call
     stream = tokenizer_ort.create_stream()
     output_tokens = []
+    accum_text = ""
+    started_toolcall = False
     print("\n=== Generating ===")
     
-    try:
-        with open(config_dir / "genai_config.json", "r") as f:
-            eos_val = json.load(f).get("model", {}).get("eos_token_id", [])
-            eos_ids = eos_val if isinstance(eos_val, list) else [eos_val] if eos_val else []
-    except Exception:
-        eos_ids = []
-    
-    accum_text, started_toolcall, closed_toolcall, step = "", False, False, 0
-    
-    while not generator.is_done() and step < max_new_tokens:
+    for step in range(max_new_tokens):
+        if generator.is_done():
+            break
+        
         generator.generate_next_token()
-        new_tok = generator.get_next_tokens()[0]
-        output_tokens.append(new_tok)
+        token = generator.get_next_tokens()[0]
+        output_tokens.append(token)
         
-        if eos_ids and new_tok in eos_ids and step >= min_length:
+        if eos_ids and token in eos_ids and step >= min_length:
             break
         
-        decoded_piece = stream.decode(new_tok)
+        decoded = stream.decode(token)
+        accum_text += decoded
         
-        if tool_call_prompt:
-            accum_text += decoded_piece
-            if not started_toolcall and "<tool_call>" in accum_text:
-                started_toolcall = True
-                sys.stdout.write(accum_text[accum_text.index("<tool_call>"):])
-                sys.stdout.flush()
-            elif started_toolcall:
-                sys.stdout.write(decoded_piece)
-                sys.stdout.flush()
-                if "</tool_call>" in accum_text:
-                    closed_toolcall = True
-                    break
-        elif decoded_piece:
-            sys.stdout.write(decoded_piece)
+        if not started_toolcall and "<tool_call>" in accum_text:
+            started_toolcall = True
+            sys.stdout.write(accum_text[accum_text.index("<tool_call>"):])
             sys.stdout.flush()
-        
-        step += 1
+        elif started_toolcall:
+            sys.stdout.write(decoded)
+            sys.stdout.flush()
+            if "</tool_call>" in accum_text:
+                break
     
     print("\n=== Generation Complete ===")
     full_output = tokenizer_hf.decode(np.array(output_tokens, dtype=np.int32))
     
-    if tool_call_prompt and not closed_toolcall:
-        print("\n[WARNING] Model did not emit complete <tool_call> structure.")
+    if started_toolcall and "</tool_call>" not in accum_text:
+        print("[WARNING] Incomplete <tool_call> structure")
     
     print("\nFINAL OUTPUT:", full_output)
     return full_output
@@ -301,17 +205,12 @@ def main():
     parser.add_argument("--do_sample", action="store_true")
     parser.add_argument("--min_length", type=int, default=0)
     parser.add_argument("--repetition_penalty", type=float, default=1.0)
-    parser.add_argument("--sample_dir", type=Path, help="Sample directory with sample.json")
-    parser.add_argument("--enable_qnn", action="store_true")
-    parser.add_argument("--qnn_backend_path", type=str, default="QnnHtp.dll")
-    parser.add_argument("--qnn_provider_library", type=str)
-    parser.add_argument("--tool_call_prompt", action="store_true")
     args = parser.parse_args()
 
     run_inference(
         config_dir=args.config_dir,
         image_path=args.image,
-        prompt_text=args.prompt if not args.sample_dir else "",
+        prompt_text=args.prompt,
         max_new_tokens=args.max_new_tokens,
         temperature=args.temperature,
         top_k=args.top_k,
@@ -319,11 +218,6 @@ def main():
         do_sample=args.do_sample,
         min_length=args.min_length,
         repetition_penalty=args.repetition_penalty,
-        sample_dir=args.sample_dir,
-        enable_qnn=args.enable_qnn,
-        qnn_backend_path=args.qnn_backend_path if args.enable_qnn else None,
-        qnn_provider_library=args.qnn_provider_library,
-        tool_call_prompt=args.tool_call_prompt,
     )
 
 if __name__ == "__main__":
diff --git a/src/generators.cpp b/src/generators.cpp
index c16cbfdd50..4c0b8cf358 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -319,13 +319,22 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
   auto input_ids_device = state_->params_->p_device->Allocate<int32_t>(padded_input_ids_size);
   auto cpu_span = input_ids_device.CpuSpan();
   
-  // For sliding windows during prompt processing:
-  // - Copy actual tokens starting at position 0
-  // - Fill remaining positions with padding
-  // The alignment setting affects KV cache behavior, not token placement
-  std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
+  // Handle padding based on alignment setting for sliding window models
   if (padded_input_ids_size > input_ids.size()) {
-    std::fill(cpu_span.begin() + input_ids.size(), cpu_span.end(), model_->config_->model.pad_token_id);
+    const bool left_align = model_->config_->model.decoder.sliding_window.has_value() && 
+                           model_->config_->model.decoder.sliding_window->alignment == "left";
+    
+    if (left_align) {
+      // Left alignment: padding first, then data
+      std::fill_n(cpu_span.begin(), padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id);
+      std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin() + (padded_input_ids_size - input_ids.size()));
+    } else {
+      // Right alignment (default): data first, then padding
+      std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
+      std::fill(cpu_span.begin() + input_ids.size(), cpu_span.end(), model_->config_->model.pad_token_id);
+    }
+  } else {
+    std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
   }
   input_ids_device.CopyCpuToDevice();
   return input_ids_device;
diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp
index 4aa0cf533d..9892a0698c 100644
--- a/src/models/decoder_only.cpp
+++ b/src/models/decoder_only.cpp
@@ -86,15 +86,28 @@ void DecoderOnly_State::RewindTo(size_t index) {
 void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> beam_indices, int total_length) {
   input_ids_.Update(next_tokens);
   size_t new_length = static_cast<size_t>(input_ids_.GetShape()[1]);
-  // Clamp KV cache length to sliding window size if configured
-  int effective_total_length = total_length;
+  
+  // Determine effective lengths for position_ids and KV cache based on sliding window config
+  int position_length = total_length;
+  int kv_cache_length = total_length;
+  
   if (model_.config_->model.decoder.sliding_window.has_value() &&
       model_.config_->model.decoder.sliding_window->window_size > 0) {
-    effective_total_length = std::min(effective_total_length, model_.config_->model.decoder.sliding_window->window_size);
+    const int window_size = model_.config_->model.decoder.sliding_window->window_size;
+    
+    // Position IDs are clamped when slide_inputs is true
+    if (model_.config_->model.decoder.sliding_window->slide_inputs) {
+      position_length = std::min(total_length, window_size);
+    }
+    
+    // KV cache is clamped when slide_key_value_cache is true
+    if (model_.config_->model.decoder.sliding_window->slide_key_value_cache) {
+      kv_cache_length = std::min(total_length, window_size);
+    }
   }
 
-  position_inputs_->Update(next_tokens, effective_total_length, static_cast<int>(new_length));
-  kv_cache_->Update(beam_indices, effective_total_length);
+  position_inputs_->Update(next_tokens, position_length, static_cast<int>(new_length));
+  kv_cache_->Update(beam_indices, kv_cache_length);
   logits_.Update(next_tokens, new_length);
 }
 
diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp
index 15d5c89638..497bd1b295 100644
--- a/src/models/decoder_only_pipeline.cpp
+++ b/src/models/decoder_only_pipeline.cpp
@@ -205,7 +205,7 @@ void DecoderOnlyPipelineState::RunPipeline(int total_length, DeviceSpan<int32_t>
     } else if (!first_run_ && !model_.config_->model.decoder.pipeline[pipeline_state->id_].run_on_token_gen) {
       continue;
     }
-    
+
     DurationTrace trace{MakeString("DecoderOnlyPipelineState::RunPipeline[", pipeline_state->id_, "]")};
 
     if (model_.config_->model.decoder.pipeline[pipeline_state->id_].reset_session_idx > -1) {
diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index 451a4c0f10..2fb0ab140c 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -203,7 +203,9 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
       shape_[2] = std::min(max_length, sliding_window_size);
     }
   } else {
-    // Default capacity: use requested max_length regardless of buffer sharing
+    // Default capacity: allocate full max_length upfront
+    // - With past_present_share_buffer: buffers are reused, so full capacity needed
+    // - Without past_present_share_buffer: only an initial value; Update() overwrites shape_[2] with total_length (uniform path)
     shape_[2] = state_.params_->search.max_length;
   }
 
@@ -271,18 +273,32 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
   }
 
   if (!layer_shapes_.empty()) {
-    // Allocate present tensors to full per-layer capacity; runtime uses effective length internally
+    // Per-layer allocation with per-layer capacity constraints
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
-      const std::array<int64_t, 4> capacity_shape = layer_shapes_[layer_idx];
+      std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
+      
+      // With buffer sharing: use full capacity (buffers are reused)
+      // Without buffer sharing: use actual length for memory efficiency
+      if (!past_present_share_buffer_) {
+        const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
+        current_shape[2] = std::min(total_length, max_cache_length);
+      }
+
       // Key tensor
-      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), capacity_shape, type_);
+      presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
       state_.outputs_[output_index_ + layer_idx * 2] = presents_[layer_idx * 2].get();
+
       // Value tensor
-      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), capacity_shape, type_);
+      presents_[layer_idx * 2 + 1] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
       state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get();
     }
   } else {
-    // Uniform capacity allocation (shape_[2] set at construction to max_length)
+    // Uniform allocation
+    // With buffer sharing: use full capacity (buffers are reused)
+    // Without buffer sharing: use actual length for memory efficiency
+    if (!past_present_share_buffer_) {
+      shape_[2] = total_length;
+    }
     for (int i = 0; i < layer_count_ * 2; i++) {
       presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
       state_.outputs_[output_index_ + i] = presents_[i].get();

From de74b6bcd8e29579afe5f9164e8b3f1b5c03152d Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 15:44:39 -0800
Subject: [PATCH 05/25] Use extensions pre-processing

---
 cmake/deps.txt                            |   2 +-
 examples/python/qwen2_5_vl_inference.py   | 156 +++++-----------------
 src/models/model.cpp                      |   4 +-
 src/models/qwen2_5_vl_image_processor.cpp |  82 ++++++++++++
 src/models/qwen2_5_vl_image_processor.h   |  23 ++++
 5 files changed, 146 insertions(+), 121 deletions(-)
 create mode 100644 src/models/qwen2_5_vl_image_processor.cpp
 create mode 100644 src/models/qwen2_5_vl_image_processor.h

diff --git a/cmake/deps.txt b/cmake/deps.txt
index 7e50996352..5ddf7e7e54 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;245f6667babf9668b862ac4513c69ea95117c295
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9424eab4dec2b438642910e27b4f5e9b875b9a5f
 
 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index 752217dcea..98f497cc81 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -3,96 +3,8 @@
 import sys
 import numpy as np
 from pathlib import Path
-from PIL import Image
-from transformers import AutoTokenizer
 
-import onnxruntime_genai as og  # Requires built/installed onnxruntime-genai Python package
-
-# ----------------------------------------------------------------------------
-# Helper: build expanded image token sequence matching vision embeddings count
-# ----------------------------------------------------------------------------
-IMAGE_PAD_TOKEN = "<|image_pad|>"
-VISION_START = "<|vision_start|>"
-VISION_END = "<|vision_end|>"
-IM_START = "<|im_start|>"
-IM_END = "<|im_end|>"
-
-# Image preprocessing constants (from Qwen2.5-VL config)
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-PATCH_SIZE = 14
-MERGE_SIZE = 2
-TEMPORAL_PATCH_SIZE = 2
-MAX_RATIO = 200
-
-def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS):
-    """Rescale image maintaining aspect ratio within pixel bounds."""
-    import math
-    
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        raise ValueError(f"Aspect ratio must be smaller than {MAX_RATIO}")
-    
-    h_bar = max(factor, round(height / factor) * factor)
-    w_bar = max(factor, round(width / factor) * factor)
-    
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = math.floor(height / beta / factor) * factor
-        w_bar = math.floor(width / beta / factor) * factor
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = math.ceil(height * beta / factor) * factor
-        w_bar = math.ceil(width * beta / factor) * factor
-    
-    return h_bar, w_bar
-
-def load_prepatched_embeddings(image_path: Path, resize_width=800, resize_height=480):
-    """Load and preprocess image into pre-patched format for vision pipeline."""
-    img = Image.open(image_path).convert("RGB")
-    patch_merge_size = PATCH_SIZE * MERGE_SIZE
-    
-    # Two-stage resize with factor constraint
-    h1, w1 = smart_resize(resize_height, resize_width, factor=patch_merge_size)
-    img = img.resize((w1, h1), Image.BICUBIC)
-    h2, w2 = smart_resize(h1, w1, factor=patch_merge_size)
-    img = img.resize((w2, h2), Image.BICUBIC)
-    
-    # Normalize with ImageNet stats
-    pixel_array = np.array(img).astype(np.float32) / 255.0
-    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
-    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
-    pixel_array = (pixel_array - mean) / std
-    
-    # Convert to (B, C, H, W) format
-    patches = np.array([pixel_array]).transpose(0, 3, 1, 2)
-    
-    # Pad temporal dimension if needed
-    if patches.shape[0] % TEMPORAL_PATCH_SIZE != 0:
-        pad_frames = np.repeat(patches[-1:], TEMPORAL_PATCH_SIZE - 1, axis=0)
-        patches = np.concatenate([patches, pad_frames], axis=0)
-    
-    channel, grid_t = patches.shape[1], patches.shape[0] // TEMPORAL_PATCH_SIZE
-    grid_h, grid_w = h2 // PATCH_SIZE, w2 // PATCH_SIZE
-    
-    # Reshape and flatten patches
-    patches = patches.reshape(
-        grid_t,
-        TEMPORAL_PATCH_SIZE,
-        channel,
-        grid_h // MERGE_SIZE,
-        MERGE_SIZE,
-        PATCH_SIZE,
-        grid_w // MERGE_SIZE,
-        MERGE_SIZE,
-        PATCH_SIZE,
-    )
-    
-    patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
-    flatten_patches = patches.reshape(grid_t * grid_h * grid_w, 
-                                     channel * TEMPORAL_PATCH_SIZE * PATCH_SIZE * PATCH_SIZE)
-    
-    return flatten_patches[np.newaxis, :], np.array([[grid_t, grid_h, grid_w]], dtype=np.int64)
+import onnxruntime_genai as og
 
 TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
 
@@ -114,63 +26,69 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     if not image_path.is_file():
         raise FileNotFoundError(f"Image file not found: {image_path}")
 
-    pixel_values, grid_thw_array = load_prepatched_embeddings(image_path)
-    grid_thw = grid_thw_array[0]
-    
+    # Load model and create multimodal processor (uses C++ Qwen2_5VLImageProcessor)
     model = og.Model(str(config_dir))
-    tokenizer_hf = AutoTokenizer.from_pretrained(str(config_dir), trust_remote_code=True)
-    tokenizer_ort = og.Tokenizer(model)
-
-    # Build prompt with image tokens
-    num_image_tokens = (grid_thw[0] * grid_thw[1] * grid_thw[2]) // (MERGE_SIZE ** 2)
-    image_text = VISION_START + IMAGE_PAD_TOKEN * num_image_tokens + VISION_END
+    
+    tokenizer = og.Tokenizer(model)
+    
+    processor = model.create_multimodal_processor()
+    tokenizer_stream = processor.create_stream()
+    
+    # Load image using GenAI's image loader (internally uses onnxruntime-extensions)
+    images = og.Images.open(str(image_path))
+    
+    # Build conversation with prompt
     conversation = [
         {"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT},
-        {"role": "user", "content": image_text + prompt_text},
+        {"role": "user", "content": prompt_text},
     ]
-    prompt = tokenizer_hf.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
-    prompt = prompt.replace(IMAGE_PAD_TOKEN, IMAGE_PAD_TOKEN * num_image_tokens, 1)
-    input_ids = np.array(tokenizer_hf.encode(prompt), dtype=np.int32)
+    
+    # Apply chat template to format the conversation
+    message_json = json.dumps(conversation)
+    prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True)
+    
+    # Process prompt and images together
+    # The C++ processor will automatically:
+    # 1. Preprocess images using processor_config.json pipeline
+    # 2. Tokenize the prompt as given (it does not add image placeholder tokens itself)
+    # 3. Return properly formatted inputs (pixel_values, image_grid_thw, input_ids)
+    inputs = processor(prompt, images=images)
 
     # Setup generation parameters
     try:
         with open(config_dir / "genai_config.json", "r") as f:
             config = json.load(f)
-            context_len = config.get("model", {}).get("context_length", input_ids.shape[0] + max_new_tokens)
+            context_len = config.get("model", {}).get("context_length", 2048)
             eos_val = config.get("model", {}).get("eos_token_id", [])
             eos_ids = eos_val if isinstance(eos_val, list) else [eos_val] if eos_val else []
     except Exception:
-        context_len = input_ids.shape[0] + max_new_tokens
+        context_len = 2048
         eos_ids = []
     
+    # Cap max_length at 2048 tokens, or at the model's context_length when that is smaller
+    max_length = min(context_len, 2048)  # Cap at 2048 for generation
+    
     params = og.GeneratorParams(model)
-    params.set_search_options(max_length=context_len, temperature=temperature, top_k=top_k, top_p=top_p,
+    params.set_search_options(max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p,
                               do_sample=do_sample, min_length=min_length, repetition_penalty=repetition_penalty)
-
-    generator = og.Generator(model, params)
-    generator.set_model_input("pixel_values", np.ascontiguousarray(pixel_values.astype(np.float32)))
-    generator.set_model_input("image_grid_thw", np.ascontiguousarray(grid_thw_array.astype(np.int64)))
-    generator.append_tokens(input_ids)
     
-    # Generate and extract tool_call
-    stream = tokenizer_ort.create_stream()
+    # Generate
+    generator = og.Generator(model, params)
+    generator.set_inputs(inputs)
     output_tokens = []
     accum_text = ""
     started_toolcall = False
     print("\n=== Generating ===")
     
-    for step in range(max_new_tokens):
-        if generator.is_done():
-            break
-        
+    while not generator.is_done():
         generator.generate_next_token()
         token = generator.get_next_tokens()[0]
         output_tokens.append(token)
         
-        if eos_ids and token in eos_ids and step >= min_length:
+        if eos_ids and token in eos_ids and len(output_tokens) >= min_length:
             break
         
-        decoded = stream.decode(token)
+        decoded = tokenizer_stream.decode(token)
         accum_text += decoded
         
         if not started_toolcall and "<tool_call>" in accum_text:
@@ -184,7 +102,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
                 break
     
     print("\n=== Generation Complete ===")
-    full_output = tokenizer_hf.decode(np.array(output_tokens, dtype=np.int32))
+    full_output = processor.decode(output_tokens)
     
     if started_toolcall and "</tool_call>" not in accum_text:
         print("[WARNING] Incomplete <tool_call> structure")
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 59eef435ed..573bd09af9 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -20,6 +20,7 @@
 #include "marian.h"
 #include "decoder_only_pipeline.h"
 #include "qwen_vl_model.h"
+#include "qwen2_5_vl_image_processor.h"
 #include "../dml/interface.h"
 
 #if defined(_WIN32)
@@ -1291,7 +1292,8 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"phi3v", Processor::Create<PhiImageProcessor>},
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
-          {"gemma3", Processor::Create<GemmaImageProcessor>}} {
+          {"gemma3", Processor::Create<GemmaImageProcessor>},
+          {"qwen2_5_vl", Processor::Create<Qwen2_5VLImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
     processor_ = processor->second(config, session_info);
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
new file mode 100644
index 0000000000..e1bbe61b7b
--- /dev/null
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -0,0 +1,82 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "../generators.h"
+#include "model.h"
+#include "qwen2_5_vl_image_processor.h"
+#include <numeric>
+
+namespace Generators {
+
+Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info) {
+  const auto processor_config = (config.config_path / fs::path("processor_config.json")).string();
+  if (!fs::exists(config.config_path / fs::path("processor_config.json"))) {
+    throw std::runtime_error("processor_config.json not found at: " + processor_config);
+  }
+
+  CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str()));
+
+  auto input_names = session_info.GetInputNames();
+  for (const auto& input_name : input_names) {
+    if (input_name.find("pixel_values") != std::string::npos) {
+      pixel_values_name_ = input_name;
+    } else if (input_name.find("image_grid_thw") != std::string::npos) {
+      image_grid_thw_name_ = input_name;
+    }
+  }
+}
+
+std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
+  if (!payload.images) {
+    throw std::runtime_error("No images provided to Qwen2.5VLImageProcessor");
+  }
+
+  std::string prompt = std::string(payload.prompt);
+  Ort::Allocator& allocator{Ort::Allocator::GetWithDefaultOptions()};
+  auto named_tensors = std::make_unique<NamedTensors>();
+
+  const std::vector<int32_t> input_ids = tokenizer.Encode(prompt.c_str());
+  std::unique_ptr<OrtValue> input_ids_value = OrtValue::CreateTensor<int32_t>(
+      allocator, std::vector<int64_t>{1, static_cast<int64_t>(input_ids.size())});
+  std::copy(input_ids.begin(), input_ids.end(), input_ids_value->GetTensorMutableData<int32_t>());
+  named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared<Tensor>(std::move(input_ids_value)));
+
+  // Run image preprocessing using onnxruntime-extensions
+  // This will execute the full pipeline from processor_config.json:
+  // DecodeImage -> ConvertRGB -> Resize (smart_resize) -> Rescale -> Normalize -> PatchImage
+  ort_extensions::OrtxObjectPtr<OrtxTensorResult> result;
+  CheckResult(OrtxImagePreProcess(processor_.get(), payload.images->images_.get(), result.ToBeAssigned()));
+
+  OrtxTensor* pixel_values = nullptr;
+  CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
+
+  named_tensors->emplace(pixel_values_name_, std::make_shared<Tensor>(ProcessTensor<float>(pixel_values, allocator)));
+
+  const void* pixel_values_data{};
+  const int64_t* pixel_values_shape{};
+  size_t pixel_values_dims{};
+  CheckResult(OrtxGetTensorData(pixel_values, &pixel_values_data, &pixel_values_shape, &pixel_values_dims));
+  
+  if (pixel_values_dims >= 2) {
+    int64_t batch_size = pixel_values_shape[0];
+    int64_t num_patches = pixel_values_shape[1];
+
+    int64_t grid_t = 1;  // Single frame
+    int64_t grid_h = static_cast<int64_t>(std::sqrt(num_patches));
+    int64_t grid_w = num_patches / grid_h;
+    
+    std::vector<int64_t> grid_thw_shape = {batch_size, 3};
+    auto grid_thw_tensor = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
+    
+    auto* dst = grid_thw_tensor->GetTensorMutableData<int64_t>();
+    dst[0] = grid_t;
+    dst[1] = grid_h;
+    dst[2] = grid_w;
+    
+    named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(std::move(grid_thw_tensor)));
+  }
+
+  return named_tensors;
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen2_5_vl_image_processor.h b/src/models/qwen2_5_vl_image_processor.h
new file mode 100644
index 0000000000..85430dbb5d
--- /dev/null
+++ b/src/models/qwen2_5_vl_image_processor.h
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "model.h"
+#include "processor.h"
+#include "ortx_processor.h"
+
+namespace Generators {
+
+struct Qwen2_5VLImageProcessor : Processor {
+  Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info);
+
+  std::unique_ptr<NamedTensors> Process(const Tokenizer& tokenizer, const Payload& payload) const override;
+
+ private:
+  ort_extensions::OrtxObjectPtr<OrtxProcessor> processor_;
+  std::string pixel_values_name_{"pixel_values"};
+  std::string image_grid_thw_name_{"image_grid_thw"};
+};
+
+}  // namespace Generators

From d98930643c41f9f605439b35ddd15d21b4829e3e Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 15:59:36 -0800
Subject: [PATCH 06/25] Update model name to Fara

---
 ...en2_5_vl_inference.py => fara_inference.py} |  4 ++--
 ..._processor.cpp => fara_image_processor.cpp} |  8 ++++----
 ...mage_processor.h => fara_image_processor.h} |  4 ++--
 .../{qwen_vl_model.cpp => fara_vl_model.cpp}   | 18 +++++++++---------
 .../{qwen_vl_model.h => fara_vl_model.h}       | 14 +++++++-------
 .../{qwen_vl_vision.cpp => fara_vl_vision.cpp} |  4 ++--
 .../{qwen_vl_vision.h => fara_vl_vision.h}     |  4 ++--
 src/models/model.cpp                           | 10 +++++-----
 src/models/model_type.h                        |  2 +-
 9 files changed, 34 insertions(+), 34 deletions(-)
 rename examples/python/{qwen2_5_vl_inference.py => fara_inference.py} (98%)
 rename src/models/{qwen2_5_vl_image_processor.cpp => fara_image_processor.cpp} (90%)
 rename src/models/{qwen2_5_vl_image_processor.h => fara_image_processor.h} (81%)
 rename src/models/{qwen_vl_model.cpp => fara_vl_model.cpp} (86%)
 rename src/models/{qwen_vl_model.h => fara_vl_model.h} (76%)
 rename src/models/{qwen_vl_vision.cpp => fara_vl_vision.cpp} (98%)
 rename src/models/{qwen_vl_vision.h => fara_vl_vision.h} (95%)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/fara_inference.py
similarity index 98%
rename from examples/python/qwen2_5_vl_inference.py
rename to examples/python/fara_inference.py
index 98f497cc81..71cbe946a1 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/fara_inference.py
@@ -26,7 +26,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     if not image_path.is_file():
         raise FileNotFoundError(f"Image file not found: {image_path}")
 
-    # Load model and create multimodal processor (uses C++ Qwen2_5VLImageProcessor)
+    # Load model and create multimodal processor (uses C++ FaraImageProcessor)
     model = og.Model(str(config_dir))
     
     tokenizer = og.Tokenizer(model)
@@ -112,7 +112,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai")
+    parser = argparse.ArgumentParser(description="Fara VLM inference using onnxruntime-genai")
     parser.add_argument("--config_dir", "--model_path", type=Path, required=True, help="Directory with genai_config.json")
     parser.add_argument("--image", type=Path, required=True, help="Path to input image")
     parser.add_argument("--prompt", type=str, default="Describe the image.", help="User text prompt")
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/fara_image_processor.cpp
similarity index 90%
rename from src/models/qwen2_5_vl_image_processor.cpp
rename to src/models/fara_image_processor.cpp
index e1bbe61b7b..68fe5c5230 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/fara_image_processor.cpp
@@ -3,12 +3,12 @@
 
 #include "../generators.h"
 #include "model.h"
-#include "qwen2_5_vl_image_processor.h"
+#include "fara_image_processor.h"
 #include <numeric>
 
 namespace Generators {
 
-Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info) {
+FaraImageProcessor::FaraImageProcessor(Config& config, const SessionInfo& session_info) {
   const auto processor_config = (config.config_path / fs::path("processor_config.json")).string();
   if (!fs::exists(config.config_path / fs::path("processor_config.json"))) {
     throw std::runtime_error("processor_config.json not found at: " + processor_config);
@@ -26,9 +26,9 @@ Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionIn
   }
 }
 
-std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
+std::unique_ptr<NamedTensors> FaraImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
   if (!payload.images) {
-    throw std::runtime_error("No images provided to Qwen2.5VLImageProcessor");
+    throw std::runtime_error("No images provided to FaraImageProcessor");
   }
 
   std::string prompt = std::string(payload.prompt);
diff --git a/src/models/qwen2_5_vl_image_processor.h b/src/models/fara_image_processor.h
similarity index 81%
rename from src/models/qwen2_5_vl_image_processor.h
rename to src/models/fara_image_processor.h
index 85430dbb5d..46bfbd6a79 100644
--- a/src/models/qwen2_5_vl_image_processor.h
+++ b/src/models/fara_image_processor.h
@@ -9,8 +9,8 @@
 
 namespace Generators {
 
-struct Qwen2_5VLImageProcessor : Processor {
-  Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info);
+struct FaraImageProcessor : Processor {
+  FaraImageProcessor(Config& config, const SessionInfo& session_info);
 
   std::unique_ptr<NamedTensors> Process(const Tokenizer& tokenizer, const Payload& payload) const override;
 
diff --git a/src/models/qwen_vl_model.cpp b/src/models/fara_vl_model.cpp
similarity index 86%
rename from src/models/qwen_vl_model.cpp
rename to src/models/fara_vl_model.cpp
index 673a484171..c3236ed853 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/fara_vl_model.cpp
@@ -1,4 +1,4 @@
-#include "qwen_vl_model.h"
+#include "fara_vl_model.h"
 #include "model.h"
 #include "onnxruntime_api.h"
 #include "../logging.h"
@@ -7,7 +7,7 @@
 
 namespace Generators {
 
-Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
+Fara_PipelineModel::Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
   : DecoderOnlyPipelineModel(std::move(config), ort_env) {  
   if (config_->model.vision.pipeline.empty() || !config_->model.vision.window_indexing.has_value()) return;
 
@@ -42,18 +42,18 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
     spatial_merge, wnd_idx_path, use_qnn_attn);
 }
 
-std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
-                                                             const GeneratorParams& params) const {
-  return std::make_unique<Qwen2_5_VL_PipelineState>(*this, sequence_lengths, params);
+std::unique_ptr<State> Fara_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
+                                                               const GeneratorParams& params) const {
+  return std::make_unique<Fara_PipelineState>(*this, sequence_lengths, params);
 }
 
-Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+Fara_PipelineState::Fara_PipelineState(const Fara_PipelineModel& model,
                                                    DeviceSpan<int32_t> sequence_lengths,
                                                    const GeneratorParams& params)
   : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} {
 }
 
-void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
+void Fara_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
   DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
   
   if (vision_ran_ || !vl_model_.vision_pipeline_) return;
@@ -91,7 +91,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   vision_ran_ = true;
 }
 
-void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
+void Fara_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
   if (stage_id != 0 || !vision_ran_) return;
   
   const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0];
@@ -100,7 +100,7 @@ void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32
   }
 }
 
-void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
+void Fara_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
                                                      DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
   if (it == ortvalue_store_.end() || !it->second) return;
diff --git a/src/models/qwen_vl_model.h b/src/models/fara_vl_model.h
similarity index 76%
rename from src/models/qwen_vl_model.h
rename to src/models/fara_vl_model.h
index 27f12649c3..f4bd27b5af 100644
--- a/src/models/qwen_vl_model.h
+++ b/src/models/fara_vl_model.h
@@ -1,16 +1,16 @@
 #pragma once
 
 #include "decoder_only_pipeline.h"
-#include "qwen_vl_vision.h"
+#include "fara_vl_vision.h"
 
 namespace Generators {
 
-// Qwen2.5-VL pipeline model integrating vision pipeline + decoder pipeline.
+// Fara VLM pipeline model integrating vision pipeline + decoder pipeline.
 // Loads decoder pipeline sessions (handled by base) and constructs vision pipeline sessions.
 // State runs vision once (on first SetExtraInputs when pixel_values arrives) to produce image_features
 // which are injected into embeddings output via existing injection logic in DecoderOnlyPipelineState.
-struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
-  Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
+struct Fara_PipelineModel : public DecoderOnlyPipelineModel {
+  Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
 
   std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths,
                                      const GeneratorParams& params) const override;
@@ -19,8 +19,8 @@ struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
   std::unique_ptr<QwenVisionPipeline> vision_pipeline_;
 };
 
-struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
-  Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+struct Fara_PipelineState : public DecoderOnlyPipelineState {
+  Fara_PipelineState(const Fara_PipelineModel& model,
                            DeviceSpan<int32_t> sequence_lengths,
                            const GeneratorParams& params);
 
@@ -33,7 +33,7 @@ struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
   void InjectVisionEmbeddings(const std::string& embeddings_output_name,
                              DeviceSpan<int32_t>& input_token_ids);
   
-  const Qwen2_5_VL_PipelineModel& vl_model_;
+  const Fara_PipelineModel& vl_model_;
   bool vision_ran_{false};
   std::unique_ptr<OrtValue> image_features_value_;
   std::vector<float> image_features_buffer_; // backing storage for OrtValue
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/fara_vl_vision.cpp
similarity index 98%
rename from src/models/qwen_vl_vision.cpp
rename to src/models/fara_vl_vision.cpp
index e3a5eef8c5..87a9845d7c 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/fara_vl_vision.cpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-// Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage.
+// Fara VLM Vision pipeline implementation with optional QNN EP for vision attention stage.
 
-#include "qwen_vl_vision.h"
+#include "fara_vl_vision.h"
 
 #include <fstream>
 #include <stdexcept>
diff --git a/src/models/qwen_vl_vision.h b/src/models/fara_vl_vision.h
similarity index 95%
rename from src/models/qwen_vl_vision.h
rename to src/models/fara_vl_vision.h
index 64d2171096..d45e39f14e 100644
--- a/src/models/qwen_vl_vision.h
+++ b/src/models/fara_vl_vision.h
@@ -1,14 +1,14 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 //
-// Qwen VL Vision pipeline support (initial skeleton).
+// Fara VLM Vision pipeline support (initial skeleton).
 // Executes three ONNX models in sequence:
 //   1) Patch Embedding  : pixel_values -> hidden
 //   2) Vision Attention : hidden -> hidden
 //   3) Patch Merger      : hidden -> merged embeddings
 // Performs window expansion/reordering using wnd_idx, then final reverse ordering.
 //
-// This is a minimal starting point to integrate Qwen2.5-VL vision processing
+// This is a minimal starting point to integrate Fara VLM vision processing
 // into onnxruntime-genai. Further work will: (a) connect to Config parsing,
 // (b) expose via MultiModal pipeline, (c) add EP selection, (d) reuse buffers.
 
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 573bd09af9..58b457d2fd 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -19,8 +19,8 @@
 #include "multi_modal.h"
 #include "marian.h"
 #include "decoder_only_pipeline.h"
-#include "qwen_vl_model.h"
-#include "qwen2_5_vl_image_processor.h"
+#include "fara_vl_model.h"
+#include "fara_image_processor.h"
 #include "../dml/interface.h"
 
 #if defined(_WIN32)
@@ -1195,8 +1195,8 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
 }
 
 std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
-  if (config->model.type == "qwen2_5_vl")
-    return std::make_shared<Qwen2_5_VL_PipelineModel>(std::move(config), ort_env);
+  if (config->model.type == "fara")
+    return std::make_shared<Fara_PipelineModel>(std::move(config), ort_env);
   if (config->model.type == "gpt2")
     return std::make_shared<Gpt_Model>(std::move(config), ort_env);
   if (ModelType::IsLLM(config->model.type))
@@ -1293,7 +1293,7 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
           {"gemma3", Processor::Create<GemmaImageProcessor>},
-          {"qwen2_5_vl", Processor::Create<Qwen2_5VLImageProcessor>}} {
+          {"fara", Processor::Create<FaraImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
     processor_ = processor->second(config, session_info);
diff --git a/src/models/model_type.h b/src/models/model_type.h
index 8a71e0e105..016c746716 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,7 +18,7 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2_5_vl"};
+    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "fara"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 

From 0250dc89b9817c962be6f04c23fa5ef79706db16 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 16:31:33 -0800
Subject: [PATCH 07/25] Update name

---
 src/models/fara_vl_model.cpp  | 2 +-
 src/models/fara_vl_model.h    | 2 +-
 src/models/fara_vl_vision.cpp | 6 +++---
 src/models/fara_vl_vision.h   | 8 ++++----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/models/fara_vl_model.cpp b/src/models/fara_vl_model.cpp
index c3236ed853..6c0ba737d1 100644
--- a/src/models/fara_vl_model.cpp
+++ b/src/models/fara_vl_model.cpp
@@ -37,7 +37,7 @@ Fara_PipelineModel::Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& o
   auto wnd_idx_path = (config_->config_path / fs::path(config_->model.vision.window_indexing->filename)).string();
   int spatial_merge = config_->model.vision.window_indexing->spatial_merge_size;
   
-  vision_pipeline_ = std::make_unique<QwenVisionPipeline>(
+  vision_pipeline_ = std::make_unique<FaraVisionPipeline>(
     ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
     spatial_merge, wnd_idx_path, use_qnn_attn);
 }
diff --git a/src/models/fara_vl_model.h b/src/models/fara_vl_model.h
index f4bd27b5af..0c541db27f 100644
--- a/src/models/fara_vl_model.h
+++ b/src/models/fara_vl_model.h
@@ -16,7 +16,7 @@ struct Fara_PipelineModel : public DecoderOnlyPipelineModel {
                                      const GeneratorParams& params) const override;
 
   // Vision pipeline shared across states (sessions reused).
-  std::unique_ptr<QwenVisionPipeline> vision_pipeline_;
+  std::unique_ptr<FaraVisionPipeline> vision_pipeline_;
 };
 
 struct Fara_PipelineState : public DecoderOnlyPipelineState {
diff --git a/src/models/fara_vl_vision.cpp b/src/models/fara_vl_vision.cpp
index 87a9845d7c..895d2361c2 100644
--- a/src/models/fara_vl_vision.cpp
+++ b/src/models/fara_vl_vision.cpp
@@ -77,7 +77,7 @@ std::vector<int64_t> Load1DNpyIndices(const std::string& file_path) {
   return result;
 }
 
-QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
+FaraVisionPipeline::FaraVisionPipeline(OrtEnv& env,
                                        const std::string& patch_embed_model,
                                        const std::string& vision_attn_model,
                                        const std::string& patch_merger_model,
@@ -131,7 +131,7 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
   for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
 }
 
-std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
+std::unique_ptr<OrtValue> FaraVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
   auto memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
   std::span<float> data_span(const_cast<float*>(data), count);
   std::span<const int64_t> shape_span(shape.data(), shape.size());
@@ -140,7 +140,7 @@ std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, si
 
 // Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
 
-std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
+std::vector<float> FaraVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
   if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
     throw std::runtime_error("Vision pipeline sessions not initialized");
   }
diff --git a/src/models/fara_vl_vision.h b/src/models/fara_vl_vision.h
index d45e39f14e..24b47e7414 100644
--- a/src/models/fara_vl_vision.h
+++ b/src/models/fara_vl_vision.h
@@ -28,8 +28,8 @@ namespace Generators {
 std::vector<int64_t> Load1DNpyIndices(const std::string& file_path);
 
 // Internal vision pipeline (no external DLL interface required after Python binding removal).
-struct QwenVisionPipeline {
-  QwenVisionPipeline(OrtEnv& env,
+struct FaraVisionPipeline {
+  FaraVisionPipeline(OrtEnv& env,
                      const std::string& patch_embed_model,
                      const std::string& vision_attn_model,
                      const std::string& patch_merger_model,
@@ -40,8 +40,8 @@ struct QwenVisionPipeline {
   bool use_qnn_attn_{};
   std::string qnn_backend_path_{};
 
-  QwenVisionPipeline(const QwenVisionPipeline&) = delete;
-  QwenVisionPipeline& operator=(const QwenVisionPipeline&) = delete;
+  FaraVisionPipeline(const FaraVisionPipeline&) = delete;
+  FaraVisionPipeline& operator=(const FaraVisionPipeline&) = delete;
 
   // Run vision pipeline.
   // pixel_values: float32 tensor with shape [S, C] or [B, C, H, W] depending on export (caller provides shape).

From a82de4a71946f14012366ca9e95d3e4122b224d6 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Tue, 2 Dec 2025 17:16:03 -0800
Subject: [PATCH 08/25] Add position ids back as input

---
 src/config.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/config.cpp b/src/config.cpp
index 85706eac6b..28bbca39f8 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -279,6 +279,8 @@ struct DecoderInputs_Element : JSON::Element {
       v_.input_ids = JSON::Get<std::string_view>(value);
     } else if (name == "inputs_embeds") {
       v_.embeddings = JSON::Get<std::string_view>(value);
+    } else if (name == "position_ids") {
+      v_.position_ids = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
     } else if (name == "past_key_names") {

From 499c88ca967d58da1ceaba3a7739fbf4d5b3591e Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 12:33:04 -0800
Subject: [PATCH 09/25] Revert "Update model name to Fara"

This reverts commit 509e0eb686b54dfcf319141bfa5136bca69ba3e7.
---
 ...ra_inference.py => qwen2_5_vl_inference.py} |  4 ++--
 src/models/model.cpp                           | 10 +++++-----
 src/models/model_type.h                        |  2 +-
 ...ssor.cpp => qwen2_5_vl_image_processor.cpp} |  8 ++++----
 ...rocessor.h => qwen2_5_vl_image_processor.h} |  4 ++--
 .../{fara_vl_model.cpp => qwen_vl_model.cpp}   | 18 +++++++++---------
 .../{fara_vl_model.h => qwen_vl_model.h}       | 14 +++++++-------
 .../{fara_vl_vision.cpp => qwen_vl_vision.cpp} |  4 ++--
 .../{fara_vl_vision.h => qwen_vl_vision.h}     |  4 ++--
 9 files changed, 34 insertions(+), 34 deletions(-)
 rename examples/python/{fara_inference.py => qwen2_5_vl_inference.py} (98%)
 rename src/models/{fara_image_processor.cpp => qwen2_5_vl_image_processor.cpp} (90%)
 rename src/models/{fara_image_processor.h => qwen2_5_vl_image_processor.h} (81%)
 rename src/models/{fara_vl_model.cpp => qwen_vl_model.cpp} (86%)
 rename src/models/{fara_vl_model.h => qwen_vl_model.h} (76%)
 rename src/models/{fara_vl_vision.cpp => qwen_vl_vision.cpp} (98%)
 rename src/models/{fara_vl_vision.h => qwen_vl_vision.h} (95%)

diff --git a/examples/python/fara_inference.py b/examples/python/qwen2_5_vl_inference.py
similarity index 98%
rename from examples/python/fara_inference.py
rename to examples/python/qwen2_5_vl_inference.py
index 71cbe946a1..98f497cc81 100644
--- a/examples/python/fara_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -26,7 +26,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     if not image_path.is_file():
         raise FileNotFoundError(f"Image file not found: {image_path}")
 
-    # Load model and create multimodal processor (uses C++ FaraImageProcessor)
+    # Load model and create multimodal processor (uses C++ Qwen2_5VLImageProcessor)
     model = og.Model(str(config_dir))
     
     tokenizer = og.Tokenizer(model)
@@ -112,7 +112,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Fara VLM inference using onnxruntime-genai")
+    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai")
     parser.add_argument("--config_dir", "--model_path", type=Path, required=True, help="Directory with genai_config.json")
     parser.add_argument("--image", type=Path, required=True, help="Path to input image")
     parser.add_argument("--prompt", type=str, default="Describe the image.", help="User text prompt")
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 58b457d2fd..573bd09af9 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -19,8 +19,8 @@
 #include "multi_modal.h"
 #include "marian.h"
 #include "decoder_only_pipeline.h"
-#include "fara_vl_model.h"
-#include "fara_image_processor.h"
+#include "qwen_vl_model.h"
+#include "qwen2_5_vl_image_processor.h"
 #include "../dml/interface.h"
 
 #if defined(_WIN32)
@@ -1195,8 +1195,8 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
 }
 
 std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
-  if (config->model.type == "fara")
-    return std::make_shared<Fara_PipelineModel>(std::move(config), ort_env);
+  if (config->model.type == "qwen2_5_vl")
+    return std::make_shared<Qwen2_5_VL_PipelineModel>(std::move(config), ort_env);
   if (config->model.type == "gpt2")
     return std::make_shared<Gpt_Model>(std::move(config), ort_env);
   if (ModelType::IsLLM(config->model.type))
@@ -1293,7 +1293,7 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
           {"gemma3", Processor::Create<GemmaImageProcessor>},
-          {"fara", Processor::Create<FaraImageProcessor>}} {
+          {"qwen2_5_vl", Processor::Create<Qwen2_5VLImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
     processor_ = processor->second(config, session_info);
diff --git a/src/models/model_type.h b/src/models/model_type.h
index 016c746716..8a71e0e105 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,7 +18,7 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "fara"};
+    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2_5_vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 
diff --git a/src/models/fara_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
similarity index 90%
rename from src/models/fara_image_processor.cpp
rename to src/models/qwen2_5_vl_image_processor.cpp
index 68fe5c5230..e1bbe61b7b 100644
--- a/src/models/fara_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -3,12 +3,12 @@
 
 #include "../generators.h"
 #include "model.h"
-#include "fara_image_processor.h"
+#include "qwen2_5_vl_image_processor.h"
 #include <numeric>
 
 namespace Generators {
 
-FaraImageProcessor::FaraImageProcessor(Config& config, const SessionInfo& session_info) {
+Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info) {
   const auto processor_config = (config.config_path / fs::path("processor_config.json")).string();
   if (!fs::exists(config.config_path / fs::path("processor_config.json"))) {
     throw std::runtime_error("processor_config.json not found at: " + processor_config);
@@ -26,9 +26,9 @@ FaraImageProcessor::FaraImageProcessor(Config& config, const SessionInfo& sessio
   }
 }
 
-std::unique_ptr<NamedTensors> FaraImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
+std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
   if (!payload.images) {
-    throw std::runtime_error("No images provided to FaraImageProcessor");
+    throw std::runtime_error("No images provided to Qwen2.5VLImageProcessor");
   }
 
   std::string prompt = std::string(payload.prompt);
diff --git a/src/models/fara_image_processor.h b/src/models/qwen2_5_vl_image_processor.h
similarity index 81%
rename from src/models/fara_image_processor.h
rename to src/models/qwen2_5_vl_image_processor.h
index 46bfbd6a79..85430dbb5d 100644
--- a/src/models/fara_image_processor.h
+++ b/src/models/qwen2_5_vl_image_processor.h
@@ -9,8 +9,8 @@
 
 namespace Generators {
 
-struct FaraImageProcessor : Processor {
-  FaraImageProcessor(Config& config, const SessionInfo& session_info);
+struct Qwen2_5VLImageProcessor : Processor {
+  Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info);
 
   std::unique_ptr<NamedTensors> Process(const Tokenizer& tokenizer, const Payload& payload) const override;
 
diff --git a/src/models/fara_vl_model.cpp b/src/models/qwen_vl_model.cpp
similarity index 86%
rename from src/models/fara_vl_model.cpp
rename to src/models/qwen_vl_model.cpp
index 6c0ba737d1..b2e412dd6b 100644
--- a/src/models/fara_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -1,4 +1,4 @@
-#include "fara_vl_model.h"
+#include "qwen_vl_model.h"
 #include "model.h"
 #include "onnxruntime_api.h"
 #include "../logging.h"
@@ -7,7 +7,7 @@
 
 namespace Generators {
 
-Fara_PipelineModel::Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
+Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
   : DecoderOnlyPipelineModel(std::move(config), ort_env) {  
   if (config_->model.vision.pipeline.empty() || !config_->model.vision.window_indexing.has_value()) return;
 
@@ -42,18 +42,18 @@ Fara_PipelineModel::Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& o
     spatial_merge, wnd_idx_path, use_qnn_attn);
 }
 
-std::unique_ptr<State> Fara_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
-                                                               const GeneratorParams& params) const {
-  return std::make_unique<Fara_PipelineState>(*this, sequence_lengths, params);
+std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
+                                                             const GeneratorParams& params) const {
+  return std::make_unique<Qwen2_5_VL_PipelineState>(*this, sequence_lengths, params);
 }
 
-Fara_PipelineState::Fara_PipelineState(const Fara_PipelineModel& model,
+Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
                                                    DeviceSpan<int32_t> sequence_lengths,
                                                    const GeneratorParams& params)
   : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} {
 }
 
-void Fara_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
+void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
   DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
   
   if (vision_ran_ || !vl_model_.vision_pipeline_) return;
@@ -91,7 +91,7 @@ void Fara_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inp
   vision_ran_ = true;
 }
 
-void Fara_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
+void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
   if (stage_id != 0 || !vision_ran_) return;
   
   const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0];
@@ -100,7 +100,7 @@ void Fara_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& n
   }
 }
 
-void Fara_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
+void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
                                                      DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
   if (it == ortvalue_store_.end() || !it->second) return;
diff --git a/src/models/fara_vl_model.h b/src/models/qwen_vl_model.h
similarity index 76%
rename from src/models/fara_vl_model.h
rename to src/models/qwen_vl_model.h
index 0c541db27f..82d89afa67 100644
--- a/src/models/fara_vl_model.h
+++ b/src/models/qwen_vl_model.h
@@ -1,16 +1,16 @@
 #pragma once
 
 #include "decoder_only_pipeline.h"
-#include "fara_vl_vision.h"
+#include "qwen_vl_vision.h"
 
 namespace Generators {
 
-// Fara VLM pipeline model integrating vision pipeline + decoder pipeline.
+// Qwen2.5-VL pipeline model integrating vision pipeline + decoder pipeline.
 // Loads decoder pipeline sessions (handled by base) and constructs vision pipeline sessions.
 // State runs vision once (on first SetExtraInputs when pixel_values arrives) to produce image_features
 // which are injected into embeddings output via existing injection logic in DecoderOnlyPipelineState.
-struct Fara_PipelineModel : public DecoderOnlyPipelineModel {
-  Fara_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
+struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
+  Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
 
   std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths,
                                      const GeneratorParams& params) const override;
@@ -19,8 +19,8 @@ struct Fara_PipelineModel : public DecoderOnlyPipelineModel {
   std::unique_ptr<FaraVisionPipeline> vision_pipeline_;
 };
 
-struct Fara_PipelineState : public DecoderOnlyPipelineState {
-  Fara_PipelineState(const Fara_PipelineModel& model,
+struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
+  Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
                            DeviceSpan<int32_t> sequence_lengths,
                            const GeneratorParams& params);
 
@@ -33,7 +33,7 @@ struct Fara_PipelineState : public DecoderOnlyPipelineState {
   void InjectVisionEmbeddings(const std::string& embeddings_output_name,
                              DeviceSpan<int32_t>& input_token_ids);
   
-  const Fara_PipelineModel& vl_model_;
+  const Qwen2_5_VL_PipelineModel& vl_model_;
   bool vision_ran_{false};
   std::unique_ptr<OrtValue> image_features_value_;
   std::vector<float> image_features_buffer_; // backing storage for OrtValue
diff --git a/src/models/fara_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
similarity index 98%
rename from src/models/fara_vl_vision.cpp
rename to src/models/qwen_vl_vision.cpp
index 895d2361c2..28101ab438 100644
--- a/src/models/fara_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-// Fara VLM Vision pipeline implementation with optional QNN EP for vision attention stage.
+// Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage.
 
-#include "fara_vl_vision.h"
+#include "qwen_vl_vision.h"
 
 #include <fstream>
 #include <stdexcept>
diff --git a/src/models/fara_vl_vision.h b/src/models/qwen_vl_vision.h
similarity index 95%
rename from src/models/fara_vl_vision.h
rename to src/models/qwen_vl_vision.h
index 24b47e7414..5e67be306b 100644
--- a/src/models/fara_vl_vision.h
+++ b/src/models/qwen_vl_vision.h
@@ -1,14 +1,14 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 //
-// Fara VLM Vision pipeline support (initial skeleton).
+// Qwen VL Vision pipeline support (initial skeleton).
 // Executes three ONNX models in sequence:
 //   1) Patch Embedding  : pixel_values -> hidden
 //   2) Vision Attention : hidden -> hidden
 //   3) Patch Merger      : hidden -> merged embeddings
 // Performs window expansion/reordering using wnd_idx, then final reverse ordering.
 //
-// This is a minimal starting point to integrate Fara VLM vision processing
+// This is a minimal starting point to integrate Qwen2.5-VL vision processing
 // into onnxruntime-genai. Further work will: (a) connect to Config parsing,
 // (b) expose via MultiModal pipeline, (c) add EP selection, (d) reuse buffers.
 

From c994f4b61012161edcc4165e82685f5d8d5541b1 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 12:33:30 -0800
Subject: [PATCH 10/25] Revert "Update name"

This reverts commit 111530ebc0260b04db867adfaa8ddf81bfac2fca.
---
 src/models/qwen_vl_model.cpp  | 2 +-
 src/models/qwen_vl_model.h    | 2 +-
 src/models/qwen_vl_vision.cpp | 6 +++---
 src/models/qwen_vl_vision.h   | 8 ++++----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index b2e412dd6b..673a484171 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -37,7 +37,7 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
   auto wnd_idx_path = (config_->config_path / fs::path(config_->model.vision.window_indexing->filename)).string();
   int spatial_merge = config_->model.vision.window_indexing->spatial_merge_size;
   
-  vision_pipeline_ = std::make_unique<FaraVisionPipeline>(
+  vision_pipeline_ = std::make_unique<QwenVisionPipeline>(
     ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
     spatial_merge, wnd_idx_path, use_qnn_attn);
 }
diff --git a/src/models/qwen_vl_model.h b/src/models/qwen_vl_model.h
index 82d89afa67..27f12649c3 100644
--- a/src/models/qwen_vl_model.h
+++ b/src/models/qwen_vl_model.h
@@ -16,7 +16,7 @@ struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
                                      const GeneratorParams& params) const override;
 
   // Vision pipeline shared across states (sessions reused).
-  std::unique_ptr<FaraVisionPipeline> vision_pipeline_;
+  std::unique_ptr<QwenVisionPipeline> vision_pipeline_;
 };
 
 struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 28101ab438..e3a5eef8c5 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -77,7 +77,7 @@ std::vector<int64_t> Load1DNpyIndices(const std::string& file_path) {
   return result;
 }
 
-FaraVisionPipeline::FaraVisionPipeline(OrtEnv& env,
+QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
                                        const std::string& patch_embed_model,
                                        const std::string& vision_attn_model,
                                        const std::string& patch_merger_model,
@@ -131,7 +131,7 @@ FaraVisionPipeline::FaraVisionPipeline(OrtEnv& env,
   for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
 }
 
-std::unique_ptr<OrtValue> FaraVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
+std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
   auto memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
   std::span<float> data_span(const_cast<float*>(data), count);
   std::span<const int64_t> shape_span(shape.data(), shape.size());
@@ -140,7 +140,7 @@ std::unique_ptr<OrtValue> FaraVisionPipeline::CreateTensor(const float* data, si
 
 // Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
 
-std::vector<float> FaraVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
+std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
   if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
     throw std::runtime_error("Vision pipeline sessions not initialized");
   }
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
index 5e67be306b..64d2171096 100644
--- a/src/models/qwen_vl_vision.h
+++ b/src/models/qwen_vl_vision.h
@@ -28,8 +28,8 @@ namespace Generators {
 std::vector<int64_t> Load1DNpyIndices(const std::string& file_path);
 
 // Internal vision pipeline (no external DLL interface required after Python binding removal).
-struct FaraVisionPipeline {
-  FaraVisionPipeline(OrtEnv& env,
+struct QwenVisionPipeline {
+  QwenVisionPipeline(OrtEnv& env,
                      const std::string& patch_embed_model,
                      const std::string& vision_attn_model,
                      const std::string& patch_merger_model,
@@ -40,8 +40,8 @@ struct FaraVisionPipeline {
   bool use_qnn_attn_{};
   std::string qnn_backend_path_{};
 
-  FaraVisionPipeline(const FaraVisionPipeline&) = delete;
-  FaraVisionPipeline& operator=(const FaraVisionPipeline&) = delete;
+  QwenVisionPipeline(const QwenVisionPipeline&) = delete;
+  QwenVisionPipeline& operator=(const QwenVisionPipeline&) = delete;
 
   // Run vision pipeline.
   // pixel_values: float32 tensor with shape [S, C] or [B, C, H, W] depending on export (caller provides shape).

From 38de49678ad4c3badfa8d0fe992ecc157384e071 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 12:59:10 -0800
Subject: [PATCH 11/25] Revert name changes

---
 src/models/model.cpp    | 3 ++-
 src/models/model_type.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/models/model.cpp b/src/models/model.cpp
index 573bd09af9..adc355f970 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -1195,7 +1195,7 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
 }
 
 std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
-  if (config->model.type == "qwen2_5_vl")
+  if (config->model.type == "fara" || config->model.type == "qwen2_5_vl")
     return std::make_shared<Qwen2_5_VL_PipelineModel>(std::move(config), ort_env);
   if (config->model.type == "gpt2")
     return std::make_shared<Gpt_Model>(std::move(config), ort_env);
@@ -1293,6 +1293,7 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
           {"gemma3", Processor::Create<GemmaImageProcessor>},
+          {"fara", Processor::Create<Qwen2_5VLImageProcessor>},
           {"qwen2_5_vl", Processor::Create<Qwen2_5VLImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
diff --git a/src/models/model_type.h b/src/models/model_type.h
index 8a71e0e105..a21c4156cf 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,7 +18,7 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2_5_vl"};
+    static constexpr std::array<std::string_view, 4> VLM = {"fara", "gemma3", "phi3v", "qwen2_5_vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 

From 21d93262ac7787bcd37740390a49bc39f8fb8121 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Wed, 3 Dec 2025 16:41:24 +0800
Subject: [PATCH 12/25] fix QNN loading for vision model

---
 examples/python/qwen2_5_vl_inference.py |  4 +++
 src/models/qwen_vl_vision.cpp           | 38 +++++++++++++++++++++----
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index 98f497cc81..80af5dd584 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -1,3 +1,7 @@
+import winml
+print(winml.register_execution_providers(ort=False, ort_genai=True))
+
+
 import argparse
 import json
 import sys
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index e3a5eef8c5..080253adfc 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -3,6 +3,7 @@
 // Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage.
 
 #include "qwen_vl_vision.h"
+#include "../generators.h"
 
 #include <fstream>
 #include <stdexcept>
@@ -105,17 +106,42 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
 
   if (use_qnn_attn_) {
     // Ensure QNN provider is available
-    auto providers = Ort::GetAvailableProviders();
-    bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
-    if (!has_qnn) {
-      throw std::runtime_error("QNNExecutionProvider requested for vision attention but not available in this build");
-    }
+    // auto providers = Ort::GetAvailableProviders();
+    // bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
+    // if (!has_qnn) {
+    //   throw std::runtime_error("QNNExecutionProvider requested for vision attention but not available in this build");
+    // }
     auto so = OrtSessionOptions::Create();
+
+      size_t num_devices = 0;
+      const OrtEpDevice* const* device_ptrs = nullptr;
+      Ort::GetEpDevices(&GetOrtEnv(), &device_ptrs, &num_devices);
+
+      std::vector<const OrtEpDevice*> ep_devices_ptrs;
+      ep_devices_ptrs.reserve(num_devices);
+
+      for (size_t i = 0; i < num_devices; ++i) {
+        if (Ort::api->EpDevice_EpName(device_ptrs[i]) == std::string("QNNExecutionProvider")) {
+          ep_devices_ptrs.push_back(device_ptrs[i]);
+          // std::cout << "added QNN EP for vision" << std::endl;
+        }
+      }
+
+      if (ep_devices_ptrs.empty()) {
+        throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
+      }
+
     so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
     // QNN provider options
     const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
     const char* values[] = { qnn_backend_path_.c_str(), "burst", "3", "60" };
-    so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
+
+    Ort::api->SessionOptionsAppendExecutionProvider_V2(
+      so.get(),
+      &GetOrtEnv(),
+      ep_devices_ptrs.data(), ep_devices_ptrs.size(),
+      keys, values, 4);
+
     vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), so.get());
   } else {
     vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), nullptr);

From 9036f5a044b87af6afc531b14a7a1b520ce73a52 Mon Sep 17 00:00:00 2001
From: Yi Ren <reny@microsoft.com>
Date: Wed, 3 Dec 2025 17:03:31 +0800
Subject: [PATCH 13/25] support both ort-qnn and winml

---
 src/models/qwen_vl_vision.cpp | 39 ++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 080253adfc..8265c7998e 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -106,41 +106,42 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
 
   if (use_qnn_attn_) {
     // Ensure QNN provider is available
-    // auto providers = Ort::GetAvailableProviders();
-    // bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
-    // if (!has_qnn) {
-    //   throw std::runtime_error("QNNExecutionProvider requested for vision attention but not available in this build");
-    // }
     auto so = OrtSessionOptions::Create();
 
+    so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
+    // QNN provider options
+    const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
+    const char* values[] = { qnn_backend_path_.c_str(), "burst", "3", "60" };
+
+    auto providers = Ort::GetAvailableProviders();
+    bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
+    if (has_qnn) {
+      so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
+    }
+    else {
+      // Use registered QNN EP
       size_t num_devices = 0;
       const OrtEpDevice* const* device_ptrs = nullptr;
       Ort::GetEpDevices(&GetOrtEnv(), &device_ptrs, &num_devices);
-
       std::vector<const OrtEpDevice*> ep_devices_ptrs;
       ep_devices_ptrs.reserve(num_devices);
-
       for (size_t i = 0; i < num_devices; ++i) {
         if (Ort::api->EpDevice_EpName(device_ptrs[i]) == std::string("QNNExecutionProvider")) {
           ep_devices_ptrs.push_back(device_ptrs[i]);
-          // std::cout << "added QNN EP for vision" << std::endl;
         }
       }
 
       if (ep_devices_ptrs.empty()) {
         throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
+      } else {
+        Ort::api->SessionOptionsAppendExecutionProvider_V2(
+          so.get(),
+          &GetOrtEnv(),
+          ep_devices_ptrs.data(), ep_devices_ptrs.size(),
+          keys, values, 4
+        );
       }
-
-    so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
-    // QNN provider options
-    const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
-    const char* values[] = { qnn_backend_path_.c_str(), "burst", "3", "60" };
-
-    Ort::api->SessionOptionsAppendExecutionProvider_V2(
-      so.get(),
-      &GetOrtEnv(),
-      ep_devices_ptrs.data(), ep_devices_ptrs.size(),
-      keys, values, 4);
+    }
 
     vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), so.get());
   } else {

From e44b48b81a165c1e618e80e064a20aba09567509 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 14:07:05 -0800
Subject: [PATCH 14/25] Address comments: input validation, warnings, vision buffer reuse

---
 examples/python/qwen2_5_vl_inference.py |  9 +++--
 src/models/model.cpp                    |  4 +++
 src/models/qwen_vl_model.cpp            |  8 +++++
 src/models/qwen_vl_vision.cpp           | 45 ++++++++++++++++---------
 src/models/qwen_vl_vision.h             |  7 ++++
 5 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index 80af5dd584..ae627796f3 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -58,6 +58,12 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
     # 3. Return properly formatted inputs (pixel_values, image_grid_thw, input_ids)
     inputs = processor(prompt, images=images)
 
+    if "input_ids" in inputs:
+        input_ids_tensor = inputs["input_ids"]
+        input_length = input_ids_tensor.shape()[1]
+    else:
+        input_length = len(tokenizer.encode(prompt))
+
     # Setup generation parameters
     try:
         with open(config_dir / "genai_config.json", "r") as f:
@@ -69,8 +75,7 @@ def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_
         context_len = 2048
         eos_ids = []
     
-    # Use max_length from config if available, otherwise use context_length
-    max_length = min(context_len, 2048)  # Cap at 2048 for generation
+    max_length = min(input_length + max_new_tokens, context_len)
     
     params = og.GeneratorParams(model)
     params.set_search_options(max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p,
diff --git a/src/models/model.cpp b/src/models/model.cpp
index adc355f970..d5aea73568 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -236,6 +236,10 @@ State::~State() {
 std::vector<int32_t> PadInputs(std::span<std::span<const int32_t>> sequences, int32_t pad_token_id) {
   bool pad_right_{true};
 
+  if (pad_token_id == 0 && g_log.enabled) {
+    Log("warning", "pad_token_id is 0, which may be uninitialized. Verify genai_config.json contains 'pad_token_id' field.");
+  }
+
   size_t max_length = 0;
   for (auto& sequence : sequences)
     max_length = std::max(max_length, sequence.size());
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 673a484171..8fcec48d07 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -136,6 +136,14 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
       image_embed_consumed_++;
     }
   }
+  
+  // Warn if there's a mismatch between image tokens and vision features
+  if (image_embed_consumed_ != static_cast<size_t>(num_vision_tokens)) {
+    Log("warning", "Vision embedding mismatch: consumed " + std::to_string(image_embed_consumed_) + 
+                   " of " + std::to_string(num_vision_tokens) + " available vision tokens. " +
+                   "This may indicate a mismatch between the number of image placeholders in the prompt " +
+                   "and the number of images provided.");
+  }
 }
 
 } // namespace Generators
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 8265c7998e..12ed7c39a1 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -61,7 +61,20 @@ std::vector<int64_t> Load1DNpyIndices(const std::string& file_path) {
   if (shape_str.empty()) throw std::runtime_error("Empty shape in npy header");
   if (shape_str.back() == ',') shape_str.pop_back();
   int64_t N = std::stoll(shape_str);
-  if (N <= 0) throw std::runtime_error("Invalid shape size in npy header");
+
+  // Validate array size to prevent OOM or malicious files
+  constexpr int64_t MAX_REASONABLE_SIZE = 100000000;  // 100M elements max
+  if (N <= 0 || N > MAX_REASONABLE_SIZE) {
+    throw std::runtime_error("Invalid or excessive array size in npy header: N=" + std::to_string(N) + 
+                           " (max allowed: " + std::to_string(MAX_REASONABLE_SIZE) + ")");
+  }
+
+  // Verify system is little-endian (matches npy file format expectation)
+  constexpr uint32_t endian_test = 0x01020304;
+  const bool is_little_endian = (*reinterpret_cast<const uint8_t*>(&endian_test) == 0x04);
+  if (!is_little_endian) {
+    throw std::runtime_error("System is not little-endian; cannot safely parse <i4/<i8 npy files");
+  }
 
   std::vector<int64_t> result;
   result.resize(static_cast<size_t>(N));
@@ -182,8 +195,8 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   const int64_t num_patches = pixel_shape[1];
   const int64_t hidden_dim = 1280;
   std::vector<int64_t> pe_out_shape{num_patches, hidden_dim};
-  std::vector<float> pe_out_buf(num_patches * hidden_dim);
-  auto pe_out_tensor = CreateTensor(pe_out_buf.data(), pe_out_buf.size(), pe_out_shape);
+  pe_out_buf_.resize(num_patches * hidden_dim);
+  auto pe_out_tensor = CreateTensor(pe_out_buf_.data(), pe_out_buf_.size(), pe_out_shape);
   
   auto pe_out_name = patch_embed_session_->GetOutputName(0);
   const char* pe_output_names[] = { pe_out_name.c_str() };
@@ -199,38 +212,38 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
     throw std::runtime_error("Invalid window configuration for vision pipeline");
   }
 
-  std::vector<float> reordered(seq_len * hidden_dim);
+  reordered_buf_.resize(seq_len * hidden_dim);
   for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
     int64_t src_w = wnd_idx_[dst_w];
     if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
     size_t offset_size = window_area * hidden_dim;
-    std::memcpy(reordered.data() + dst_w * offset_size, 
-                pe_out_buf.data() + src_w * offset_size,
+    std::memcpy(reordered_buf_.data() + dst_w * offset_size, 
+                pe_out_buf_.data() + src_w * offset_size,
                 offset_size * sizeof(float));
   }
 
   std::vector<int64_t> attn_shape{seq_len, hidden_dim};
-  auto attn_in_tensor = CreateTensor(reordered.data(), reordered.size(), attn_shape);
+  auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape);
   const char* attn_input_names[] = {"hidden"};
   OrtValue* attn_inputs[] = { attn_in_tensor.get() };
 
-  std::vector<float> attn_out_buf(seq_len * hidden_dim);
-  auto attn_out_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_shape);
+  attn_out_buf_.resize(seq_len * hidden_dim);
+  auto attn_out_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
   auto attn_out_name = vision_attn_session_->GetOutputName(0);
   const char* attn_output_names[] = { attn_out_name.c_str() };
   OrtValue* attn_outputs[] = { attn_out_tensor.get() };
   
   vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
 
-  auto merger_in_tensor = CreateTensor(attn_out_buf.data(), attn_out_buf.size(), attn_shape);
+  auto merger_in_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
   const char* merger_input_names[] = {"hidden"};
   OrtValue* merger_inputs[] = { merger_in_tensor.get() };
   
   const int64_t merged_seq_len = num_windows;  // One token per window after merging
   const int64_t merged_hidden = 3584;
   std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
-  std::vector<float> merger_out_buf(merged_seq_len * merged_hidden);
-  auto merger_out_tensor = CreateTensor(merger_out_buf.data(), merger_out_buf.size(), merger_shape);
+  merger_out_buf_.resize(merged_seq_len * merged_hidden);
+  auto merger_out_tensor = CreateTensor(merger_out_buf_.data(), merger_out_buf_.size(), merger_shape);
   auto merger_out_name = patch_merger_session_->GetOutputName(0);
   const char* merger_output_names[] = { merger_out_name.c_str() };
   OrtValue* merger_outputs[] = { merger_out_tensor.get() };
@@ -241,16 +254,16 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
     throw std::runtime_error("Vision pipeline reverse index size mismatch");
   }
 
-  std::vector<float> final_embeddings(merger_out_buf.size());
+  final_embeddings_buf_.resize(merger_out_buf_.size());
   for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
-    std::memcpy(final_embeddings.data() + dst_w * merged_hidden,
-                merger_out_buf.data() + rev_idx_[dst_w] * merged_hidden,
+    std::memcpy(final_embeddings_buf_.data() + dst_w * merged_hidden,
+                merger_out_buf_.data() + rev_idx_[dst_w] * merged_hidden,
                 merged_hidden * sizeof(float));
   }
 
   last_seq_len_ = merged_seq_len;
   last_hidden_size_ = merged_hidden;
-  return final_embeddings;
+  return final_embeddings_buf_;
 }
 
 } // namespace Generators
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
index 64d2171096..39ee998f05 100644
--- a/src/models/qwen_vl_vision.h
+++ b/src/models/qwen_vl_vision.h
@@ -73,6 +73,13 @@ struct QwenVisionPipeline {
   OrtEnv& env_;
   int64_t last_seq_len_{0};
   int64_t last_hidden_size_{0};
+
+  // Reusable buffers to avoid repeated allocation/deallocation
+  mutable std::vector<float> pe_out_buf_;
+  mutable std::vector<float> reordered_buf_;
+  mutable std::vector<float> attn_out_buf_;
+  mutable std::vector<float> merger_out_buf_;
+  mutable std::vector<float> final_embeddings_buf_;
 };
 
 } // namespace Generators

From 1ec76d5d6018eaeb5856a55c757d9de8eb2637fc Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 14:29:51 -0800
Subject: [PATCH 15/25] Address comments: separate image_grid_thw input, drop numpy import

---
 examples/python/qwen2_5_vl_inference.py | 1 -
 src/config.cpp                          | 4 ++--
 src/config.h                            | 1 +
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
index ae627796f3..88e807ce1f 100644
--- a/examples/python/qwen2_5_vl_inference.py
+++ b/examples/python/qwen2_5_vl_inference.py
@@ -5,7 +5,6 @@
 import argparse
 import json
 import sys
-import numpy as np
 from pathlib import Path
 
 import onnxruntime_genai as og
diff --git a/src/config.cpp b/src/config.cpp
index 28bbca39f8..8d749b14b3 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -621,8 +621,8 @@ struct VisionInputs_Element : JSON::Element {
       v_.pixel_values = JSON::Get<std::string_view>(value);
     } else if (name == "image_sizes") {
       v_.image_sizes = JSON::Get<std::string_view>(value);
-    } else if (name == "image_grid_thw") { // accept alternate naming, map to image_sizes
-      v_.image_sizes = JSON::Get<std::string_view>(value);
+    } else if (name == "image_grid_thw") {
+      v_.image_grid_thw = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
     } else {
diff --git a/src/config.h b/src/config.h
index 4163156eb5..350853d2a2 100644
--- a/src/config.h
+++ b/src/config.h
@@ -180,6 +180,7 @@ struct Config {
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
+        std::string image_grid_thw{Defaults::ImageSizesName};  // Qwen2.5-VL uses image_grid_thw, defaults to image_sizes
         std::string attention_mask{Defaults::ImageAttentionMaskName};  // image attention mask
       } inputs;
 

From 6d93d02818f3ffadc5e19d40e04f378c02b34a67 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 14:30:15 -0800
Subject: [PATCH 16/25] Fix Mac pipeline

---
 test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
index 6cf90b0245..2a7b891028 100644
--- a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
+++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
@@ -75,7 +75,7 @@
         <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
         <Visible>false</Visible>
     </None>
-    <None Condition="Exists('$(OrtLibDir)\libonnxruntime.dylib')" Include="$(OrtLibDir)\libonnxruntime.dylib">
+    <None Condition="Exists('$(NativeBuildOutputDir)\libonnxruntime.dylib')" Include="$(NativeBuildOutputDir)\libonnxruntime.dylib">
         <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
         <Visible>false</Visible>
     </None>

From f8fe87d166fef3f04cdd7a15a9c7c585dc5860ab Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 17:39:06 -0800
Subject: [PATCH 17/25] Remove window indexing npy and add dynamic window
 indexing

---
 src/config.cpp                |  23 ----
 src/config.h                  |   6 -
 src/models/qwen_vl_model.cpp  |  43 ++++++-
 src/models/qwen_vl_vision.cpp | 226 +++++++++++++++++-----------------
 src/models/qwen_vl_vision.h   |  36 ++----
 5 files changed, 166 insertions(+), 168 deletions(-)

diff --git a/src/config.cpp b/src/config.cpp
index 8d749b14b3..c3e5e28c69 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -720,23 +720,6 @@ struct VisionPipeline_Element : JSON::Element {
   VisionPipelineModelObject_Element object_{v_};
 };
 
-struct WindowIndexing_Element : JSON::Element {
-  explicit WindowIndexing_Element(Config::Model::Vision::WindowIndexing& v) : v_{v} {}
-
-  void OnValue(std::string_view name, JSON::Value value) override {
-    if (name == "filename") {
-      v_.filename = JSON::Get<std::string_view>(value);
-    } else if (name == "spatial_merge_size") {
-      v_.spatial_merge_size = static_cast<int>(JSON::Get<double>(value));
-    } else {
-      throw JSON::unknown_value_error{};
-    }
-  }
-
- private:
-  Config::Model::Vision::WindowIndexing& v_;
-};
-
 struct Vision_Element : JSON::Element {
   explicit Vision_Element(Config::Model::Vision& v) : v_{v} {}
 
@@ -769,11 +752,6 @@ struct Vision_Element : JSON::Element {
     if (name == "outputs") {
       return outputs_;
     }
-    if (name == "window_indexing") {
-      v_.window_indexing = Config::Model::Vision::WindowIndexing{};
-      window_indexing_element_ = std::make_unique<WindowIndexing_Element>(*v_.window_indexing);
-      return *window_indexing_element_;
-    }
     // Support object-style pipeline for vision: "pipeline": { "patch_embed": { ... }, ... }
     if (name == "pipeline") {
       vision_pipeline_object_ = std::make_unique<VisionPipelineModelObject_Element>(v_.pipeline);
@@ -795,7 +773,6 @@ struct Vision_Element : JSON::Element {
   std::unique_ptr<RunOptions_Element> run_options_;
   VisionInputs_Element inputs_{v_.inputs};
   VisionOutputs_Element outputs_{v_.outputs};
-  std::unique_ptr<WindowIndexing_Element> window_indexing_element_;
   VisionPipeline_Element pipeline_element_{v_.pipeline};
   std::unique_ptr<VisionPipelineModelObject_Element> vision_pipeline_object_; // object-style pipeline support
 };
diff --git a/src/config.h b/src/config.h
index 350853d2a2..52b7584314 100644
--- a/src/config.h
+++ b/src/config.h
@@ -171,12 +171,6 @@ struct Config {
       };
       std::vector<PipelineModel> pipeline;  // Ordered pipeline models
 
-      struct WindowIndexing {
-        std::string filename;       // Path to wnd_idx.npy
-        int spatial_merge_size{};   // Spatial merge size used for window expansion
-      };
-      std::optional<WindowIndexing> window_indexing; // Optional window indexing configuration
-
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 8fcec48d07..c9fb1237d7 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -9,7 +9,7 @@ namespace Generators {
 
 Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
   : DecoderOnlyPipelineModel(std::move(config), ort_env) {  
-  if (config_->model.vision.pipeline.empty() || !config_->model.vision.window_indexing.has_value()) return;
+  if (config_->model.vision.pipeline.empty()) return;
 
   // Find vision pipeline stage paths
   auto find_stage = [&](const std::string& id) -> std::string {
@@ -34,12 +34,12 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
     }
   }
 
-  auto wnd_idx_path = (config_->config_path / fs::path(config_->model.vision.window_indexing->filename)).string();
-  int spatial_merge = config_->model.vision.window_indexing->spatial_merge_size;
+  // Default spatial merge size
+  constexpr int spatial_merge = 2;
   
   vision_pipeline_ = std::make_unique<QwenVisionPipeline>(
     ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
-    spatial_merge, wnd_idx_path, use_qnn_attn);
+    spatial_merge, use_qnn_attn);
 }
 
 std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
@@ -59,12 +59,15 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   if (vision_ran_ || !vl_model_.vision_pipeline_) return;
 
   OrtValue* pixel_values_val = nullptr;
+  OrtValue* image_grid_thw_val = nullptr;
   const auto& pixel_name = vl_model_.config_->model.vision.inputs.pixel_values;
+  const auto& grid_thw_name = vl_model_.config_->model.vision.inputs.image_grid_thw;
   
   for (const auto& input : extra_inputs) {
     if (input.name == pixel_name) {
       pixel_values_val = input.tensor->GetOrtTensor();
-      break;
+    } else if (input.name == grid_thw_name) {
+      image_grid_thw_val = input.tensor->GetOrtTensor();
     }
   }
   if (!pixel_values_val) return;
@@ -74,8 +77,36 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   const float* pixel_data = pixel_values_val->GetTensorMutableData<float>();
   if (!pixel_data) return;
 
+  // Extract grid_thw if provided
+  std::vector<int64_t> grid_thw;
+  if (image_grid_thw_val) {
+    auto grid_shape = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetShape();
+    auto element_type = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetElementType();
+    
+    if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
+      const int64_t* grid_data = image_grid_thw_val->GetTensorData<int64_t>();
+      size_t grid_count = 1;
+      for (auto dim : grid_shape) grid_count *= dim;
+      
+      // Expect [batch, 3] or [3] shape - take last 3 values as [t, h, w]
+      if (grid_count >= 3) {
+        grid_thw = {grid_data[grid_count - 3], grid_data[grid_count - 2], grid_data[grid_count - 1]};
+      }
+    } else if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) {
+      const int32_t* grid_data = image_grid_thw_val->GetTensorData<int32_t>();
+      size_t grid_count = 1;
+      for (auto dim : grid_shape) grid_count *= dim;
+      
+      if (grid_count >= 3) {
+        grid_thw = {static_cast<int64_t>(grid_data[grid_count - 3]), 
+                    static_cast<int64_t>(grid_data[grid_count - 2]), 
+                    static_cast<int64_t>(grid_data[grid_count - 1])};
+      }
+    }
+  }
+
   try {
-    image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec);
+    image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec, grid_thw);
   } catch (const std::exception&) {
     return;  // Silent failure - pipeline already logs errors
   }
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 12ed7c39a1..730dd7c393 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -13,96 +13,21 @@
 
 namespace Generators {
 
-// Minimal .npy reader for 1D integer arrays.
-// Only handles C-order, little-endian, shape (N,), for dtypes '<i4' or '<i8'.
-std::vector<int64_t> Load1DNpyIndices(const std::string& file_path) {
-  std::ifstream fin(file_path, std::ios::binary);
-  if (!fin) throw std::runtime_error("Failed to open npy file: " + file_path);
-
-  // Read magic string
-  char magic[6];
-  fin.read(magic, 6);
-  if (std::strncmp(magic, "\x93NUMPY", 6) != 0) {
-    throw std::runtime_error("Invalid npy header (magic mismatch) for: " + file_path);
-  }
-  // Version
-  unsigned char ver_major; unsigned char ver_minor;
-  fin.read(reinterpret_cast<char*>(&ver_major), 1);
-  fin.read(reinterpret_cast<char*>(&ver_minor), 1);
-  uint16_t header_len_le;
-  fin.read(reinterpret_cast<char*>(&header_len_le), 2); // little endian
-  const uint16_t header_len = header_len_le;
-  std::string header(header_len, '\0');
-  fin.read(header.data(), header_len);
-
-  auto find_field = [&](const std::string& key) {
-    auto pos = header.find(key);
-    if (pos == std::string::npos) return std::string();
-    return header.substr(pos, header.size() - pos);
-  };
-
-  // dtype
-  auto descr_pos = header.find("'descr':");
-  if (descr_pos == std::string::npos) throw std::runtime_error("Missing 'descr' in npy header");
-  auto descr_start = header.find("'", descr_pos + 8);
-  auto descr_end = header.find("'", descr_start + 1);
-  std::string dtype = header.substr(descr_start + 1, descr_end - descr_start - 1);
-  bool is_int32 = (dtype == "<i4");
-  bool is_int64 = (dtype == "<i8");
-  if (!is_int32 && !is_int64) throw std::runtime_error("Unsupported dtype in npy (expected <i4 or <i8): " + dtype);
-
-  auto shape_pos = header.find("'shape':");
-  if (shape_pos == std::string::npos) throw std::runtime_error("Missing 'shape' in npy header");
-  auto paren_start = header.find("(", shape_pos);
-  auto paren_end = header.find(")", paren_start);
-  std::string shape_str = header.substr(paren_start + 1, paren_end - paren_start - 1);
-  // shape like "1234," or "1234" depending on version
-  shape_str.erase(std::remove(shape_str.begin(), shape_str.end(), ' '), shape_str.end());
-  if (shape_str.empty()) throw std::runtime_error("Empty shape in npy header");
-  if (shape_str.back() == ',') shape_str.pop_back();
-  int64_t N = std::stoll(shape_str);
-
-  // Validate array size to prevent OOM or malicious files
-  constexpr int64_t MAX_REASONABLE_SIZE = 100000000;  // 100M elements max
-  if (N <= 0 || N > MAX_REASONABLE_SIZE) {
-    throw std::runtime_error("Invalid or excessive array size in npy header: N=" + std::to_string(N) + 
-                           " (max allowed: " + std::to_string(MAX_REASONABLE_SIZE) + ")");
-  }
-
-  // Verify system is little-endian (matches npy file format expectation)
-  constexpr uint32_t endian_test = 0x01020304;
-  const bool is_little_endian = (*reinterpret_cast<const uint8_t*>(&endian_test) == 0x04);
-  if (!is_little_endian) {
-    throw std::runtime_error("System is not little-endian; cannot safely parse <i4/<i8 npy files");
-  }
-
-  std::vector<int64_t> result;
-  result.resize(static_cast<size_t>(N));
-
-  if (is_int32) {
-    std::vector<int32_t> tmp(N);
-    fin.read(reinterpret_cast<char*>(tmp.data()), N * sizeof(int32_t));
-    if (fin.gcount() != static_cast<std::streamsize>(N * sizeof(int32_t))) throw std::runtime_error("Unexpected EOF reading npy data");
-    for (int64_t i = 0; i < N; ++i) result[static_cast<size_t>(i)] = static_cast<int64_t>(tmp[static_cast<size_t>(i)]);
-  } else {
-    fin.read(reinterpret_cast<char*>(result.data()), N * sizeof(int64_t));
-    if (fin.gcount() != static_cast<std::streamsize>(N * sizeof(int64_t))) throw std::runtime_error("Unexpected EOF reading npy data");
-  }
-  return result;
-}
-
 QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
                                        const std::string& patch_embed_model,
                                        const std::string& vision_attn_model,
                                        const std::string& patch_merger_model,
                                        int64_t spatial_merge_size,
-                                       const std::string& wnd_idx_path,
                                        bool use_qnn_attn,
-                                       const std::string& qnn_backend_path)
+                                       const std::string& qnn_backend_path,
+                                       int64_t patch_size,
+                                       int64_t window_size)
   // Match declaration order to avoid MSVC C5038 warning-as-error
   : use_qnn_attn_(use_qnn_attn),
     qnn_backend_path_(qnn_backend_path),
     spatial_merge_size_(spatial_merge_size),
+    patch_size_(patch_size),
+    window_size_(window_size),
     env_(env) {
 
   // Convert std::string model paths to ORTCHAR_T for cross-platform (char or wchar_t)
@@ -160,15 +85,6 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
   } else {
     vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), nullptr);
   }
-
-  wnd_idx_ = Load1DNpyIndices(wnd_idx_path);
-  // Build reverse index (argsort)
-  rev_idx_.resize(wnd_idx_.size());
-  std::vector<std::pair<int64_t, size_t>> pairs;
-  pairs.reserve(wnd_idx_.size());
-  for (size_t i = 0; i < wnd_idx_.size(); ++i) pairs.emplace_back(wnd_idx_[i], i);
-  std::sort(pairs.begin(), pairs.end(), [](auto& a, auto& b){ return a.first < b.first; });
-  for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
 }
 
 std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {
@@ -180,11 +96,25 @@ std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, si
 
 // Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
 
-std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape) {
+std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape, 
+                                           const std::vector<int64_t>& grid_thw) {
   if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
     throw std::runtime_error("Vision pipeline sessions not initialized");
   }
   
+  // Calculate window indices dynamically if grid_thw provided
+  if (!grid_thw.empty() && grid_thw.size() == 3) {
+    wnd_idx_ = CalculateWindowIndex(grid_thw[0], grid_thw[1], grid_thw[2]);
+    
+    // Build reverse index (argsort)
+    rev_idx_.resize(wnd_idx_.size());
+    std::vector<std::pair<int64_t, size_t>> pairs;
+    pairs.reserve(wnd_idx_.size());
+    for (size_t i = 0; i < wnd_idx_.size(); ++i) pairs.emplace_back(wnd_idx_[i], i);
+    std::sort(pairs.begin(), pairs.end(), [](auto& a, auto& b){ return a.first < b.first; });
+    for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
+  }
+  
   size_t pixel_count = 1;
   for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
   auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
@@ -208,18 +138,27 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   const int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
   const int64_t num_windows = seq_len / window_area;
   
-  if (seq_len % window_area != 0 || static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
-    throw std::runtime_error("Invalid window configuration for vision pipeline");
-  }
-
+  // Apply window reordering if indices available
   reordered_buf_.resize(seq_len * hidden_dim);
-  for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
-    int64_t src_w = wnd_idx_[dst_w];
-    if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
-    size_t offset_size = window_area * hidden_dim;
-    std::memcpy(reordered_buf_.data() + dst_w * offset_size, 
-                pe_out_buf_.data() + src_w * offset_size,
-                offset_size * sizeof(float));
+  
+  if (!wnd_idx_.empty()) {
+    // Validate window configuration
+    if (seq_len % window_area != 0 || static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
+      throw std::runtime_error("Invalid window configuration for vision pipeline");
+    }
+    
+    // Apply window reordering
+    for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+      int64_t src_w = wnd_idx_[dst_w];
+      if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
+      size_t offset_size = window_area * hidden_dim;
+      std::memcpy(reordered_buf_.data() + dst_w * offset_size, 
+                  pe_out_buf_.data() + src_w * offset_size,
+                  offset_size * sizeof(float));
+    }
+  } else {
+    // No window reordering - use sequential order
+    std::memcpy(reordered_buf_.data(), pe_out_buf_.data(), seq_len * hidden_dim * sizeof(float));
   }
 
   std::vector<int64_t> attn_shape{seq_len, hidden_dim};
@@ -239,7 +178,7 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   const char* merger_input_names[] = {"hidden"};
   OrtValue* merger_inputs[] = { merger_in_tensor.get() };
   
-  const int64_t merged_seq_len = num_windows;  // One token per window after merging
+  const int64_t merged_seq_len = seq_len / window_area;  // One token per window after merging
   const int64_t merged_hidden = 3584;
   std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
   merger_out_buf_.resize(merged_seq_len * merged_hidden);
@@ -250,15 +189,22 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   
   patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1);
 
-  if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
-    throw std::runtime_error("Vision pipeline reverse index size mismatch");
-  }
-
   final_embeddings_buf_.resize(merger_out_buf_.size());
-  for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
-    std::memcpy(final_embeddings_buf_.data() + dst_w * merged_hidden,
-                merger_out_buf_.data() + rev_idx_[dst_w] * merged_hidden,
-                merged_hidden * sizeof(float));
+  
+  if (!rev_idx_.empty()) {
+    // Apply reverse reordering
+    if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
+      throw std::runtime_error("Vision pipeline reverse index size mismatch");
+    }
+    for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+      std::memcpy(final_embeddings_buf_.data() + dst_w * merged_hidden,
+                  merger_out_buf_.data() + rev_idx_[dst_w] * merged_hidden,
+                  merged_hidden * sizeof(float));
+    }
+  } else {
+    // No reverse reordering - use sequential order
+    std::memcpy(final_embeddings_buf_.data(), merger_out_buf_.data(), 
+                merger_out_buf_.size() * sizeof(float));
   }
 
   last_seq_len_ = merged_seq_len;
@@ -266,4 +212,64 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   return final_embeddings_buf_;
 }
 
+// Calculate window indices dynamically based on grid dimensions.
+// Matches HuggingFace transformers implementation:
+// https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L367
+//
+// grid_t, grid_h, grid_w: temporal/height/width extents of the patch grid before spatial merging.
+// Returns the row-major indices of the merged grid (grid_h and grid_w each divided by
+// spatial_merge_size_), listed window by window; cells that would be padding are omitted.
+// Throws std::runtime_error when the merge/patch/window sizes or the grid extents are unusable.
+std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w) {
+  // Both sizes are used as divisors below, so reject non-positive values up front
+  if (spatial_merge_size_ <= 0 || patch_size_ <= 0) {
+    throw std::runtime_error("Invalid spatial_merge_size or patch_size for window index calculation");
+  }
+  if (grid_t < 0 || grid_h < 0 || grid_w < 0) {
+    throw std::runtime_error("Invalid grid dimensions for window index calculation");
+  }
+
+  // Calculate LLM grid dimensions after spatial merging
+  const int64_t llm_grid_h = grid_h / spatial_merge_size_;
+  const int64_t llm_grid_w = grid_w / spatial_merge_size_;
+
+  // Calculate window size at the merged resolution
+  const int64_t vit_merger_window_size = window_size_ / spatial_merge_size_ / patch_size_;
+  if (vit_merger_window_size <= 0) {
+    // Integer division truncated to zero (or window_size_ is negative); it would be a zero divisor below
+    throw std::runtime_error("window_size is too small for the configured spatial_merge_size and patch_size");
+  }
+
+  // Windows per axis, rounded up so that partially filled edge windows are still visited
+  const int64_t num_windows_h = (llm_grid_h + vit_merger_window_size - 1) / vit_merger_window_size;
+  const int64_t num_windows_w = (llm_grid_w + vit_merger_window_size - 1) / vit_merger_window_size;
+
+  // Every real cell of the merged grid is emitted exactly once, so the final size is known
+  std::vector<int64_t> window_index;
+  window_index.reserve(static_cast<size_t>(grid_t * llm_grid_h * llm_grid_w));
+
+  // Visit cells in (grid_t, num_windows_h, num_windows_w, window_size, window_size) order.
+  // This groups patches by window instead of by spatial position. Cells past the real grid
+  // edge are padding and are skipped directly, so no padded index grid has to be allocated.
+  for (int64_t t = 0; t < grid_t; ++t) {
+    for (int64_t wh = 0; wh < num_windows_h; ++wh) {
+      for (int64_t ww = 0; ww < num_windows_w; ++ww) {
+        for (int64_t ph = 0; ph < vit_merger_window_size; ++ph) {
+          const int64_t h = wh * vit_merger_window_size + ph;
+          if (h >= llm_grid_h) break;  // remaining rows of this window are padding
+          for (int64_t pw = 0; pw < vit_merger_window_size; ++pw) {
+            const int64_t w = ww * vit_merger_window_size + pw;
+            if (w >= llm_grid_w) break;  // remaining columns of this row are padding
+
+            // Sequential (row-major) index of this cell in the unpadded merged grid
+            window_index.push_back(t * llm_grid_h * llm_grid_w + h * llm_grid_w + w);
+          }
+        }
+      }
+    }
+  }
+
+  return window_index;
+}
+
 } // namespace Generators
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
index 39ee998f05..9f9a739e25 100644
--- a/src/models/qwen_vl_vision.h
+++ b/src/models/qwen_vl_vision.h
@@ -1,16 +1,5 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-//
-// Qwen VL Vision pipeline support (initial skeleton).
-// Executes three ONNX models in sequence:
-//   1) Patch Embedding  : pixel_values -> hidden
-//   2) Vision Attention : hidden -> hidden
-//   3) Patch Merger      : hidden -> merged embeddings
-// Performs window expansion/reordering using wnd_idx, then final reverse ordering.
-//
-// This is a minimal starting point to integrate Qwen2.5-VL vision processing
-// into onnxruntime-genai. Further work will: (a) connect to Config parsing,
-// (b) expose via MultiModal pipeline, (c) add EP selection, (d) reuse buffers.
 
 #pragma once
 
@@ -23,10 +12,6 @@
 
 namespace Generators {
 
-// Simple loader for a 1D numpy .npy file containing integer indices.
-// Supports little-endian int32/int64 arrays of shape (N,).
-std::vector<int64_t> Load1DNpyIndices(const std::string& file_path);
-
 // Internal vision pipeline (no external DLL interface required after Python binding removal).
 struct QwenVisionPipeline {
   QwenVisionPipeline(OrtEnv& env,
@@ -34,9 +19,10 @@ struct QwenVisionPipeline {
                      const std::string& vision_attn_model,
                      const std::string& patch_merger_model,
                      int64_t spatial_merge_size,
-                     const std::string& wnd_idx_path,
                      bool use_qnn_attn = false,
-                     const std::string& qnn_backend_path = "QnnHtp.dll");
+                     const std::string& qnn_backend_path = "QnnHtp.dll",
+                     int64_t patch_size = 14,
+                     int64_t window_size = 56);
   bool use_qnn_attn_{};
   std::string qnn_backend_path_{};
 
@@ -45,9 +31,11 @@ struct QwenVisionPipeline {
 
   // Run vision pipeline.
   // pixel_values: float32 tensor with shape [S, C] or [B, C, H, W] depending on export (caller provides shape).
+  // grid_thw: optional grid dimensions [temporal, height, width] for dynamic window indexing
   // The ONNX model is assumed to accept the provided shape directly as 'pixel_values'.
   // Returns final merged embeddings (shape: [num_image_tokens, hidden_size]).
-  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape);
+  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape, 
+                        const std::vector<int64_t>& grid_thw = {});
 
   // Shape info from last Run (seq_len, hidden_size). Returns empty vector if Run not called yet.
   std::vector<int64_t> GetLastOutputShape() const {
@@ -55,21 +43,23 @@ struct QwenVisionPipeline {
     return {last_seq_len_, last_hidden_size_};
   }
 
-  // Accessors
-  const std::vector<int64_t>& GetWndIdx() const { return wnd_idx_; }
-  int64_t GetSpatialMergeSize() const { return spatial_merge_size_; }
-
  private:
   // Internal helpers
   std::unique_ptr<OrtValue> CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const;
+  
+  // Calculate window indices dynamically based on grid dimensions
+  // Returns window_index (reordering indices for windowing)
+  std::vector<int64_t> CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w);
 
   std::unique_ptr<OrtSession> patch_embed_session_;
   std::unique_ptr<OrtSession> vision_attn_session_;
   std::unique_ptr<OrtSession> patch_merger_session_;
 
-  std::vector<int64_t> wnd_idx_;  // window reordering indices
+  std::vector<int64_t> wnd_idx_;  // window reordering indices (computed dynamically)
   std::vector<int64_t> rev_idx_;  // reverse ordering indices (argsort of wnd_idx)
   int64_t spatial_merge_size_{};
+  int64_t patch_size_{14};        // Vision patch size (typically 14)
+  int64_t window_size_{56};       // Window size for attention (typically 56)
   OrtEnv& env_;
   int64_t last_seq_len_{0};
   int64_t last_hidden_size_{0};

From 304767c0b798ee0db425dbb355ad3b7bdf1e8924 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 13:54:18 -0800
Subject: [PATCH 18/25] Address comments

---
 examples/python/model-vision.py               |  37 ++++-
 examples/python/qwen2_5_vl_inference.py       | 150 ------------------
 src/config.cpp                                |   2 -
 src/config.h                                  |   4 +-
 src/models/model.cpp                          |   4 -
 src/models/qwen2_5_vl_image_processor.cpp     |   2 +-
 src/models/qwen_vl_model.cpp                  |  82 ++++++++--
 ...Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj |   2 +-
 8 files changed, 112 insertions(+), 171 deletions(-)
 delete mode 100644 examples/python/qwen2_5_vl_inference.py

diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
index 1fd73a9db2..ee2fdfd2d1 100644
--- a/examples/python/model-vision.py
+++ b/examples/python/model-vision.py
@@ -1,6 +1,9 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License
 
+import winml
+print(winml.register_execution_providers(ort=False, ort_genai=True))
+
 import argparse
 import glob
 import json
@@ -12,6 +15,20 @@
 
 # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)
 
+# Tool-calling system prompt for Qwen/Fara models
+TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
+
+The functions at your disposal are:
+<tools>
+{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\\n* `key`: Press keyboard keys, like \\"Enter\\", \\"Alt\\", \\"Shift\\", \\"Tab\\", \\"Control\\", \\"Backspace\\", \\"Delete\\", \\"Escape\\", etc. 
Keys are pressed down in the order given, then released in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `visit_url`: Visit a specified URL.\\n* `web_search`: Perform a web search with a specified query.\\n* `history_back`: Go back to the previous page in the browser history.\\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. 
Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}}
+</tools>
+
+To make a function call, you should output a json object inside <tool_call></tool_call> XML tags. The json object must contain the function name and its arguments, like this:
+<tool_call>
+{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}
+</tool_call>
+"""
+
 
 def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
     curr_path = Path(current_dir).absolute()
@@ -101,6 +118,10 @@ def run(args: argparse.Namespace):
             # Combine all image tags and text into one user message
             content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text
             messages.append({"role": "user", "content": content})
+        elif model.type in ["qwen2_5_vl", "fara"]:
+            messages.append({"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT})
+            content = "".join([f"<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
+            messages.append({"role": "user", "content": content})
         else:
             # Gemma3-style multimodal: structured content
             content_list = [{"type": "image"} for _ in image_paths]
@@ -116,7 +137,17 @@ def run(args: argparse.Namespace):
 
         print("Generating response...")
         params = og.GeneratorParams(model)
-        params.set_search_options(max_length=7680)
+        if args.max_length:
+            max_length = args.max_length
+        else:
+            try:
+                config_path = Path(args.model_path) / "genai_config.json"
+                with open(config_path, "r") as f:
+                    config = json.load(f)
+                    max_length = config.get("model", {}).get("context_length", 7680)
+            except Exception:
+                max_length = 7680
+        params.set_search_options(max_length=max_length)
 
         generator = og.Generator(model, params)
         generator.set_inputs(inputs)
@@ -162,6 +193,10 @@ def run(args: argparse.Namespace):
     parser.add_argument(
         "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage"
     )
+    parser.add_argument(
+        "--max_length", type=int, required=False, default=None,
+        help="Maximum generation length. Defaults to model's context_length from config."
+    )
     parser.add_argument(
         "--non-interactive",
         action=argparse.BooleanOptionalAction,
diff --git a/examples/python/qwen2_5_vl_inference.py b/examples/python/qwen2_5_vl_inference.py
deleted file mode 100644
index 88e807ce1f..0000000000
--- a/examples/python/qwen2_5_vl_inference.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import winml
-print(winml.register_execution_providers(ort=False, ort_genai=True))
-
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-import onnxruntime_genai as og
-
-TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
-
-The functions at your disposal are:
-<tools>
-{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* `key`: Press keyboard keys, like \"Enter\", \"Alt\", \"Shift\", \"Tab\", \"Control\", \"Backspace\", \"Delete\", \"Escape\", etc. 
Keys are pressed down in the order given, then released in reverse order.\n* `type`: Type a string of text on the keyboard.\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* `left_click`: Click the left mouse button.\n* `scroll`: Performs a scroll of the mouse scroll wheel.\n* `visit_url`: Visit a specified URL.\n* `web_search`: Perform a web search with a specified query.\n* `history_back`: Go back to the previous page in the browser history.\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\n* `wait`: Wait specified seconds for the change to happen.\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. 
Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}}
-</tools>
-
-To make a function call, you should output a json object inside <tool_call></tool_call> XML tags. The json object must contain the function name and its arguments, like this:
-<tool_call>
-{\"name\": <function-name>, \"arguments\": <args-json-object>}
-</tool_call>
-"""
-
-def run_inference(config_dir: Path, image_path: Path, prompt_text: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float,
-                  do_sample: bool = False, min_length: int = 0, repetition_penalty: float = 1.0):
-    if not config_dir.is_dir():
-        raise FileNotFoundError(f"Config directory not found: {config_dir}")
-    if not image_path.is_file():
-        raise FileNotFoundError(f"Image file not found: {image_path}")
-
-    # Load model and create multimodal processor (uses C++ Qwen2_5VLImageProcessor)
-    model = og.Model(str(config_dir))
-    
-    tokenizer = og.Tokenizer(model)
-    
-    processor = model.create_multimodal_processor()
-    tokenizer_stream = processor.create_stream()
-    
-    # Load image using GenAI's image loader (internally uses onnxruntime-extensions)
-    images = og.Images.open(str(image_path))
-    
-    # Build conversation with prompt
-    conversation = [
-        {"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT},
-        {"role": "user", "content": prompt_text},
-    ]
-    
-    # Apply chat template to format the conversation
-    message_json = json.dumps(conversation)
-    prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True)
-    
-    # Process prompt and images together
-    # The C++ processor will automatically:
-    # 1. Preprocess images using processor_config.json pipeline
-    # 2. Insert image tokens in the correct places
-    # 3. Return properly formatted inputs (pixel_values, image_grid_thw, input_ids)
-    inputs = processor(prompt, images=images)
-
-    if "input_ids" in inputs:
-        input_ids_tensor = inputs["input_ids"]
-        input_length = input_ids_tensor.shape()[1]
-    else:
-        input_length = len(tokenizer.encode(prompt))
-
-    # Setup generation parameters
-    try:
-        with open(config_dir / "genai_config.json", "r") as f:
-            config = json.load(f)
-            context_len = config.get("model", {}).get("context_length", 2048)
-            eos_val = config.get("model", {}).get("eos_token_id", [])
-            eos_ids = eos_val if isinstance(eos_val, list) else [eos_val] if eos_val else []
-    except Exception:
-        context_len = 2048
-        eos_ids = []
-    
-    max_length = min(input_length + max_new_tokens, context_len)
-    
-    params = og.GeneratorParams(model)
-    params.set_search_options(max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p,
-                              do_sample=do_sample, min_length=min_length, repetition_penalty=repetition_penalty)
-    
-    # Generate
-    generator = og.Generator(model, params)
-    generator.set_inputs(inputs)
-    output_tokens = []
-    accum_text = ""
-    started_toolcall = False
-    print("\n=== Generating ===")
-    
-    while not generator.is_done():
-        generator.generate_next_token()
-        token = generator.get_next_tokens()[0]
-        output_tokens.append(token)
-        
-        if eos_ids and token in eos_ids and len(output_tokens) >= min_length:
-            break
-        
-        decoded = tokenizer_stream.decode(token)
-        accum_text += decoded
-        
-        if not started_toolcall and "<tool_call>" in accum_text:
-            started_toolcall = True
-            sys.stdout.write(accum_text[accum_text.index("<tool_call>"):])
-            sys.stdout.flush()
-        elif started_toolcall:
-            sys.stdout.write(decoded)
-            sys.stdout.flush()
-            if "</tool_call>" in accum_text:
-                break
-    
-    print("\n=== Generation Complete ===")
-    full_output = processor.decode(output_tokens)
-    
-    if started_toolcall and "</tool_call>" not in accum_text:
-        print("[WARNING] Incomplete <tool_call> structure")
-    
-    print("\nFINAL OUTPUT:", full_output)
-    return full_output
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Qwen2.5-VL inference using onnxruntime-genai")
-    parser.add_argument("--config_dir", "--model_path", type=Path, required=True, help="Directory with genai_config.json")
-    parser.add_argument("--image", type=Path, required=True, help="Path to input image")
-    parser.add_argument("--prompt", type=str, default="Describe the image.", help="User text prompt")
-    parser.add_argument("--max_new_tokens", type=int, default=4096)
-    parser.add_argument("--temperature", type=float, default=0.7)
-    parser.add_argument("--top_k", type=int, default=50)
-    parser.add_argument("--top_p", type=float, default=0.9)
-    parser.add_argument("--do_sample", action="store_true")
-    parser.add_argument("--min_length", type=int, default=0)
-    parser.add_argument("--repetition_penalty", type=float, default=1.0)
-    args = parser.parse_args()
-
-    run_inference(
-        config_dir=args.config_dir,
-        image_path=args.image,
-        prompt_text=args.prompt,
-        max_new_tokens=args.max_new_tokens,
-        temperature=args.temperature,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        do_sample=args.do_sample,
-        min_length=args.min_length,
-        repetition_penalty=args.repetition_penalty,
-    )
-
-if __name__ == "__main__":
-    main()
diff --git a/src/config.cpp b/src/config.cpp
index c3e5e28c69..41b271c36a 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -949,8 +949,6 @@ struct Model_Element : JSON::Element {
       v_.decoder_start_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "sep_token_id") {
       v_.sep_token_id = static_cast<int>(JSON::Get<double>(value));
-    } else if (name == "image_token_id") {
-      v_.image_token_id = static_cast<int>(JSON::Get<double>(value));
     } else {
       throw JSON::unknown_value_error{};
     }
diff --git a/src/config.h b/src/config.h
index 52b7584314..18b8048cbc 100644
--- a/src/config.h
+++ b/src/config.h
@@ -167,7 +167,7 @@ struct Config {
         std::string model_id;               // Identifier used to link outputs to subsequent stages
         std::vector<std::string> inputs;    // Graph input names
         std::vector<std::string> outputs;   // Graph output names
-        bool run_on_cpu{true};              // If true force CPU EP when multiple EPs are configured
+        bool run_on_cpu{false};              // If true force CPU EP when multiple EPs are configured
       };
       std::vector<PipelineModel> pipeline;  // Ordered pipeline models
 
@@ -276,8 +276,6 @@ struct Config {
       std::vector<PipelineModel> pipeline;
 
     } decoder;
-    // Multi-modal token ids
-    int image_token_id{};  // Image pad token id used for embedding injection
 
   } model;
 
diff --git a/src/models/model.cpp b/src/models/model.cpp
index d5aea73568..adc355f970 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -236,10 +236,6 @@ State::~State() {
 std::vector<int32_t> PadInputs(std::span<std::span<const int32_t>> sequences, int32_t pad_token_id) {
   bool pad_right_{true};
 
-  if (pad_token_id == 0 && g_log.enabled) {
-    Log("warning", "pad_token_id is 0, which may be uninitialized. Verify genai_config.json contains 'pad_token_id' field.");
-  }
-
   size_t max_length = 0;
   for (auto& sequence : sequences)
     max_length = std::max(max_length, sequence.size());
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index e1bbe61b7b..1efeefb9dc 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -63,7 +63,7 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
 
     int64_t grid_t = 1;  // Single frame
     int64_t grid_h = static_cast<int64_t>(std::sqrt(num_patches));
-    int64_t grid_w = num_patches / grid_h;
+    int64_t grid_w = grid_h;
     
     std::vector<int64_t> grid_thw_shape = {batch_size, 3};
     auto grid_thw_tensor = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index c9fb1237d7..1b9398c5ce 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -70,12 +70,55 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
       image_grid_thw_val = input.tensor->GetOrtTensor();
     }
   }
-  if (!pixel_values_val) return;
+  if (!pixel_values_val) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision pipeline: pixel_values input not found in extra_inputs");
+    }
+    return;
+  }
 
-  auto pixel_shape = pixel_values_val->GetTensorTypeAndShapeInfo()->GetShape();
+  auto pixel_type_info = pixel_values_val->GetTensorTypeAndShapeInfo();
+  auto pixel_shape = pixel_type_info->GetShape();
+  auto pixel_type = pixel_type_info->GetElementType();
+  
   std::vector<int64_t> pixel_shape_vec(pixel_shape.begin(), pixel_shape.end());
-  const float* pixel_data = pixel_values_val->GetTensorMutableData<float>();
-  if (!pixel_data) return;
+  const float* pixel_data = nullptr;
+  std::vector<float> converted_data;
+  
+  // Convert pixel values to float32 if needed
+  if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+    pixel_data = pixel_values_val->GetTensorData<float>();
+  } else if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
+    // Convert float16 to float32
+    const Ort::Float16_t* fp16_data = pixel_values_val->GetTensorData<Ort::Float16_t>();
+    size_t num_elements = pixel_values_val->GetTensorTypeAndShapeInfo()->GetElementCount();
+    converted_data.resize(num_elements);
+    for (size_t i = 0; i < num_elements; ++i) {
+      converted_data[i] = Float16ToFloat32(fp16_data[i]);
+    }
+    pixel_data = converted_data.data();
+  } else if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
+    // Convert bfloat16 to float32
+    const Ort::BFloat16_t* bf16_data = pixel_values_val->GetTensorData<Ort::BFloat16_t>();
+    size_t num_elements = pixel_values_val->GetTensorTypeAndShapeInfo()->GetElementCount();
+    converted_data.resize(num_elements);
+    for (size_t i = 0; i < num_elements; ++i) {
+      converted_data[i] = BFloat16ToFloat32(bf16_data[i]);
+    }
+    pixel_data = converted_data.data();
+  } else {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision pipeline: unsupported pixel_values type " + std::to_string(pixel_type));
+    }
+    return;
+  }
+  
+  if (!pixel_data) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision pipeline: failed to access pixel_values tensor data");
+    }
+    return;
+  }
 
   // Extract grid_thw if provided
   std::vector<int64_t> grid_thw;
@@ -112,7 +155,12 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   }
 
   auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape();
-  if (out_shape.size() != 2) return;
+  if (out_shape.size() != 2) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision pipeline: expected output shape rank 2, got " + std::to_string(out_shape.size()));
+    }
+    return;
+  }
   
   auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
   std::span<float> data_span(image_features_buffer_.data(), image_features_buffer_.size());
@@ -134,7 +182,12 @@ void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32
 void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
                                                      DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
-  if (it == ortvalue_store_.end() || !it->second) return;
+  if (it == ortvalue_store_.end() || !it->second) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision embedding injection: embeddings output '" + embeddings_output_name + "' not found in ortvalue_store");
+    }
+    return;
+  }
   
   OrtValue* embeddings_ortvalue = it->second.get();
   auto shape = embeddings_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
@@ -146,11 +199,22 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
   const int64_t embedding_dim = shape[2];
   const int64_t num_vision_tokens = vision_shape[0];
   const int64_t vision_dim = vision_shape[1];
-  if (vision_dim != embedding_dim) return;
+  if (vision_dim != embedding_dim) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) + 
+                   ", embedding_dim=" + std::to_string(embedding_dim));
+    }
+    return;
+  }
   
-  const int32_t image_token_id = vl_model_.config_->model.image_token_id;
+  constexpr int32_t image_token_id = 151655;
   
-  if (!input_ids_ || !input_ids_->Get()) return;
+  if (!input_ids_ || !input_ids_->Get()) {
+    if (g_log.enabled && g_log.warning) {
+      Log("warning", "Vision embedding injection: input_ids not available");
+    }
+    return;
+  }
   
   OrtValue* input_ids_ortvalue = input_ids_->Get();
   auto input_ids_shape = input_ids_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
diff --git a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
index 2a7b891028..6cf90b0245 100644
--- a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
+++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj
@@ -75,7 +75,7 @@
         <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
         <Visible>false</Visible>
     </None>
-    <None Condition="Exists('$(NativeBuildOutputDir)\libonnxruntime.dylib')" Include="$(NativeBuildOutputDir)\libonnxruntime.dylib">
+    <None Condition="Exists('$(OrtLibDir)\libonnxruntime.dylib')" Include="$(OrtLibDir)\libonnxruntime.dylib">
         <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
         <Visible>false</Visible>
     </None>

From 9e4147825262538450a105e1463f436a97941b11 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Wed, 3 Dec 2025 17:48:58 -0800
Subject: [PATCH 19/25] Fix lint errors

---
 src/config.cpp                            |  6 +-
 src/config.h                              |  2 +-
 src/generators.cpp                        |  8 +-
 src/models/decoder_only.cpp               |  8 +-
 src/models/kv_cache.cpp                   |  2 +-
 src/models/qwen2_5_vl_image_processor.cpp |  6 +-
 src/models/qwen_vl_model.cpp              | 57 +++++++------
 src/models/qwen_vl_model.h                | 10 +--
 src/models/qwen_vl_vision.cpp             | 99 +++++++++++------------
 src/models/qwen_vl_vision.h               | 12 +--
 10 files changed, 103 insertions(+), 107 deletions(-)

diff --git a/src/config.cpp b/src/config.cpp
index 41b271c36a..652fc053f5 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -320,7 +320,7 @@ struct DecoderInputs_Element : JSON::Element {
     }
   }
 
-private:
+ private:
   Config::Model::Decoder::Inputs& v_;
 };
 
@@ -610,7 +610,7 @@ struct Decoder_Element : JSON::Element {
   DecoderOutputs_Element outputs_{v_.outputs};
   Pipeline_Element pipeline_{v_.pipeline};
   SlidingWindow_Element sliding_window_{v_.sliding_window};
-  std::unique_ptr<PipelineModelObject_Element> pipeline_object_; // object-style pipeline support
+  std::unique_ptr<PipelineModelObject_Element> pipeline_object_;  // object-style pipeline support
 };
 
 struct VisionInputs_Element : JSON::Element {
@@ -774,7 +774,7 @@ struct Vision_Element : JSON::Element {
   VisionInputs_Element inputs_{v_.inputs};
   VisionOutputs_Element outputs_{v_.outputs};
   VisionPipeline_Element pipeline_element_{v_.pipeline};
-  std::unique_ptr<VisionPipelineModelObject_Element> vision_pipeline_object_; // object-style pipeline support
+  std::unique_ptr<VisionPipelineModelObject_Element> vision_pipeline_object_;  // object-style pipeline support
 };
 
 struct SpeechInputs_Element : JSON::Element {
diff --git a/src/config.h b/src/config.h
index 18b8048cbc..242dd27f5e 100644
--- a/src/config.h
+++ b/src/config.h
@@ -174,7 +174,7 @@ struct Config {
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
-        std::string image_grid_thw{Defaults::ImageSizesName};  // Qwen2.5-VL uses image_grid_thw, defaults to image_sizes
+        std::string image_grid_thw{Defaults::ImageSizesName};          // Qwen2.5-VL uses image_grid_thw, defaults to image_sizes
         std::string attention_mask{Defaults::ImageAttentionMaskName};  // image attention mask
       } inputs;
 
diff --git a/src/generators.cpp b/src/generators.cpp
index 4c0b8cf358..d19751217c 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -318,12 +318,12 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
 
   auto input_ids_device = state_->params_->p_device->Allocate<int32_t>(padded_input_ids_size);
   auto cpu_span = input_ids_device.CpuSpan();
-  
+
   // Handle padding based on alignment setting for sliding window models
   if (padded_input_ids_size > input_ids.size()) {
-    const bool left_align = model_->config_->model.decoder.sliding_window.has_value() && 
-                           model_->config_->model.decoder.sliding_window->alignment == "left";
-    
+    const bool left_align = model_->config_->model.decoder.sliding_window.has_value() &&
+                            model_->config_->model.decoder.sliding_window->alignment == "left";
+
     if (left_align) {
       // Left alignment: padding first, then data
       std::fill_n(cpu_span.begin(), padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id);
diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp
index 9892a0698c..b7a571d586 100644
--- a/src/models/decoder_only.cpp
+++ b/src/models/decoder_only.cpp
@@ -86,20 +86,20 @@ void DecoderOnly_State::RewindTo(size_t index) {
 void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> beam_indices, int total_length) {
   input_ids_.Update(next_tokens);
   size_t new_length = static_cast<size_t>(input_ids_.GetShape()[1]);
-  
+
   // Determine effective lengths for position_ids and KV cache based on sliding window config
   int position_length = total_length;
   int kv_cache_length = total_length;
-  
+
   if (model_.config_->model.decoder.sliding_window.has_value() &&
       model_.config_->model.decoder.sliding_window->window_size > 0) {
     const int window_size = model_.config_->model.decoder.sliding_window->window_size;
-    
+
     // Position IDs are clamped when slide_inputs is true
     if (model_.config_->model.decoder.sliding_window->slide_inputs) {
       position_length = std::min(total_length, window_size);
     }
-    
+
     // KV cache is clamped when slide_key_value_cache is true
     if (model_.config_->model.decoder.sliding_window->slide_key_value_cache) {
       kv_cache_length = std::min(total_length, window_size);
diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index 2fb0ab140c..96f132ef51 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -276,7 +276,7 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
     // Per-layer allocation with per-layer capacity constraints
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
       std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
-      
+
       // With buffer sharing: use full capacity (buffers are reused)
       // Without buffer sharing: use actual length for memory efficiency
       if (!past_present_share_buffer_) {
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index 1efeefb9dc..ccbb1d04ff 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -56,7 +56,7 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
   const int64_t* pixel_values_shape{};
   size_t pixel_values_dims{};
   CheckResult(OrtxGetTensorData(pixel_values, &pixel_values_data, &pixel_values_shape, &pixel_values_dims));
-  
+
   if (pixel_values_dims >= 2) {
     int64_t batch_size = pixel_values_shape[0];
     int64_t num_patches = pixel_values_shape[1];
@@ -67,12 +67,12 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
     
     std::vector<int64_t> grid_thw_shape = {batch_size, 3};
     auto grid_thw_tensor = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
-    
+
     auto* dst = grid_thw_tensor->GetTensorMutableData<int64_t>();
     dst[0] = grid_t;
     dst[1] = grid_h;
     dst[2] = grid_w;
-    
+
     named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(std::move(grid_thw_tensor)));
   }
 
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 1b9398c5ce..1c81b6a044 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -8,7 +8,7 @@
 namespace Generators {
 
 Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
-  : DecoderOnlyPipelineModel(std::move(config), ort_env) {  
+    : DecoderOnlyPipelineModel(std::move(config), ort_env) {
   if (config_->model.vision.pipeline.empty()) return;
 
   // Find vision pipeline stage paths
@@ -18,11 +18,11 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
     }
     return "";
   };
-  
+
   auto patch_embed_path = find_stage("patch_embed");
   auto vision_attn_path = find_stage("vision_attn");
   auto patch_merger_path = find_stage("patch_merger");
-  
+
   if (patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty()) return;
 
   // Check if QNN should be used for vision attention
@@ -36,10 +36,10 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
 
   // Default spatial merge size
   constexpr int spatial_merge = 2;
-  
+
   vision_pipeline_ = std::make_unique<QwenVisionPipeline>(
-    ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
-    spatial_merge, use_qnn_attn);
+      ort_env, patch_embed_path, vision_attn_path, patch_merger_path,
+      spatial_merge, use_qnn_attn);
 }
 
 std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths,
@@ -50,19 +50,19 @@ std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t>
 Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
                                                    DeviceSpan<int32_t> sequence_lengths,
                                                    const GeneratorParams& params)
-  : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} {
+    : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} {
 }
 
-void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {  
+void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {
   DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
-  
+
   if (vision_ran_ || !vl_model_.vision_pipeline_) return;
 
   OrtValue* pixel_values_val = nullptr;
   OrtValue* image_grid_thw_val = nullptr;
   const auto& pixel_name = vl_model_.config_->model.vision.inputs.pixel_values;
   const auto& grid_thw_name = vl_model_.config_->model.vision.inputs.image_grid_thw;
-  
+
   for (const auto& input : extra_inputs) {
     if (input.name == pixel_name) {
       pixel_values_val = input.tensor->GetOrtTensor();
@@ -125,12 +125,12 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   if (image_grid_thw_val) {
     auto grid_shape = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetShape();
     auto element_type = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetElementType();
-    
+
     if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
       const int64_t* grid_data = image_grid_thw_val->GetTensorData<int64_t>();
       size_t grid_count = 1;
       for (auto dim : grid_shape) grid_count *= dim;
-      
+
       // Expect [batch, 3] or [3] shape - take last 3 values as [t, h, w]
       if (grid_count >= 3) {
         grid_thw = {grid_data[grid_count - 3], grid_data[grid_count - 2], grid_data[grid_count - 1]};
@@ -139,10 +139,10 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
       const int32_t* grid_data = image_grid_thw_val->GetTensorData<int32_t>();
       size_t grid_count = 1;
       for (auto dim : grid_shape) grid_count *= dim;
-      
+
       if (grid_count >= 3) {
-        grid_thw = {static_cast<int64_t>(grid_data[grid_count - 3]), 
-                    static_cast<int64_t>(grid_data[grid_count - 2]), 
+        grid_thw = {static_cast<int64_t>(grid_data[grid_count - 3]),
+                    static_cast<int64_t>(grid_data[grid_count - 2]),
                     static_cast<int64_t>(grid_data[grid_count - 1])};
       }
     }
@@ -172,7 +172,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
 
 void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
   if (stage_id != 0 || !vision_ran_) return;
-  
+
   const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0];
   if (!embeddings_config.outputs.empty()) {
     InjectVisionEmbeddings(embeddings_config.outputs[0], next_tokens);
@@ -180,7 +180,7 @@ void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32
 }
 
 void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
-                                                     DeviceSpan<int32_t>& input_token_ids) {
+                                                      DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
   if (it == ortvalue_store_.end() || !it->second) {
     if (g_log.enabled && g_log.warning) {
@@ -192,10 +192,10 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
   OrtValue* embeddings_ortvalue = it->second.get();
   auto shape = embeddings_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   float* embeddings_data = embeddings_ortvalue->GetTensorMutableData<float>();
-  
+
   auto vision_shape = image_features_value_->GetTensorTypeAndShapeInfo()->GetShape();
   const float* vision_data = image_features_value_->GetTensorData<float>();
-  
+
   const int64_t embedding_dim = shape[2];
   const int64_t num_vision_tokens = vision_shape[0];
   const int64_t vision_dim = vision_shape[1];
@@ -219,27 +219,26 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
   OrtValue* input_ids_ortvalue = input_ids_->Get();
   auto input_ids_shape = input_ids_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   const int32_t* token_ids_cpu = input_ids_ortvalue->GetTensorData<int32_t>();
-  
+
   int64_t total_tokens = 1;
   for (auto dim : input_ids_shape) total_tokens *= dim;
-  
+
   for (int64_t i = 0; i < total_tokens; ++i) {
     if (token_ids_cpu[i] == image_token_id && image_embed_consumed_ < static_cast<size_t>(num_vision_tokens)) {
-      std::memcpy(embeddings_data + (i * embedding_dim), 
+      std::memcpy(embeddings_data + (i * embedding_dim),
                   vision_data + (image_embed_consumed_ * vision_dim),
                   vision_dim * sizeof(float));
       image_embed_consumed_++;
     }
   }
-  
+
   // Warn if there's a mismatch between image tokens and vision features
   if (image_embed_consumed_ != static_cast<size_t>(num_vision_tokens)) {
-    Log("warning", "Vision embedding mismatch: consumed " + std::to_string(image_embed_consumed_) + 
-                   " of " + std::to_string(num_vision_tokens) + " available vision tokens. " +
-                   "This may indicate a mismatch between the number of image placeholders in the prompt " +
-                   "and the number of images provided.");
+    Log("warning", "Vision embedding mismatch: consumed " + std::to_string(image_embed_consumed_) +
+                       " of " + std::to_string(num_vision_tokens) + " available vision tokens. " +
+                       "This may indicate a mismatch between the number of image placeholders in the prompt " +
+                       "and the number of images provided.");
   }
 }
 
-} // namespace Generators
-
+}  // namespace Generators
diff --git a/src/models/qwen_vl_model.h b/src/models/qwen_vl_model.h
index 27f12649c3..cc8dea7bb7 100644
--- a/src/models/qwen_vl_model.h
+++ b/src/models/qwen_vl_model.h
@@ -31,13 +31,13 @@ struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {
 
  private:
   void InjectVisionEmbeddings(const std::string& embeddings_output_name,
-                             DeviceSpan<int32_t>& input_token_ids);
-  
+                              DeviceSpan<int32_t>& input_token_ids);
+
   const Qwen2_5_VL_PipelineModel& vl_model_;
   bool vision_ran_{false};
   std::unique_ptr<OrtValue> image_features_value_;
-  std::vector<float> image_features_buffer_; // backing storage for OrtValue
-  size_t image_embed_consumed_{0}; // Track how many vision embeddings we've injected
+  std::vector<float> image_features_buffer_;  // backing storage for OrtValue
+  size_t image_embed_consumed_{0};            // Track how many vision embeddings we've injected
 };
 
-} // namespace Generators
+}  // namespace Generators
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 730dd7c393..76e100d034 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -22,14 +22,13 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
                                        const std::string& qnn_backend_path,
                                        int64_t patch_size,
                                        int64_t window_size)
-  // Match declaration order to avoid MSVC C5038 warning-as-error
-  : use_qnn_attn_(use_qnn_attn),
-    qnn_backend_path_(qnn_backend_path),
-    spatial_merge_size_(spatial_merge_size),
-    patch_size_(patch_size),
-    window_size_(window_size),
-    env_(env) {
-
+    // Match declaration order to avoid MSVC C5038 warning-as-error
+    : use_qnn_attn_(use_qnn_attn),
+      qnn_backend_path_(qnn_backend_path),
+      spatial_merge_size_(spatial_merge_size),
+      patch_size_(patch_size),
+      window_size_(window_size),
+      env_(env) {
   // Convert std::string model paths to ORTCHAR_T for cross-platform (char or wchar_t)
   auto toOrtPath = [](const std::string& s) -> std::basic_string<ORTCHAR_T> {
     return std::basic_string<ORTCHAR_T>(s.begin(), s.end());
@@ -49,14 +48,13 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
     so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
     // QNN provider options
     const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
-    const char* values[] = { qnn_backend_path_.c_str(), "burst", "3", "60" };
+    const char* values[] = {qnn_backend_path_.c_str(), "burst", "3", "60"};
 
     auto providers = Ort::GetAvailableProviders();
     bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
     if (has_qnn) {
       so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
-    }
-    else {
+    } else {
       // Use registered QNN EP
       size_t num_devices = 0;
       const OrtEpDevice* const* device_ptrs = nullptr;
@@ -73,11 +71,10 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
         throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
       } else {
         Ort::api->SessionOptionsAppendExecutionProvider_V2(
-          so.get(),
-          &GetOrtEnv(),
-          ep_devices_ptrs.data(), ep_devices_ptrs.size(),
-          keys, values, 4
-        );
+            so.get(),
+            &GetOrtEnv(),
+            ep_devices_ptrs.data(), ep_devices_ptrs.size(),
+            keys, values, 4);
       }
     }
 
@@ -96,63 +93,63 @@ std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, si
 
 // Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
 
-std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape, 
+std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape,
                                            const std::vector<int64_t>& grid_thw) {
   if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
     throw std::runtime_error("Vision pipeline sessions not initialized");
   }
-  
+
   // Calculate window indices dynamically if grid_thw provided
   if (!grid_thw.empty() && grid_thw.size() == 3) {
     wnd_idx_ = CalculateWindowIndex(grid_thw[0], grid_thw[1], grid_thw[2]);
-    
+
     // Build reverse index (argsort)
     rev_idx_.resize(wnd_idx_.size());
     std::vector<std::pair<int64_t, size_t>> pairs;
     pairs.reserve(wnd_idx_.size());
     for (size_t i = 0; i < wnd_idx_.size(); ++i) pairs.emplace_back(wnd_idx_[i], i);
-    std::sort(pairs.begin(), pairs.end(), [](auto& a, auto& b){ return a.first < b.first; });
+    std::sort(pairs.begin(), pairs.end(), [](auto& a, auto& b) { return a.first < b.first; });
     for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
   }
-  
+
   size_t pixel_count = 1;
   for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
   auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
-  
+
   const char* pe_input_names[] = {"pixel_values"};
-  OrtValue* pe_inputs[] = { pixel_tensor.get() };
+  OrtValue* pe_inputs[] = {pixel_tensor.get()};
 
   const int64_t num_patches = pixel_shape[1];
   const int64_t hidden_dim = 1280;
   std::vector<int64_t> pe_out_shape{num_patches, hidden_dim};
   pe_out_buf_.resize(num_patches * hidden_dim);
   auto pe_out_tensor = CreateTensor(pe_out_buf_.data(), pe_out_buf_.size(), pe_out_shape);
-  
+
   auto pe_out_name = patch_embed_session_->GetOutputName(0);
-  const char* pe_output_names[] = { pe_out_name.c_str() };
-  OrtValue* pe_outputs[] = { pe_out_tensor.get() };
+  const char* pe_output_names[] = {pe_out_name.c_str()};
+  OrtValue* pe_outputs[] = {pe_out_tensor.get()};
 
   patch_embed_session_->Run(nullptr, pe_input_names, pe_inputs, 1, pe_output_names, pe_outputs, 1);
 
   const int64_t seq_len = num_patches;
   const int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
   const int64_t num_windows = seq_len / window_area;
-  
+
   // Apply window reordering if indices available
   reordered_buf_.resize(seq_len * hidden_dim);
-  
+
   if (!wnd_idx_.empty()) {
     // Validate window configuration
     if (seq_len % window_area != 0 || static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
       throw std::runtime_error("Invalid window configuration for vision pipeline");
     }
-    
+
     // Apply window reordering
     for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
       int64_t src_w = wnd_idx_[dst_w];
       if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
       size_t offset_size = window_area * hidden_dim;
-      std::memcpy(reordered_buf_.data() + dst_w * offset_size, 
+      std::memcpy(reordered_buf_.data() + dst_w * offset_size,
                   pe_out_buf_.data() + src_w * offset_size,
                   offset_size * sizeof(float));
     }
@@ -164,33 +161,33 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   std::vector<int64_t> attn_shape{seq_len, hidden_dim};
   auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape);
   const char* attn_input_names[] = {"hidden"};
-  OrtValue* attn_inputs[] = { attn_in_tensor.get() };
+  OrtValue* attn_inputs[] = {attn_in_tensor.get()};
 
   attn_out_buf_.resize(seq_len * hidden_dim);
   auto attn_out_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
   auto attn_out_name = vision_attn_session_->GetOutputName(0);
-  const char* attn_output_names[] = { attn_out_name.c_str() };
-  OrtValue* attn_outputs[] = { attn_out_tensor.get() };
-  
+  const char* attn_output_names[] = {attn_out_name.c_str()};
+  OrtValue* attn_outputs[] = {attn_out_tensor.get()};
+
   vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
 
   auto merger_in_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
   const char* merger_input_names[] = {"hidden"};
-  OrtValue* merger_inputs[] = { merger_in_tensor.get() };
-  
+  OrtValue* merger_inputs[] = {merger_in_tensor.get()};
+
   const int64_t merged_seq_len = seq_len / window_area;  // One token per window after merging
   const int64_t merged_hidden = 3584;
   std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
   merger_out_buf_.resize(merged_seq_len * merged_hidden);
   auto merger_out_tensor = CreateTensor(merger_out_buf_.data(), merger_out_buf_.size(), merger_shape);
   auto merger_out_name = patch_merger_session_->GetOutputName(0);
-  const char* merger_output_names[] = { merger_out_name.c_str() };
-  OrtValue* merger_outputs[] = { merger_out_tensor.get() };
-  
+  const char* merger_output_names[] = {merger_out_name.c_str()};
+  OrtValue* merger_outputs[] = {merger_out_tensor.get()};
+
   patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1);
 
   final_embeddings_buf_.resize(merger_out_buf_.size());
-  
+
   if (!rev_idx_.empty()) {
     // Apply reverse reordering
     if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
@@ -203,7 +200,7 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
     }
   } else {
     // No reverse reordering - use sequential order
-    std::memcpy(final_embeddings_buf_.data(), merger_out_buf_.data(), 
+    std::memcpy(final_embeddings_buf_.data(), merger_out_buf_.data(),
                 merger_out_buf_.size() * sizeof(float));
   }
 
@@ -219,23 +216,23 @@ std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, in
   // Calculate LLM grid dimensions after spatial merging
   int64_t llm_grid_h = grid_h / spatial_merge_size_;
   int64_t llm_grid_w = grid_w / spatial_merge_size_;
-  
+
   // Calculate window size at the merged resolution
   int64_t vit_merger_window_size = window_size_ / spatial_merge_size_ / patch_size_;
-  
+
   // Calculate padding needed to fit into windows
   int64_t pad_h = (vit_merger_window_size - (llm_grid_h % vit_merger_window_size)) % vit_merger_window_size;
   int64_t pad_w = (vit_merger_window_size - (llm_grid_w % vit_merger_window_size)) % vit_merger_window_size;
-  
+
   int64_t num_windows_h = (llm_grid_h + pad_h) / vit_merger_window_size;
   int64_t num_windows_w = (llm_grid_w + pad_w) / vit_merger_window_size;
-  
+
   std::vector<int64_t> window_index;
   window_index.reserve(grid_t * llm_grid_h * llm_grid_w);
-  
+
   // Create initial index grid
   std::vector<int64_t> index(grid_t * (llm_grid_h + pad_h) * (llm_grid_w + pad_w), -100);
-  
+
   // Fill non-padded positions with sequential indices
   for (int64_t t = 0; t < grid_t; ++t) {
     for (int64_t h = 0; h < llm_grid_h; ++h) {
@@ -246,7 +243,7 @@ std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, in
       }
     }
   }
-  
+
   // Reshape into windows: (grid_t, num_windows_h, window_size, num_windows_w, window_size)
   // Then permute to (grid_t, num_windows_h, num_windows_w, window_size, window_size)
   // This groups patches by window instead of by spatial position
@@ -258,7 +255,7 @@ std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, in
             int64_t h = wh * vit_merger_window_size + ph;
             int64_t w = ww * vit_merger_window_size + pw;
             int64_t padded_idx = t * (llm_grid_h + pad_h) * (llm_grid_w + pad_w) + h * (llm_grid_w + pad_w) + w;
-            
+
             // Only add non-padded indices
             if (index[padded_idx] != -100) {
               window_index.push_back(index[padded_idx]);
@@ -268,8 +265,8 @@ std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, in
       }
     }
   }
-  
+
   return window_index;
 }
 
-} // namespace Generators
+}  // namespace Generators
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
index 9f9a739e25..5db7c17e95 100644
--- a/src/models/qwen_vl_vision.h
+++ b/src/models/qwen_vl_vision.h
@@ -34,8 +34,8 @@ struct QwenVisionPipeline {
   // grid_thw: optional grid dimensions [temporal, height, width] for dynamic window indexing
   // The ONNX model is assumed to accept the provided shape directly as 'pixel_values'.
   // Returns final merged embeddings (shape: [num_image_tokens, hidden_size]).
-  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape, 
-                        const std::vector<int64_t>& grid_thw = {});
+  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape,
+                         const std::vector<int64_t>& grid_thw = {});
 
   // Shape info from last Run (seq_len, hidden_size). Returns empty vector if Run not called yet.
   std::vector<int64_t> GetLastOutputShape() const {
@@ -46,7 +46,7 @@ struct QwenVisionPipeline {
  private:
   // Internal helpers
   std::unique_ptr<OrtValue> CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const;
-  
+
   // Calculate window indices dynamically based on grid dimensions
   // Returns window_index (reordering indices for windowing)
   std::vector<int64_t> CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w);
@@ -58,8 +58,8 @@ struct QwenVisionPipeline {
   std::vector<int64_t> wnd_idx_;  // window reordering indices (computed dynamically)
   std::vector<int64_t> rev_idx_;  // reverse ordering indices (argsort of wnd_idx)
   int64_t spatial_merge_size_{};
-  int64_t patch_size_{14};        // Vision patch size (typically 14)
-  int64_t window_size_{56};       // Window size for attention (typically 56)
+  int64_t patch_size_{14};   // Vision patch size (typically 14)
+  int64_t window_size_{56};  // Window size for attention (typically 56)
   OrtEnv& env_;
   int64_t last_seq_len_{0};
   int64_t last_hidden_size_{0};
@@ -72,4 +72,4 @@ struct QwenVisionPipeline {
   mutable std::vector<float> final_embeddings_buf_;
 };
 
-} // namespace Generators
+}  // namespace Generators

From 7b569efb6afed4f7c211557102400b5d91eb7a32 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 17:07:26 -0800
Subject: [PATCH 20/25] Comments addressed

---
 examples/python/model-vision.py           | 20 +++----
 src/config.cpp                            |  4 +-
 src/models/qwen2_5_vl_image_processor.cpp | 65 ++++++++++++++---------
 src/models/qwen_vl_model.cpp              | 46 +++++-----------
 src/models/qwen_vl_vision.cpp             | 56 +++++++++++++------
 5 files changed, 103 insertions(+), 88 deletions(-)

diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
index ee2fdfd2d1..76e5cfd406 100644
--- a/examples/python/model-vision.py
+++ b/examples/python/model-vision.py
@@ -1,8 +1,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License
 
-import winml
-print(winml.register_execution_providers(ort=False, ort_genai=True))
+try:
+    import winml
+    print(winml.register_execution_providers(ort=False, ort_genai=True))
+except ImportError:
+    print("WinML not available, using default execution providers")
+except Exception as e:
+    print(f"Failed to register WinML execution providers: {e}")
 
 import argparse
 import glob
@@ -137,16 +142,7 @@ def run(args: argparse.Namespace):
 
         print("Generating response...")
         params = og.GeneratorParams(model)
-        if args.max_length:
-            max_length = args.max_length
-        else:
-            try:
-                config_path = Path(args.model_path) / "genai_config.json"
-                with open(config_path, "r") as f:
-                    config = json.load(f)
-                    max_length = config.get("model", {}).get("context_length", 7680)
-            except Exception:
-                max_length = 7680
+        max_length = args.max_length if args.max_length else 7680
         params.set_search_options(max_length=max_length)
 
         generator = og.Generator(model, params)
diff --git a/src/config.cpp b/src/config.cpp
index 652fc053f5..cd5fdf5cf2 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -279,10 +279,10 @@ struct DecoderInputs_Element : JSON::Element {
       v_.input_ids = JSON::Get<std::string_view>(value);
     } else if (name == "inputs_embeds") {
       v_.embeddings = JSON::Get<std::string_view>(value);
-    } else if (name == "position_ids") {
-      v_.position_ids = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
+    } else if (name == "position_ids") {
+      v_.position_ids = JSON::Get<std::string_view>(value);
     } else if (name == "past_key_names") {
       v_.past_key_names = JSON::Get<std::string_view>(value);
     } else if (name == "past_value_names") {
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index ccbb1d04ff..ef9bdadbef 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -50,30 +50,47 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
   OrtxTensor* pixel_values = nullptr;
   CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
 
-  named_tensors->emplace(pixel_values_name_, std::make_shared<Tensor>(ProcessTensor<float>(pixel_values, allocator)));
-
-  const void* pixel_values_data{};
-  const int64_t* pixel_values_shape{};
-  size_t pixel_values_dims{};
-  CheckResult(OrtxGetTensorData(pixel_values, &pixel_values_data, &pixel_values_shape, &pixel_values_dims));
-
-  if (pixel_values_dims >= 2) {
-    int64_t batch_size = pixel_values_shape[0];
-    int64_t num_patches = pixel_values_shape[1];
-
-    int64_t grid_t = 1;  // Single frame
-    int64_t grid_h = static_cast<int64_t>(std::sqrt(num_patches));
-    int64_t grid_w = grid_h;
-    
-    std::vector<int64_t> grid_thw_shape = {batch_size, 3};
-    auto grid_thw_tensor = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
-
-    auto* dst = grid_thw_tensor->GetTensorMutableData<int64_t>();
-    dst[0] = grid_t;
-    dst[1] = grid_h;
-    dst[2] = grid_w;
-
-    named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(std::move(grid_thw_tensor)));
+  auto pixel_values_ortvalue = ProcessTensor<float>(pixel_values, allocator);
+  auto pixel_values_type_shape_info = pixel_values_ortvalue->GetTensorTypeAndShapeInfo();
+  auto pixel_values_shape = pixel_values_type_shape_info->GetShape();
+  named_tensors->emplace(pixel_values_name_, std::make_shared<Tensor>(std::move(pixel_values_ortvalue)));
+
+  // Check if processor returns grid_thw as second output
+  OrtxTensor* grid_thw_tensor = nullptr;
+  auto grid_thw_result = OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor);
+  
+  if (grid_thw_result == extError_t::kOrtxOK && grid_thw_tensor != nullptr) {
+    named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
+  } else {
+    // Fallback: calculate grid_thw from pixel_values shape
+    if (pixel_values_shape.size() >= 2) {
+      int64_t batch_size = pixel_values_shape[0];
+      int64_t num_patches = pixel_values_shape[1];
+
+      int64_t grid_t = 1;  // Single frame for static images
+      int64_t grid_h, grid_w;
+
+      grid_h = static_cast<int64_t>(std::sqrt(static_cast<double>(num_patches)));
+      while (grid_h > 0 && num_patches % grid_h != 0) {
+        grid_h--;
+      }
+      
+      if (grid_h == 0) {
+        throw std::runtime_error("Failed to factorize num_patches for grid calculation");
+      }
+      
+      grid_w = num_patches / grid_h;
+      
+      std::vector<int64_t> grid_thw_shape = {batch_size, 3};
+      auto grid_thw_output = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
+
+      auto* dst = grid_thw_output->GetTensorMutableData<int64_t>();
+      dst[0] = grid_t;
+      dst[1] = grid_h;
+      dst[2] = grid_w;
+
+      named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(std::move(grid_thw_output)));
+    }
   }
 
   return named_tensors;
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 1c81b6a044..4d9dd533e6 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -4,6 +4,7 @@
 #include "../logging.h"
 #include <iostream>
 #include <cstring>
+#include <algorithm>
 
 namespace Generators {
 
@@ -26,13 +27,11 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
   if (patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty()) return;
 
   // Check if QNN should be used for vision attention
-  bool use_qnn_attn = false;
-  for (const auto& stage : config_->model.vision.pipeline) {
-    if (stage.model_id == "vision_attn" && !stage.run_on_cpu) {
-      use_qnn_attn = true;
-      break;
-    }
-  }
+  bool use_qnn_attn = std::any_of(config_->model.vision.pipeline.begin(),
+                                   config_->model.vision.pipeline.end(),
+                                   [](const auto& stage) {
+                                     return stage.model_id == "vision_attn" && !stage.run_on_cpu;
+                                   });
 
   // Default spatial merge size
   constexpr int spatial_merge = 2;
@@ -83,34 +82,15 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   
   std::vector<int64_t> pixel_shape_vec(pixel_shape.begin(), pixel_shape.end());
   const float* pixel_data = nullptr;
-  std::vector<float> converted_data;
+  // Convert pixel values to float32 if needed (handles float16, bfloat16, float32)
+  std::unique_ptr<OrtValue> pixel_values_fp32;
   
-  // Convert pixel values to float32 if needed
   if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
     pixel_data = pixel_values_val->GetTensorData<float>();
-  } else if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
-    // Convert float16 to float32
-    const Ort::Float16_t* fp16_data = pixel_values_val->GetTensorData<Ort::Float16_t>();
-    size_t num_elements = pixel_values_val->GetTensorTypeAndShapeInfo()->GetElementCount();
-    converted_data.resize(num_elements);
-    for (size_t i = 0; i < num_elements; ++i) {
-      converted_data[i] = Float16ToFloat32(fp16_data[i]);
-    }
-    pixel_data = converted_data.data();
-  } else if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
-    // Convert bfloat16 to float32
-    const Ort::BFloat16_t* bf16_data = pixel_values_val->GetTensorData<Ort::BFloat16_t>();
-    size_t num_elements = pixel_values_val->GetTensorTypeAndShapeInfo()->GetElementCount();
-    converted_data.resize(num_elements);
-    for (size_t i = 0; i < num_elements; ++i) {
-      converted_data[i] = BFloat16ToFloat32(bf16_data[i]);
-    }
-    pixel_data = converted_data.data();
   } else {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision pipeline: unsupported pixel_values type " + std::to_string(pixel_type));
-    }
-    return;
+    // Use existing Cast() function to convert to float32
+    Cast(*pixel_values_val, pixel_values_fp32, *vl_model_.p_device_inputs_, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+    pixel_data = pixel_values_fp32->GetTensorData<float>();
   }
   
   if (!pixel_data) {
@@ -150,8 +130,8 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
 
   try {
     image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec, grid_thw);
-  } catch (const std::exception&) {
-    return;  // Silent failure - pipeline already logs errors
+  } catch (const std::exception& e) {
+    throw std::runtime_error(std::string("Vision pipeline failed: ") + e.what());
   }
 
   auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape();
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 76e100d034..9d019c3cf9 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -55,25 +55,24 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
     if (has_qnn) {
       so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
     } else {
-      // Use registered QNN EP
-      size_t num_devices = 0;
-      const OrtEpDevice* const* device_ptrs = nullptr;
-      Ort::GetEpDevices(&GetOrtEnv(), &device_ptrs, &num_devices);
-      std::vector<const OrtEpDevice*> ep_devices_ptrs;
-      ep_devices_ptrs.reserve(num_devices);
-      for (size_t i = 0; i < num_devices; ++i) {
-        if (Ort::api->EpDevice_EpName(device_ptrs[i]) == std::string("QNNExecutionProvider")) {
-          ep_devices_ptrs.push_back(device_ptrs[i]);
+      // Use registered QNN EP - use GenAI wrapper APIs
+      auto ep_devices = GetOrtEnv().GetEpDevices();
+      std::vector<const OrtEpDevice*> qnn_devices;
+      qnn_devices.reserve(ep_devices.size());
+      
+      for (const auto* device : ep_devices) {
+        if (device->Name() == "QNNExecutionProvider") {
+          qnn_devices.push_back(device);
         }
       }
 
-      if (ep_devices_ptrs.empty()) {
+      if (qnn_devices.empty()) {
         throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
       } else {
         Ort::api->SessionOptionsAppendExecutionProvider_V2(
             so.get(),
             &GetOrtEnv(),
-            ep_devices_ptrs.data(), ep_devices_ptrs.size(),
+            qnn_devices.data(), qnn_devices.size(),
             keys, values, 4);
       }
     }
@@ -116,7 +115,8 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
   auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
 
-  const char* pe_input_names[] = {"pixel_values"};
+  auto pe_in_name = patch_embed_session_->GetInputName(0);
+  const char* pe_input_names[] = {pe_in_name.c_str()};
   OrtValue* pe_inputs[] = {pixel_tensor.get()};
 
   const int64_t num_patches = pixel_shape[1];
@@ -158,12 +158,33 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
     std::memcpy(reordered_buf_.data(), pe_out_buf_.data(), seq_len * hidden_dim * sizeof(float));
   }
 
-  std::vector<int64_t> attn_shape{seq_len, hidden_dim};
+  // Check if vision_attn session expects a different sequence length (fixed shape model)
+  auto attn_input_info = vision_attn_session_->GetInputTypeInfo(0);
+  auto& attn_input_tensor_info = attn_input_info->GetTensorTypeAndShapeInfo();
+  auto attn_expected_shape = attn_input_tensor_info.GetShape();
+  
+  int64_t expected_seq_len = (attn_expected_shape.size() >= 2 && attn_expected_shape[0] > 0) ? attn_expected_shape[0] : seq_len;
+  int64_t actual_seq_len = seq_len;  // Mutable copy for padding adjustments
+  
+  if (expected_seq_len != seq_len) {
+    // Model expects fixed sequence length - need to pad or error
+    if (expected_seq_len > seq_len) {
+      // Pad the reordered buffer with zeros to match model's expected size
+      reordered_buf_.resize(expected_seq_len * hidden_dim, 0.0f);
+      actual_seq_len = expected_seq_len;  // Update actual_seq_len for subsequent operations
+    } else {
+      // Model expects smaller input - this is an error (image too large for fixed-shape model)
+      throw std::runtime_error("Vision attention model input size mismatch");
+    }
+  }
+  
+  std::vector<int64_t> attn_shape{actual_seq_len, hidden_dim};
   auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape);
-  const char* attn_input_names[] = {"hidden"};
+  auto attn_in_name = vision_attn_session_->GetInputName(0);
+  const char* attn_input_names[] = {attn_in_name.c_str()};
   OrtValue* attn_inputs[] = {attn_in_tensor.get()};
 
-  attn_out_buf_.resize(seq_len * hidden_dim);
+  attn_out_buf_.resize(actual_seq_len * hidden_dim);
   auto attn_out_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
   auto attn_out_name = vision_attn_session_->GetOutputName(0);
   const char* attn_output_names[] = {attn_out_name.c_str()};
@@ -172,10 +193,11 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
 
   auto merger_in_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
-  const char* merger_input_names[] = {"hidden"};
+  auto merger_in_name = patch_merger_session_->GetInputName(0);
+  const char* merger_input_names[] = {merger_in_name.c_str()};
   OrtValue* merger_inputs[] = {merger_in_tensor.get()};
 
-  const int64_t merged_seq_len = seq_len / window_area;  // One token per window after merging
+  const int64_t merged_seq_len = actual_seq_len / window_area;  // One token per window after merging
   const int64_t merged_hidden = 3584;
   std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
   merger_out_buf_.resize(merged_seq_len * merged_hidden);

From eb0f77ed960a36b7b10f7f3810eb301888162168 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 17:10:54 -0800
Subject: [PATCH 21/25] Fix linter

---
 examples/python/model-vision.py           | 17 ++++++++------
 src/config.h                              |  8 +++----
 src/models/qwen2_5_vl_image_processor.cpp |  8 +++----
 src/models/qwen_vl_model.cpp              | 28 +++++++++++------------
 src/models/qwen_vl_vision.cpp             |  8 +++----
 5 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
index 76e5cfd406..04a87c2adf 100644
--- a/examples/python/model-vision.py
+++ b/examples/python/model-vision.py
@@ -3,6 +3,7 @@
 
 try:
     import winml
+
     print(winml.register_execution_providers(ort=False, ort_genai=True))
 except ImportError:
     print("WinML not available, using default execution providers")
@@ -13,6 +14,7 @@
 import glob
 import json
 import os
+import readline
 import time
 from pathlib import Path
 
@@ -48,7 +50,7 @@ def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
 
 
 def _complete(text, state):
-    return (glob.glob(text + "*") + [None])[state]
+    return [*glob.glob(text + "*"), None][state]
 
 
 def run(args: argparse.Namespace):
@@ -71,8 +73,6 @@ def run(args: argparse.Namespace):
     while True:
         if interactive:
             try:
-                import readline
-
                 readline.set_completer_delims(" \t\n;")
                 readline.parse_and_bind("tab: complete")
                 readline.set_completer(_complete)
@@ -102,7 +102,7 @@ def run(args: argparse.Namespace):
         if len(image_paths) == 0:
             print("No image provided")
         else:
-            for i, image_path in enumerate(image_paths):
+            for _, image_path in enumerate(image_paths):
                 if not os.path.exists(image_path):
                     raise FileNotFoundError(f"Image file not found: {image_path}")
                 print(f"Using image: {image_path}")
@@ -125,7 +125,7 @@ def run(args: argparse.Namespace):
             messages.append({"role": "user", "content": content})
         elif model.type in ["qwen2_5_vl", "fara"]:
             messages.append({"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT})
-            content = "".join([f"<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
+            content = "".join(["<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
             messages.append({"role": "user", "content": content})
         else:
             # Gemma3-style multimodal: structured content
@@ -190,8 +190,11 @@ def run(args: argparse.Namespace):
         "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage"
     )
     parser.add_argument(
-        "--max_length", type=int, required=False, default=None,
-        help="Maximum generation length. Defaults to model's context_length from config."
+        "--max_length",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum generation length. Defaults to 7680 when not specified.",
     )
     parser.add_argument(
         "--non-interactive",
diff --git a/src/config.h b/src/config.h
index 242dd27f5e..ccac038c12 100644
--- a/src/config.h
+++ b/src/config.h
@@ -164,10 +164,10 @@ struct Config {
         std::string filename;
         std::optional<SessionOptions> session_options;
         std::optional<RunOptions> run_options;
-        std::string model_id;               // Identifier used to link outputs to subsequent stages
-        std::vector<std::string> inputs;    // Graph input names
-        std::vector<std::string> outputs;   // Graph output names
-        bool run_on_cpu{false};              // If true force CPU EP when multiple EPs are configured
+        std::string model_id;              // Identifier used to link outputs to subsequent stages
+        std::vector<std::string> inputs;   // Graph input names
+        std::vector<std::string> outputs;  // Graph output names
+        bool run_on_cpu{false};            // If true force CPU EP when multiple EPs are configured
       };
       std::vector<PipelineModel> pipeline;  // Ordered pipeline models
 
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index ef9bdadbef..6752ea4745 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -58,7 +58,7 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
   // Check if processor returns grid_thw as second output
   OrtxTensor* grid_thw_tensor = nullptr;
   auto grid_thw_result = OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor);
-  
+
   if (grid_thw_result == extError_t::kOrtxOK && grid_thw_tensor != nullptr) {
     named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
   } else {
@@ -74,13 +74,13 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
       while (grid_h > 0 && num_patches % grid_h != 0) {
         grid_h--;
       }
-      
+
       if (grid_h == 0) {
         throw std::runtime_error("Failed to factorize num_patches for grid calculation");
       }
-      
+
       grid_w = num_patches / grid_h;
-      
+
       std::vector<int64_t> grid_thw_shape = {batch_size, 3};
       auto grid_thw_output = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
 
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 4d9dd533e6..1b91c309c9 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -28,10 +28,10 @@ Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> confi
 
   // Check if QNN should be used for vision attention
   bool use_qnn_attn = std::any_of(config_->model.vision.pipeline.begin(),
-                                   config_->model.vision.pipeline.end(),
-                                   [](const auto& stage) {
-                                     return stage.model_id == "vision_attn" && !stage.run_on_cpu;
-                                   });
+                                  config_->model.vision.pipeline.end(),
+                                  [](const auto& stage) {
+                                    return stage.model_id == "vision_attn" && !stage.run_on_cpu;
+                                  });
 
   // Default spatial merge size
   constexpr int spatial_merge = 2;
@@ -79,12 +79,12 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   auto pixel_type_info = pixel_values_val->GetTensorTypeAndShapeInfo();
   auto pixel_shape = pixel_type_info->GetShape();
   auto pixel_type = pixel_type_info->GetElementType();
-  
+
   std::vector<int64_t> pixel_shape_vec(pixel_shape.begin(), pixel_shape.end());
   const float* pixel_data = nullptr;
   // Convert pixel values to float32 if needed (handles float16, bfloat16, float32)
   std::unique_ptr<OrtValue> pixel_values_fp32;
-  
+
   if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
     pixel_data = pixel_values_val->GetTensorData<float>();
   } else {
@@ -92,7 +92,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
     Cast(*pixel_values_val, pixel_values_fp32, *vl_model_.p_device_inputs_, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
     pixel_data = pixel_values_fp32->GetTensorData<float>();
   }
-  
+
   if (!pixel_data) {
     if (g_log.enabled && g_log.warning) {
       Log("warning", "Vision pipeline: failed to access pixel_values tensor data");
@@ -141,7 +141,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
     }
     return;
   }
-  
+
   auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
   std::span<float> data_span(image_features_buffer_.data(), image_features_buffer_.size());
   std::span<const int64_t> shape_span(out_shape.data(), out_shape.size());
@@ -168,7 +168,7 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
     }
     return;
   }
-  
+
   OrtValue* embeddings_ortvalue = it->second.get();
   auto shape = embeddings_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   float* embeddings_data = embeddings_ortvalue->GetTensorMutableData<float>();
@@ -181,21 +181,21 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
   const int64_t vision_dim = vision_shape[1];
   if (vision_dim != embedding_dim) {
     if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) + 
-                   ", embedding_dim=" + std::to_string(embedding_dim));
+      Log("warning", "Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) +
+                         ", embedding_dim=" + std::to_string(embedding_dim));
     }
     return;
   }
-  
+
   constexpr int32_t image_token_id = 151655;
-  
+
   if (!input_ids_ || !input_ids_->Get()) {
     if (g_log.enabled && g_log.warning) {
       Log("warning", "Vision embedding injection: input_ids not available");
     }
     return;
   }
-  
+
   OrtValue* input_ids_ortvalue = input_ids_->Get();
   auto input_ids_shape = input_ids_ortvalue->GetTensorTypeAndShapeInfo()->GetShape();
   const int32_t* token_ids_cpu = input_ids_ortvalue->GetTensorData<int32_t>();
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 9d019c3cf9..0c9d664678 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -59,7 +59,7 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
       auto ep_devices = GetOrtEnv().GetEpDevices();
       std::vector<const OrtEpDevice*> qnn_devices;
       qnn_devices.reserve(ep_devices.size());
-      
+
       for (const auto* device : ep_devices) {
         if (device->Name() == "QNNExecutionProvider") {
           qnn_devices.push_back(device);
@@ -162,10 +162,10 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
   auto attn_input_info = vision_attn_session_->GetInputTypeInfo(0);
   auto& attn_input_tensor_info = attn_input_info->GetTensorTypeAndShapeInfo();
   auto attn_expected_shape = attn_input_tensor_info.GetShape();
-  
+
   int64_t expected_seq_len = (attn_expected_shape.size() >= 2 && attn_expected_shape[0] > 0) ? attn_expected_shape[0] : seq_len;
   int64_t actual_seq_len = seq_len;  // Mutable copy for padding adjustments
-  
+
   if (expected_seq_len != seq_len) {
     // Model expects fixed sequence length - need to pad or error
     if (expected_seq_len > seq_len) {
@@ -177,7 +177,7 @@ std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::v
       throw std::runtime_error("Vision attention model input size mismatch");
     }
   }
-  
+
   std::vector<int64_t> attn_shape{actual_seq_len, hidden_dim};
   auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape);
   auto attn_in_name = vision_attn_session_->GetInputName(0);

From 4e6babc4ac319b7c9d15c4f852d4f04655570e06 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 17:20:42 -0800
Subject: [PATCH 22/25] Fix kv cache condition

---
 src/models/kv_cache.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index 96f132ef51..bd85d800a0 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -202,10 +202,7 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state)
       // Uniform sliding window allocation (backward compatibility)
       shape_[2] = std::min(max_length, sliding_window_size);
     }
-  } else {
-    // Default capacity: allocate full max_length upfront
-    // - With past_present_share_buffer: buffers are reused, so full capacity needed
-    // - Without past_present_share_buffer: buffers are reallocated each step but still sized to max_length
+  } else if (past_present_share_buffer_) {
     shape_[2] = state_.params_->search.max_length;
   }
 

From 2c8b8b21919f70cda87a6aca3bf7ad2c2652d65e Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 19:43:22 -0800
Subject: [PATCH 23/25] Fix using extensions

---
 cmake/deps.txt                            |  2 +-
 examples/python/model-vision.py           | 29 +++++++++------
 src/models/qwen2_5_vl_image_processor.cpp | 43 ++++-------------------
 src/models/qwen_vl_model.cpp              | 32 ++++-------------
 src/models/qwen_vl_vision.cpp             | 18 +++++-----
 5 files changed, 42 insertions(+), 82 deletions(-)

diff --git a/cmake/deps.txt b/cmake/deps.txt
index 5ddf7e7e54..2606531c85 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9424eab4dec2b438642910e27b4f5e9b875b9a5f
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9f1f67d6d075793a0828b24e73d50803eb657e9a
 
 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
index 04a87c2adf..acd95f1ff1 100644
--- a/examples/python/model-vision.py
+++ b/examples/python/model-vision.py
@@ -1,15 +1,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License
 
-try:
-    import winml
-
-    print(winml.register_execution_providers(ort=False, ort_genai=True))
-except ImportError:
-    print("WinML not available, using default execution providers")
-except Exception as e:
-    print(f"Failed to register WinML execution providers: {e}")
-
 import argparse
 import glob
 import json
@@ -23,7 +14,7 @@
 # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)
 
 # Tool-calling system prompt for Qwen/Fara models
-TOOL_CALL_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
+FARA_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
 
 The functions at your disposal are:
 <tools>
@@ -54,6 +45,16 @@ def _complete(text, state):
 
 
 def run(args: argparse.Namespace):
+    if args.use_winml:
+        try:
+            import winml
+
+            print(winml.register_execution_providers(ort=False, ort_genai=True))
+        except ImportError:
+            print("WinML not available, using default execution providers")
+        except Exception as e:
+            print(f"Failed to register WinML execution providers: {e}")
+
     print("Loading model...")
     config = og.Config(args.model_path)
     if args.execution_provider != "follow_config":
@@ -124,7 +125,7 @@ def run(args: argparse.Namespace):
             content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text
             messages.append({"role": "user", "content": content})
         elif model.type in ["qwen2_5_vl", "fara"]:
-            messages.append({"role": "system", "content": TOOL_CALL_SYSTEM_PROMPT})
+            messages.append({"role": "system", "content": FARA_SYSTEM_PROMPT})
             content = "".join(["<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
             messages.append({"role": "user", "content": content})
         else:
@@ -202,5 +203,11 @@ def run(args: argparse.Namespace):
         required=False,
         help="Non-interactive mode, mainly for CI usage",
     )
+    parser.add_argument(
+        "--use-winml",
+        action="store_true",
+        required=False,
+        help="Register WinML execution providers before loading the model",
+    )
     args = parser.parse_args()
     run(args)
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index 6752ea4745..6c7d0a4cd6 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -51,47 +51,16 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
   CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
 
   auto pixel_values_ortvalue = ProcessTensor<float>(pixel_values, allocator);
-  auto pixel_values_type_shape_info = pixel_values_ortvalue->GetTensorTypeAndShapeInfo();
-  auto pixel_values_shape = pixel_values_type_shape_info->GetShape();
   named_tensors->emplace(pixel_values_name_, std::make_shared<Tensor>(std::move(pixel_values_ortvalue)));
 
-  // Check if processor returns grid_thw as second output
   OrtxTensor* grid_thw_tensor = nullptr;
-  auto grid_thw_result = OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor);
-
-  if (grid_thw_result == extError_t::kOrtxOK && grid_thw_tensor != nullptr) {
-    named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
-  } else {
-    // Fallback: calculate grid_thw from pixel_values shape
-    if (pixel_values_shape.size() >= 2) {
-      int64_t batch_size = pixel_values_shape[0];
-      int64_t num_patches = pixel_values_shape[1];
-
-      int64_t grid_t = 1;  // Single frame for static images
-      int64_t grid_h, grid_w;
-
-      grid_h = static_cast<int64_t>(std::sqrt(static_cast<double>(num_patches)));
-      while (grid_h > 0 && num_patches % grid_h != 0) {
-        grid_h--;
-      }
-
-      if (grid_h == 0) {
-        throw std::runtime_error("Failed to factorize num_patches for grid calculation");
-      }
-
-      grid_w = num_patches / grid_h;
-
-      std::vector<int64_t> grid_thw_shape = {batch_size, 3};
-      auto grid_thw_output = OrtValue::CreateTensor<int64_t>(allocator, grid_thw_shape);
-
-      auto* dst = grid_thw_output->GetTensorMutableData<int64_t>();
-      dst[0] = grid_t;
-      dst[1] = grid_h;
-      dst[2] = grid_w;
-
-      named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(std::move(grid_thw_output)));
-    }
+  CheckResult(OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor));
+  
+  if (grid_thw_tensor == nullptr) {
+    throw std::runtime_error("grid_thw output not provided");
   }
+  
+  named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
 
   return named_tensors;
 }
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
index 1b91c309c9..c1c8db2750 100644
--- a/src/models/qwen_vl_model.cpp
+++ b/src/models/qwen_vl_model.cpp
@@ -70,10 +70,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
     }
   }
   if (!pixel_values_val) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision pipeline: pixel_values input not found in extra_inputs");
-    }
-    return;
+    throw std::runtime_error("Vision pipeline: pixel_values input not found in extra_inputs");
   }
 
   auto pixel_type_info = pixel_values_val->GetTensorTypeAndShapeInfo();
@@ -94,10 +91,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
   }
 
   if (!pixel_data) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision pipeline: failed to access pixel_values tensor data");
-    }
-    return;
+    throw std::runtime_error("Vision pipeline: failed to access pixel_values tensor data");
   }
 
   // Extract grid_thw if provided
@@ -136,10 +130,7 @@ void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& ext
 
   auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape();
   if (out_shape.size() != 2) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision pipeline: expected output shape rank 2, got " + std::to_string(out_shape.size()));
-    }
-    return;
+    throw std::runtime_error("Vision pipeline: expected output shape rank 2, got " + std::to_string(out_shape.size()));
   }
 
   auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
@@ -163,10 +154,7 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
                                                       DeviceSpan<int32_t>& input_token_ids) {
   auto it = ortvalue_store_.find(embeddings_output_name);
   if (it == ortvalue_store_.end() || !it->second) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision embedding injection: embeddings output '" + embeddings_output_name + "' not found in ortvalue_store");
-    }
-    return;
+    throw std::runtime_error("Vision embedding injection: embeddings output '" + embeddings_output_name + "' not found in ortvalue_store");
   }
 
   OrtValue* embeddings_ortvalue = it->second.get();
@@ -180,20 +168,14 @@ void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddi
   const int64_t num_vision_tokens = vision_shape[0];
   const int64_t vision_dim = vision_shape[1];
   if (vision_dim != embedding_dim) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) +
-                         ", embedding_dim=" + std::to_string(embedding_dim));
-    }
-    return;
+    throw std::runtime_error("Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) +
+                             ", embedding_dim=" + std::to_string(embedding_dim));
   }
 
   constexpr int32_t image_token_id = 151655;
 
   if (!input_ids_ || !input_ids_->Get()) {
-    if (g_log.enabled && g_log.warning) {
-      Log("warning", "Vision embedding injection: input_ids not available");
-    }
-    return;
+    throw std::runtime_error("Vision embedding injection: input_ids not available");
   }
 
   OrtValue* input_ids_ortvalue = input_ids_->Get();
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 0c9d664678..7a6804edcc 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -46,13 +46,20 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
     auto so = OrtSessionOptions::Create();
 
     so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
+    
     // QNN provider options
-    const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
-    const char* values[] = {qnn_backend_path_.c_str(), "burst", "3", "60"};
+    std::unordered_map<std::string, std::string> qnn_options = {
+        {"backend_path", qnn_backend_path_},
+        {"htp_performance_mode", "burst"},
+        {"htp_graph_finalization_optimization_mode", "3"},
+        {"soc_model", "60"}
+    };
 
     auto providers = Ort::GetAvailableProviders();
     bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
     if (has_qnn) {
+      const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
+      const char* values[] = {qnn_backend_path_.c_str(), "burst", "3", "60"};
       so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
     } else {
       // Use registered QNN EP - use GenAI wrapper APIs
@@ -68,13 +75,8 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
 
       if (qnn_devices.empty()) {
         throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
-      } else {
-        Ort::api->SessionOptionsAppendExecutionProvider_V2(
-            so.get(),
-            &GetOrtEnv(),
-            qnn_devices.data(), qnn_devices.size(),
-            keys, values, 4);
       }
+      so->AppendExecutionProvider_V2(GetOrtEnv(), qnn_devices, qnn_options);
     }
 
     vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), so.get());

From ff44339a1cf56d40b4b819e9ad0fca2ee5764f6c Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 20:42:10 -0800
Subject: [PATCH 24/25] Address comments

---
 src/models/kv_cache.cpp | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index bd85d800a0..2d3a1f13be 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -273,13 +273,8 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
     // Per-layer allocation with per-layer capacity constraints
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
       std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
-
-      // With buffer sharing: use full capacity (buffers are reused)
-      // Without buffer sharing: use actual length for memory efficiency
-      if (!past_present_share_buffer_) {
-        const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
-        current_shape[2] = std::min(total_length, max_cache_length);
-      }
+      const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
+      current_shape[2] = std::min(total_length, max_cache_length);
 
       // Key tensor
       presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
@@ -291,11 +286,7 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
     }
   } else {
     // Uniform allocation
-    // With buffer sharing: use full capacity (buffers are reused)
-    // Without buffer sharing: use actual length for memory efficiency
-    if (!past_present_share_buffer_) {
-      shape_[2] = total_length;
-    }
+    shape_[2] = total_length;
     for (int i = 0; i < layer_count_ * 2; i++) {
       presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
       state_.outputs_[output_index_ + i] = presents_[i].get();

From 0dfda54c40a65fc4edacc327220cc8fb13fd2c14 Mon Sep 17 00:00:00 2001
From: Akshay Sonawane <asonawane@microsoft.com>
Date: Thu, 4 Dec 2025 20:46:46 -0800
Subject: [PATCH 25/25] Fix linter errors

---
 src/models/qwen2_5_vl_image_processor.cpp | 4 ++--
 src/models/qwen_vl_vision.cpp             | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
index 6c7d0a4cd6..599ff946da 100644
--- a/src/models/qwen2_5_vl_image_processor.cpp
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -55,11 +55,11 @@ std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer&
 
   OrtxTensor* grid_thw_tensor = nullptr;
   CheckResult(OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor));
-  
+
   if (grid_thw_tensor == nullptr) {
     throw std::runtime_error("grid_thw output not provided");
   }
-  
+
   named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
 
   return named_tensors;
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
index 7a6804edcc..ca267f00e5 100644
--- a/src/models/qwen_vl_vision.cpp
+++ b/src/models/qwen_vl_vision.cpp
@@ -46,14 +46,13 @@ QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
     auto so = OrtSessionOptions::Create();
 
     so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
-    
+
     // QNN provider options
     std::unordered_map<std::string, std::string> qnn_options = {
         {"backend_path", qnn_backend_path_},
         {"htp_performance_mode", "burst"},
         {"htp_graph_finalization_optimization_mode", "3"},
-        {"soc_model", "60"}
-    };
+        {"soc_model", "60"}};
 
     auto providers = Ort::GetAvailableProviders();
     bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();