diff --git a/cmake/deps.txt b/cmake/deps.txt
index 7e50996352..2606531c85 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;245f6667babf9668b862ac4513c69ea95117c295
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9f1f67d6d075793a0828b24e73d50803eb657e9a
 
 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py
index 1fd73a9db2..acd95f1ff1 100644
--- a/examples/python/model-vision.py
+++ b/examples/python/model-vision.py
@@ -5,6 +5,7 @@
 import glob
 import json
 import os
+import readline
 import time
 from pathlib import Path
 
@@ -12,6 +13,20 @@
 
 # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True)
 
+# Tool-calling system prompt for Qwen/Fara models
+FARA_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls.
+
+The functions at your disposal are:
+<tools>
+{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\\n* `key`: Press keyboard keys, like \\"Enter\\", \\"Alt\\", \\"Shift\\", \\"Tab\\", \\"Control\\", \\"Backspace\\", \\"Delete\\", \\"Escape\\", etc. 
Keys are pressed down in the order given, then released in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `visit_url`: Visit a specified URL.\\n* `web_search`: Perform a web search with a specified query.\\n* `history_back`: Go back to the previous page in the browser history.\\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. 
Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}}
+</tools>
+
+To make a function call, you should output a json object inside <tool_call></tool_call> XML tags. The json object must contain the function name and its arguments, like this:
+<tool_call>
+{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}
+</tool_call>
+"""
+
 
 def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
     curr_path = Path(current_dir).absolute()
@@ -26,10 +41,20 @@ def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
 
 
 def _complete(text, state):
-    return (glob.glob(text + "*") + [None])[state]
+    return [*glob.glob(text + "*"), None][state]
 
 
 def run(args: argparse.Namespace):
+    if args.use_winml:
+        try:
+            import winml
+
+            print(winml.register_execution_providers(ort=False, ort_genai=True))
+        except ImportError:
+            print("WinML not available, using default execution providers")
+        except Exception as e:
+            print(f"Failed to register WinML execution providers: {e}")
+
     print("Loading model...")
     config = og.Config(args.model_path)
     if args.execution_provider != "follow_config":
@@ -49,8 +74,6 @@ def run(args: argparse.Namespace):
     while True:
         if interactive:
             try:
-                import readline
-
                 readline.set_completer_delims(" \t\n;")
                 readline.parse_and_bind("tab: complete")
                 readline.set_completer(_complete)
@@ -80,7 +103,7 @@ def run(args: argparse.Namespace):
         if len(image_paths) == 0:
             print("No image provided")
         else:
-            for i, image_path in enumerate(image_paths):
+            for _, image_path in enumerate(image_paths):
                 if not os.path.exists(image_path):
                     raise FileNotFoundError(f"Image file not found: {image_path}")
                 print(f"Using image: {image_path}")
@@ -101,6 +124,10 @@ def run(args: argparse.Namespace):
             # Combine all image tags and text into one user message
             content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text
             messages.append({"role": "user", "content": content})
+        elif model.type in ["qwen2_5_vl", "fara"]:
+            messages.append({"role": "system", "content": FARA_SYSTEM_PROMPT})
+            content = "".join(["<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text
+            messages.append({"role": "user", "content": content})
         else:
             # Gemma3-style multimodal: structured content
             content_list = [{"type": "image"} for _ in image_paths]
@@ -116,7 +143,8 @@ def run(args: argparse.Namespace):
 
         print("Generating response...")
         params = og.GeneratorParams(model)
-        params.set_search_options(max_length=7680)
+        max_length = args.max_length if args.max_length else 7680
+        params.set_search_options(max_length=max_length)
 
         generator = og.Generator(model, params)
         generator.set_inputs(inputs)
@@ -162,11 +190,24 @@ def run(args: argparse.Namespace):
     parser.add_argument(
         "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage"
     )
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum generation length. Defaults to model's context_length from config.",
+    )
     parser.add_argument(
         "--non-interactive",
         action=argparse.BooleanOptionalAction,
         required=False,
         help="Non-interactive mode, mainly for CI usage",
     )
+    parser.add_argument(
+        "--use-winml",
+        action="store_true",
+        required=False,
+        help="Register WinML execution providers before loading the model",
+    )
     args = parser.parse_args()
     run(args)
diff --git a/src/config.cpp b/src/config.cpp
index 7087819d86..cd5fdf5cf2 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -587,6 +587,11 @@ struct Decoder_Element : JSON::Element {
       v_.sliding_window = Config::Model::Decoder::SlidingWindow{};
       return sliding_window_;
     }
+    // Support object-style pipeline: "pipeline": { "embeddings": { ... }, ... }
+    if (name == "pipeline") {
+      pipeline_object_ = std::make_unique<PipelineModelObject_Element>(v_.pipeline);
+      return *pipeline_object_;
+    }
     throw JSON::unknown_value_error{};
   }
 
@@ -605,6 +610,7 @@ struct Decoder_Element : JSON::Element {
   DecoderOutputs_Element outputs_{v_.outputs};
   Pipeline_Element pipeline_{v_.pipeline};
   SlidingWindow_Element sliding_window_{v_.sliding_window};
+  std::unique_ptr<PipelineModelObject_Element> pipeline_object_;  // object-style pipeline support
 };
 
 struct VisionInputs_Element : JSON::Element {
@@ -615,6 +621,8 @@ struct VisionInputs_Element : JSON::Element {
       v_.pixel_values = JSON::Get<std::string_view>(value);
     } else if (name == "image_sizes") {
       v_.image_sizes = JSON::Get<std::string_view>(value);
+    } else if (name == "image_grid_thw") {
+      v_.image_grid_thw = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
     } else {
@@ -641,6 +649,77 @@ struct VisionOutputs_Element : JSON::Element {
   Config::Model::Vision::Outputs& v_;
 };
 
+// Vision pipeline support structures. VisionPipelineModel_Element parses one stage object into a Vision::PipelineModel.
+struct VisionPipelineModel_Element : JSON::Element {
+  explicit VisionPipelineModel_Element(Config::Model::Vision::PipelineModel& v) : v_{v} {}
+  // Scalar keys "filename" and "run_on_cpu" are stored on the model; any other key throws JSON::unknown_value_error.
+  void OnValue(std::string_view name, JSON::Value value) override {
+    if (name == "filename") {
+      v_.filename = JSON::Get<std::string_view>(value);
+    } else if (name == "run_on_cpu") {
+      v_.run_on_cpu = JSON::Get<bool>(value);
+    } else {
+      throw JSON::unknown_value_error{};
+    }
+  }
+  // Object keys "session_options" and "run_options": the optional is reset to a default value, then parsed into.
+  Element& OnObject(std::string_view name) override {
+    if (name == "session_options") {
+      v_.session_options = Config::SessionOptions{};
+      session_options_ = std::make_unique<SessionOptions_Element>(*v_.session_options);
+      return *session_options_;
+    }
+    if (name == "run_options") {
+      v_.run_options = Config::RunOptions{};
+      run_options_ = std::make_unique<RunOptions_Element>(*v_.run_options);
+      return *run_options_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+  // Array keys "inputs" and "outputs" are routed to the elements bound to v_.inputs and v_.outputs.
+  Element& OnArray(std::string_view name) override {
+    if (name == "inputs") {
+      return inputs_;
+    }
+    if (name == "outputs") {
+      return outputs_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
+ private:
+  Config::Model::Vision::PipelineModel& v_;
+  std::unique_ptr<SessionOptions_Element> session_options_;
+  std::unique_ptr<RunOptions_Element> run_options_;
+  StringArray_Element inputs_{v_.inputs};  // initialized from v_, so must stay declared after it
+  StringArray_Element outputs_{v_.outputs};
+};
+
+struct VisionPipelineModelObject_Element : JSON::Element {
+  explicit VisionPipelineModelObject_Element(std::vector<Config::Model::Vision::PipelineModel>& v) : v_{v} {}
+  // Each child object appends a PipelineModel whose model_id is the child's key, then is parsed into that model.
+  Element& OnObject(std::string_view name) override {
+    auto& model = v_.emplace_back();
+    model.model_id = name;
+    elements_.emplace_back(model);  // NOTE(review): stores a reference into v_; later v_ growth may invalidate it — confirm earlier stages are never revisited
+    return elements_.back();
+  }
+
+ private:
+  std::vector<Config::Model::Vision::PipelineModel>& v_;
+  std::vector<VisionPipelineModel_Element> elements_;  // owns the per-stage element returned from OnObject
+};
+
+struct VisionPipeline_Element : JSON::Element {
+  explicit VisionPipeline_Element(std::vector<Config::Model::Vision::PipelineModel>& v) : v_{v} {}
+  // Array-style pipeline: every object entry is handed to object_, which reads stage ids from that entry's keys.
+  Element& OnObject(std::string_view /*name*/) override { return object_; }
+
+ private:
+  std::vector<Config::Model::Vision::PipelineModel>& v_;
+  VisionPipelineModelObject_Element object_{v_};  // initialized from v_, so must stay declared after it
+};
+
 struct Vision_Element : JSON::Element {
   explicit Vision_Element(Config::Model::Vision& v) : v_{v} {}
 
@@ -673,6 +752,18 @@ struct Vision_Element : JSON::Element {
     if (name == "outputs") {
       return outputs_;
     }
+    // Support object-style pipeline for vision: "pipeline": { "patch_embed": { ... }, ... }
+    if (name == "pipeline") {
+      vision_pipeline_object_ = std::make_unique<VisionPipelineModelObject_Element>(v_.pipeline);
+      return *vision_pipeline_object_;
+    }
+    throw JSON::unknown_value_error{};
+  }
+
+  Element& OnArray(std::string_view name) override {
+    if (name == "pipeline") {
+      return pipeline_element_;
+    }
     throw JSON::unknown_value_error{};
   }
 
@@ -682,6 +773,8 @@ struct Vision_Element : JSON::Element {
   std::unique_ptr<RunOptions_Element> run_options_;
   VisionInputs_Element inputs_{v_.inputs};
   VisionOutputs_Element outputs_{v_.outputs};
+  VisionPipeline_Element pipeline_element_{v_.pipeline};
+  std::unique_ptr<VisionPipelineModelObject_Element> vision_pipeline_object_;  // object-style pipeline support
 };
 
 struct SpeechInputs_Element : JSON::Element {
@@ -1212,19 +1305,14 @@ void ClearDecoderProviderOptionsHardwareVendorId(Config& config, std::string_vie
 struct Root_Element : JSON::Element {
   explicit Root_Element(Config& config) : config_{config} {}
 
-  void OnValue(std::string_view name, JSON::Value value) override {
+  void OnValue(std::string_view /*name*/, JSON::Value /*value*/) override {
+    // No top-level scalar values currently supported
   }
 
   Element& OnObject(std::string_view name) override {
-    if (name == "model") {
-      return model_element_;
-    }
-    if (name == "search") {
-      return search_element_;
-    }
-    if (name == "engine") {
-      return engine_element_;
-    }
+    if (name == "model") return model_element_;
+    if (name == "search") return search_element_;
+    if (name == "engine") return engine_element_;
     throw JSON::unknown_value_error{};
   }
 
diff --git a/src/config.h b/src/config.h
index 507d7c80c1..ccac038c12 100644
--- a/src/config.h
+++ b/src/config.h
@@ -159,9 +159,22 @@ struct Config {
       std::string config_filename{"processor_config.json"};
       std::optional<std::string> adapter_filename{};
 
+      // Vision pipeline support (patch embed -> vision attn -> patch merger)
+      struct PipelineModel {
+        std::string filename;                           // Model file name; resolved against the config directory
+        std::optional<SessionOptions> session_options;  // Set only when the stage has a "session_options" object
+        std::optional<RunOptions> run_options;          // Set only when the stage has a "run_options" object
+        std::string model_id;                           // Stage key in the config; used to link outputs to subsequent stages
+        std::vector<std::string> inputs;                // Graph input names
+        std::vector<std::string> outputs;               // Graph output names
+        bool run_on_cpu{false};                         // If true force CPU EP when multiple EPs are configured
+      };
+      std::vector<PipelineModel> pipeline;  // Ordered pipeline models
+
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
+        std::string image_grid_thw{Defaults::ImageSizesName};          // Qwen2.5-VL uses image_grid_thw, defaults to image_sizes
         std::string attention_mask{Defaults::ImageAttentionMaskName};  // image attention mask
       } inputs;
 
diff --git a/src/generators.cpp b/src/generators.cpp
index 85cd5cd26a..d19751217c 100644
--- a/src/generators.cpp
+++ b/src/generators.cpp
@@ -318,14 +318,24 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
 
   auto input_ids_device = state_->params_->p_device->Allocate<int32_t>(padded_input_ids_size);
   auto cpu_span = input_ids_device.CpuSpan();
-  auto padding_begin = cpu_span.begin();
-  auto data_end = cpu_span.end();
-  if (model_->config_->model.decoder.sliding_window.has_value() && model_->config_->model.decoder.sliding_window->alignment == "left") {
-    padding_begin = cpu_span.begin() + input_ids.size();
-    data_end = padding_begin;
+
+  // Handle padding based on alignment setting for sliding window models
+  if (padded_input_ids_size > input_ids.size()) {
+    const bool left_align = model_->config_->model.decoder.sliding_window.has_value() &&
+                            model_->config_->model.decoder.sliding_window->alignment == "left";
+
+    if (left_align) {
+      // Left alignment: padding first, then data
+      std::fill_n(cpu_span.begin(), padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id);
+      std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin() + (padded_input_ids_size - input_ids.size()));
+    } else {
+      // Right alignment (default): data first, then padding
+      std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
+      std::fill(cpu_span.begin() + input_ids.size(), cpu_span.end(), model_->config_->model.pad_token_id);
+    }
+  } else {
+    std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin());
   }
-  std::fill_n(padding_begin, padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id);
-  std::copy_backward(input_ids.begin(), input_ids.end(), data_end);
   input_ids_device.CopyCpuToDevice();
   return input_ids_device;
 }
diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp
index ac12572a0c..b7a571d586 100644
--- a/src/models/decoder_only.cpp
+++ b/src/models/decoder_only.cpp
@@ -16,9 +16,9 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan<
     : State{params, model},
       model_{model},
       kv_cache_(CreateKeyValueCache(*this)),
-      position_inputs_{model, *this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask} {
+      position_inputs_{CreatePositionInputs(*this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask)} {
   input_ids_.Add();
-  position_inputs_.Add();
+  position_inputs_->Add();
   logits_.Add();
   kv_cache_->Add();
 }
@@ -79,15 +79,35 @@ DeviceSpan<float> DecoderOnly_State::RunWithChunking(int total_length, DeviceSpa
 }
 
 void DecoderOnly_State::RewindTo(size_t index) {
-  position_inputs_.RewindTo(index);
+  position_inputs_->RewindTo(index);
   kv_cache_->RewindTo(index);
 }
 
 void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> beam_indices, int total_length) {
   input_ids_.Update(next_tokens);
   size_t new_length = static_cast<size_t>(input_ids_.GetShape()[1]);
-  position_inputs_.Update(next_tokens, total_length, static_cast<int>(new_length));
-  kv_cache_->Update(beam_indices, total_length);
+
+  // Determine effective lengths for position_ids and KV cache based on sliding window config
+  int position_length = total_length;
+  int kv_cache_length = total_length;
+
+  if (model_.config_->model.decoder.sliding_window.has_value() &&
+      model_.config_->model.decoder.sliding_window->window_size > 0) {
+    const int window_size = model_.config_->model.decoder.sliding_window->window_size;
+
+    // Position IDs are clamped when slide_inputs is true
+    if (model_.config_->model.decoder.sliding_window->slide_inputs) {
+      position_length = std::min(total_length, window_size);
+    }
+
+    // KV cache is clamped when slide_key_value_cache is true
+    if (model_.config_->model.decoder.sliding_window->slide_key_value_cache) {
+      kv_cache_length = std::min(total_length, window_size);
+    }
+  }
+
+  position_inputs_->Update(next_tokens, position_length, static_cast<int>(new_length));
+  kv_cache_->Update(beam_indices, kv_cache_length);
   logits_.Update(next_tokens, new_length);
 }
 
diff --git a/src/models/decoder_only.h b/src/models/decoder_only.h
index a61fb2b8be..0869756ae2 100644
--- a/src/models/decoder_only.h
+++ b/src/models/decoder_only.h
@@ -36,7 +36,7 @@ struct DecoderOnly_State : State {
   DefaultInputIDs input_ids_{*this};
   Logits logits_{*this};
   std::unique_ptr<KeyValueCache> kv_cache_;
-  DefaultPositionInputs position_inputs_;
+  std::unique_ptr<PositionInputs> position_inputs_;
   ExtraInputs extra_inputs_{*this};
 };
 
diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp
index 6e996192b9..497bd1b295 100644
--- a/src/models/decoder_only_pipeline.cpp
+++ b/src/models/decoder_only_pipeline.cpp
@@ -112,8 +112,8 @@ DecoderOnlyPipelineState::DecoderOnlyPipelineState(const DecoderOnlyPipelineMode
                                                    DeviceSpan<int32_t> sequence_lengths,
                                                    const GeneratorParams& params)
     : State{params, model},
-      model_{model},
       input_ids_{CreateInputIDs(*this)},
+      model_{model},
       key_value_cache_{CreateKeyValueCache(*this)},
       do_key_value_cache_partial_update_{key_value_cache_ && key_value_cache_->IsPartialUpdateSupported()},
       position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {
diff --git a/src/models/decoder_only_pipeline.h b/src/models/decoder_only_pipeline.h
index 70160be432..17c173e711 100644
--- a/src/models/decoder_only_pipeline.h
+++ b/src/models/decoder_only_pipeline.h
@@ -69,6 +69,17 @@ struct DecoderOnlyPipelineState : State {
   void RunPipeline(int total_length, DeviceSpan<int32_t>& next_tokens,
                    DeviceSpan<int32_t> next_indices, bool is_last_chunk);
 
+ protected:
+  // Virtual hook called after each pipeline stage completes, before next stage starts.
+  // Allows derived classes to modify stage outputs (e.g., inject vision embeddings).
+  // stage_id: ID of the stage that just completed
+  // next_tokens: current input tokens for pipeline
+  virtual void OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {}
+
+  // Stores all the outputs from the previous pipeline state(s)
+  std::unordered_map<std::string, std::unique_ptr<OrtValue>> ortvalue_store_;
+  std::unique_ptr<InputIDs> input_ids_;  // Made protected for derived class access
+
  private:
   void UpdateKeyValueCache(DeviceSpan<int32_t> beam_indices, int total_length);
 
@@ -86,10 +97,6 @@ struct DecoderOnlyPipelineState : State {
   std::map<size_t, size_t> pipeline_state_id_to_partial_kv_cache_update_record_idx_;
   std::vector<PartialKeyValueCacheUpdateRecord> partial_kv_cache_update_records_;
 
-  // Stores all the outputs from the previous pipeline state(s)
-  std::unordered_map<std::string, std::unique_ptr<OrtValue>> ortvalue_store_;
-
-  std::unique_ptr<InputIDs> input_ids_;
   Logits logits_{*this};
 
   std::unique_ptr<KeyValueCache> key_value_cache_;
diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index dbdbd828d6..2d3a1f13be 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -270,13 +270,11 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
   }
 
   if (!layer_shapes_.empty()) {
-    // Update per-layer shapes based on total_length, but respect max allocations
+    // Per-layer allocation with per-layer capacity constraints
     for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) {
-      const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
-      const int actual_length = std::min(total_length, max_cache_length);
-
       std::array<int64_t, 4> current_shape = layer_shapes_[layer_idx];
-      current_shape[2] = actual_length;
+      const int max_cache_length = static_cast<int>(layer_shapes_[layer_idx][2]);
+      current_shape[2] = std::min(total_length, max_cache_length);
 
       // Key tensor
       presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_);
@@ -287,7 +285,7 @@ void DefaultKeyValueCache::Update(DeviceSpan<int32_t> beam_indices, int total_le
       state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get();
     }
   } else {
-    // Uniform shape update (existing behavior)
+    // Uniform allocation
     shape_[2] = total_length;
     for (int i = 0; i < layer_count_ * 2; i++) {
       presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_);
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 77e6c82657..adc355f970 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -19,6 +19,8 @@
 #include "multi_modal.h"
 #include "marian.h"
 #include "decoder_only_pipeline.h"
+#include "qwen_vl_model.h"
+#include "qwen2_5_vl_image_processor.h"
 #include "../dml/interface.h"
 
 #if defined(_WIN32)
@@ -1193,6 +1195,8 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
 }
 
 std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
+  if (config->model.type == "fara" || config->model.type == "qwen2_5_vl")
+    return std::make_shared<Qwen2_5_VL_PipelineModel>(std::move(config), ort_env);
   if (config->model.type == "gpt2")
     return std::make_shared<Gpt_Model>(std::move(config), ort_env);
   if (ModelType::IsLLM(config->model.type))
@@ -1288,7 +1292,9 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"phi3v", Processor::Create<PhiImageProcessor>},
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
-          {"gemma3", Processor::Create<GemmaImageProcessor>}} {
+          {"gemma3", Processor::Create<GemmaImageProcessor>},
+          {"fara", Processor::Create<Qwen2_5VLImageProcessor>},
+          {"qwen2_5_vl", Processor::Create<Qwen2_5VLImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
     processor_ = processor->second(config, session_info);
diff --git a/src/models/model_type.h b/src/models/model_type.h
index c7c4d2f691..a21c4156cf 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,7 +18,7 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 2> VLM = {"gemma3", "phi3v"};
+    static constexpr std::array<std::string_view, 4> VLM = {"fara", "gemma3", "phi3v", "qwen2_5_vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 
diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp
new file mode 100644
index 0000000000..599ff946da
--- /dev/null
+++ b/src/models/qwen2_5_vl_image_processor.cpp
@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "../generators.h"
+#include "model.h"
+#include "qwen2_5_vl_image_processor.h"
+#include <numeric>
+
+namespace Generators {
+
+Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info) {
+  // Honor model.vision.config_filename (default "processor_config.json") instead of a hard-coded file name.
+  const auto processor_config_path = config.config_path / fs::path(config.model.vision.config_filename);
+  if (!fs::exists(processor_config_path)) {
+    throw std::runtime_error(config.model.vision.config_filename + " not found at: " + processor_config_path.string());
+  }
+  CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config_path.string().c_str()));
+
+  // Adopt the session's actual input names when present; otherwise the member defaults stay in effect.
+  for (const auto& input_name : session_info.GetInputNames()) {
+    if (input_name.find("pixel_values") != std::string::npos) {
+      pixel_values_name_ = input_name;
+    } else if (input_name.find("image_grid_thw") != std::string::npos) {
+      image_grid_thw_name_ = input_name;
+    }
+  }
+}
+
+std::unique_ptr<NamedTensors> Qwen2_5VLImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
+  if (!payload.images) {
+    throw std::runtime_error("No images provided to Qwen2.5VLImageProcessor");
+  }
+  // Outputs built below: input ids from the prompt, then the pixel values and grid tensor from the images.
+  std::string prompt = std::string(payload.prompt);
+  Ort::Allocator& allocator{Ort::Allocator::GetWithDefaultOptions()};
+  auto named_tensors = std::make_unique<NamedTensors>();
+  // Token ids are copied into a [1, token_count] int32 tensor stored under Config::Defaults::InputIdsName.
+  const std::vector<int32_t> input_ids = tokenizer.Encode(prompt.c_str());
+  std::unique_ptr<OrtValue> input_ids_value = OrtValue::CreateTensor<int32_t>(
+      allocator, std::vector<int64_t>{1, static_cast<int64_t>(input_ids.size())});
+  std::copy(input_ids.begin(), input_ids.end(), input_ids_value->GetTensorMutableData<int32_t>());
+  named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared<Tensor>(std::move(input_ids_value)));
+
+  // Run image preprocessing using onnxruntime-extensions
+  // This will execute the full pipeline from processor_config.json:
+  // DecodeImage -> ConvertRGB -> Resize (smart_resize) -> Rescale -> Normalize -> PatchImage
+  ort_extensions::OrtxObjectPtr<OrtxTensorResult> result;
+  CheckResult(OrtxImagePreProcess(processor_.get(), payload.images->images_.get(), result.ToBeAssigned()));
+  // Result index 0 is converted as a float tensor and stored under pixel_values_name_.
+  OrtxTensor* pixel_values = nullptr;
+  CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
+
+  auto pixel_values_ortvalue = ProcessTensor<float>(pixel_values, allocator);
+  named_tensors->emplace(pixel_values_name_, std::make_shared<Tensor>(std::move(pixel_values_ortvalue)));
+  // Result index 1 is converted as an int64 tensor and stored under image_grid_thw_name_.
+  OrtxTensor* grid_thw_tensor = nullptr;
+  CheckResult(OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor));
+
+  if (grid_thw_tensor == nullptr) {
+    throw std::runtime_error("grid_thw output not provided");
+  }
+
+  named_tensors->emplace(image_grid_thw_name_, std::make_shared<Tensor>(ProcessTensor<int64_t>(grid_thw_tensor, allocator)));
+
+  return named_tensors;
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen2_5_vl_image_processor.h b/src/models/qwen2_5_vl_image_processor.h
new file mode 100644
index 0000000000..85430dbb5d
--- /dev/null
+++ b/src/models/qwen2_5_vl_image_processor.h
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "model.h"
+#include "processor.h"
+#include "ortx_processor.h"
+
+namespace Generators {
+
+struct Qwen2_5VLImageProcessor : Processor {
+  Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info);
+  // Returns named tensors for the prompt's input ids plus the pixel values and image grid of the payload images.
+  std::unique_ptr<NamedTensors> Process(const Tokenizer& tokenizer, const Payload& payload) const override;
+
+ private:
+  ort_extensions::OrtxObjectPtr<OrtxProcessor> processor_;
+  std::string pixel_values_name_{"pixel_values"};      // default; the constructor may replace it with a session input name
+  std::string image_grid_thw_name_{"image_grid_thw"};  // default; the constructor may replace it with a session input name
+};
+
+}  // namespace Generators
diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp
new file mode 100644
index 0000000000..c1c8db2750
--- /dev/null
+++ b/src/models/qwen_vl_model.cpp
@@ -0,0 +1,206 @@
+#include "qwen_vl_model.h"
+#include "model.h"
+#include "onnxruntime_api.h"
+#include "../logging.h"
+#include <iostream>
+#include <cstring>
+#include <algorithm>
+
+namespace Generators {
+
+Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env)
+    : DecoderOnlyPipelineModel(std::move(config), ort_env) {
+  // The vision pipeline is optional: vision_pipeline_ stays null when no vision stages are configured.
+  const auto& vision_stages = config_->model.vision.pipeline;
+  if (vision_stages.empty()) return;
+
+  // Resolves the model file of the first stage whose model_id equals `id`; empty string when there is none.
+  auto resolve_stage_path = [&](const std::string& id) -> std::string {
+    const auto match = std::find_if(vision_stages.begin(), vision_stages.end(),
+                                    [&id](const auto& stage) { return stage.model_id == id; });
+    if (match == vision_stages.end()) return "";
+    return (config_->config_path / fs::path(match->filename)).string();
+  };
+
+  const auto patch_embed_path = resolve_stage_path("patch_embed");
+  const auto vision_attn_path = resolve_stage_path("vision_attn");
+  const auto patch_merger_path = resolve_stage_path("patch_merger");
+
+  // All three stages are required; with any of them missing the model is left without a vision pipeline.
+  const bool missing_stage = patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty();
+  if (missing_stage) return;
+
+  // The attention stage is handed to QNN when any "vision_attn" entry is not marked run_on_cpu.
+  bool use_qnn_attn = false;
+  for (const auto& stage : vision_stages) {
+    if (stage.model_id == "vision_attn" && !stage.run_on_cpu) use_qnn_attn = true;
+  }
+
+  constexpr int spatial_merge = 2;  // Default spatial merge size
+  vision_pipeline_ = std::make_unique<QwenVisionPipeline>(ort_env, patch_embed_path, vision_attn_path,
+                                                          patch_merger_path, spatial_merge, use_qnn_attn);
+}
+
+std::unique_ptr<State> Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params) const {
+  // Every state is handed this model so that it can reach the shared vision pipeline.
+  return std::make_unique<Qwen2_5_VL_PipelineState>(*this, sequence_lengths, params);
+}
+
+// Builds the decoder pipeline state and keeps a reference to the owning model for access to its vision pipeline.
+Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+                                                   DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params)
+    : DecoderOnlyPipelineState(model, sequence_lengths, params),
+      vl_model_{model} {}
+
+// Forwards the extra inputs to the decoder pipeline state. Then, unless vision already ran or the model
+// has no vision pipeline, feeds pixel_values (and the optional grid_thw) through the vision pipeline and
+// wraps the returned features in image_features_value_. Throws std::runtime_error if pixel_values is
+// missing or unreadable, if the vision pipeline fails, or if its reported output shape is not rank 2.
+void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) {
+  DecoderOnlyPipelineState::SetExtraInputs(extra_inputs);
+
+  // Vision runs at most once per state, and only when the model has a vision pipeline.
+  if (vision_ran_) return;
+  auto* vision_pipeline = vl_model_.vision_pipeline_.get();
+  if (vision_pipeline == nullptr) return;
+
+  // Pick the vision tensors out of the extra inputs by their configured names.
+  const auto& vision_inputs = vl_model_.config_->model.vision.inputs;
+  OrtValue* pixels = nullptr;
+  OrtValue* grid = nullptr;
+  for (const auto& extra : extra_inputs) {
+    if (extra.name == vision_inputs.pixel_values) {
+      pixels = extra.tensor->GetOrtTensor();
+    } else if (extra.name == vision_inputs.image_grid_thw) {
+      grid = extra.tensor->GetOrtTensor();
+    }
+  }
+  if (pixels == nullptr) {
+    throw std::runtime_error("Vision pipeline: pixel_values input not found in extra_inputs");
+  }
+
+  auto pixel_info = pixels->GetTensorTypeAndShapeInfo();
+  auto pixel_dims = pixel_info->GetShape();
+  std::vector<int64_t> pixel_shape_vec(pixel_dims.begin(), pixel_dims.end());
+
+  // The vision pipeline reads float32. Any other element type is converted with Cast(); the converted
+  // tensor is kept in a local so the data pointer stays valid while Run() executes.
+  std::unique_ptr<OrtValue> pixels_fp32;
+  OrtValue* float_pixels = pixels;
+  if (pixel_info->GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+    Cast(*pixels, pixels_fp32, *vl_model_.p_device_inputs_, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+    float_pixels = pixels_fp32.get();
+  }
+  const float* pixel_data = float_pixels->GetTensorData<float>();
+  if (pixel_data == nullptr) {
+    throw std::runtime_error("Vision pipeline: failed to access pixel_values tensor data");
+  }
+
+  // grid_thw is optional. From an int64 or int32 tensor keep the trailing three values as [t, h, w];
+  // any other element type, or fewer than three elements, leaves grid_thw empty.
+  std::vector<int64_t> grid_thw;
+  if (grid != nullptr) {
+    auto grid_dims = grid->GetTensorTypeAndShapeInfo()->GetShape();
+    auto grid_type = grid->GetTensorTypeAndShapeInfo()->GetElementType();
+
+    size_t grid_count = 1;
+    for (auto dim : grid_dims) grid_count *= dim;
+
+    if (grid_count >= 3) {
+      const size_t first = grid_count - 3;
+      if (grid_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
+        const int64_t* values = grid->GetTensorData<int64_t>();
+        grid_thw.assign(values + first, values + grid_count);
+      } else if (grid_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) {
+        const int32_t* values = grid->GetTensorData<int32_t>();
+        grid_thw.assign(values + first, values + grid_count);
+      }
+    }
+  }
+
+  // Exceptions derived from std::exception are rethrown as std::runtime_error with a pipeline prefix.
+  try {
+    image_features_buffer_ = vision_pipeline->Run(pixel_data, pixel_shape_vec, grid_thw);
+  } catch (const std::exception& e) {
+    throw std::runtime_error(std::string("Vision pipeline failed: ") + e.what());
+  }
+
+  // The pipeline reports the shape of what it just returned; a rank-2 shape is required.
+  auto features_shape = vision_pipeline->GetLastOutputShape();
+  if (features_shape.size() != 2) {
+    throw std::runtime_error("Vision pipeline: expected output shape rank 2, got " + std::to_string(features_shape.size()));
+  }
+
+  // The OrtValue is created over image_features_buffer_, so that buffer has to stay alive and unmodified
+  // for as long as image_features_value_ is in use.
+  auto cpu_memory = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+  std::span<float> features_span(image_features_buffer_.data(), image_features_buffer_.size());
+  std::span<const int64_t> dims_span(features_shape.data(), features_shape.size());
+  image_features_value_ = OrtValue::CreateTensor<float>(*cpu_memory, features_span, dims_span);
+
+  vision_ran_ = true;
+}
+
+// Called with the id of a completed stage: once vision has run, patches the image features into stage 0's first output.
+void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) {
+  const bool first_stage_done = (stage_id == 0);
+  if (!first_stage_done || !vision_ran_) return;
+  const auto& stage_outputs = vl_model_.config_->model.decoder.pipeline[0].outputs;
+  if (stage_outputs.empty()) return;
+  InjectVisionEmbeddings(stage_outputs[0], next_tokens);
+}
+
+// Copies the next unused vision feature rows over the embedding rows of image placeholder tokens (ids come from input_ids_).
+// Throws std::runtime_error if the embeddings output or input_ids is missing or has an unsupported layout.
+void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name,
+                                                      DeviceSpan<int32_t>& input_token_ids) {
+  auto it = ortvalue_store_.find(embeddings_output_name);
+  if (it == ortvalue_store_.end() || !it->second) {
+    throw std::runtime_error("Vision embedding injection: embeddings output '" + embeddings_output_name + "' not found in ortvalue_store");
+  }
+  OrtValue* embeddings_ortvalue = it->second.get();
+  auto embeddings_info = embeddings_ortvalue->GetTensorTypeAndShapeInfo();
+  auto shape = embeddings_info->GetShape();
+  // Rows are written below as raw float32 at [token, :]; any other layout would corrupt memory.
+  if (shape.size() != 3 || embeddings_info->GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+    throw std::runtime_error("Vision embedding injection: embeddings output must be a rank-3 float32 tensor");
+  }
+  float* embeddings_data = embeddings_ortvalue->GetTensorMutableData<float>();
+  auto vision_shape = image_features_value_->GetTensorTypeAndShapeInfo()->GetShape();
+  const float* vision_data = image_features_value_->GetTensorData<float>();
+  const int64_t embedding_dim = shape[2];
+  const int64_t vision_dim = vision_shape[1];
+  const auto num_vision_tokens = static_cast<size_t>(vision_shape[0]);
+  if (vision_dim != embedding_dim) {
+    throw std::runtime_error("Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) +
+                             ", embedding_dim=" + std::to_string(embedding_dim));
+  }
+  if (!input_ids_ || !input_ids_->Get()) throw std::runtime_error("Vision embedding injection: input_ids not available");
+  // input_ids may be int32 or int64; reading an int64 tensor through an int32 pointer would misalign every token.
+  OrtValue* input_ids_ortvalue = input_ids_->Get();
+  auto input_ids_info = input_ids_ortvalue->GetTensorTypeAndShapeInfo();
+  const auto ids_type = input_ids_info->GetElementType();
+  const bool ids_are_int64 = (ids_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64);
+  if (!ids_are_int64 && ids_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) throw std::runtime_error("Vision embedding injection: input_ids must be int32 or int64");
+  const int32_t* ids32 = ids_are_int64 ? nullptr : input_ids_ortvalue->GetTensorData<int32_t>();
+  const int64_t* ids64 = ids_are_int64 ? input_ids_ortvalue->GetTensorData<int64_t>() : nullptr;
+  int64_t total_tokens = 1;
+  for (auto dim : input_ids_info->GetShape()) total_tokens *= dim;
+  total_tokens = std::min<int64_t>(total_tokens, shape[0] * shape[1]);  // never write past the embeddings tensor
+
+  constexpr int64_t image_token_id = 151655;  // hard-coded image placeholder token id
+  for (int64_t i = 0; i < total_tokens && image_embed_consumed_ < num_vision_tokens; ++i) {
+    if ((ids_are_int64 ? ids64[i] : ids32[i]) != image_token_id) continue;
+    std::memcpy(embeddings_data + (i * embedding_dim), vision_data + (image_embed_consumed_ * vision_dim),
+                vision_dim * sizeof(float));
+    image_embed_consumed_++;
+  }
+  if (image_embed_consumed_ != num_vision_tokens) {  // placeholders and vision features disagree
+    Log("warning", "Vision embedding mismatch: consumed " + std::to_string(image_embed_consumed_) +
+                       " of " + std::to_string(num_vision_tokens) + " available vision tokens. " +
+                       "This may indicate a mismatch between the number of image placeholders in the prompt " +
+                       "and the number of images provided.");
+  }
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen_vl_model.h b/src/models/qwen_vl_model.h
new file mode 100644
index 0000000000..cc8dea7bb7
--- /dev/null
+++ b/src/models/qwen_vl_model.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "decoder_only_pipeline.h"
+#include "qwen_vl_vision.h"
+
+namespace Generators {
+
+// Qwen2.5-VL pipeline model integrating vision pipeline + decoder pipeline.
+// Loads decoder pipeline sessions (handled by base) and constructs vision pipeline sessions.
+// The state runs vision once (in SetExtraInputs, which then requires pixel_values) to produce image features;
+// Qwen2_5_VL_PipelineState::OnStageComplete copies them over the image-token rows of the stage-0 embeddings output.
+struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel {
+  Qwen2_5_VL_PipelineModel(std::unique_ptr<Config> config, OrtEnv& ort_env);
+  // Returns a Qwen2_5_VL_PipelineState bound to this model.
+  std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths,
+                                     const GeneratorParams& params) const override;
+
+  // Vision pipeline shared across states (sessions reused); left null unless the patch_embed, vision_attn and patch_merger stages are all configured.
+  std::unique_ptr<QwenVisionPipeline> vision_pipeline_;
+};
+
+struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState {  // Decoder pipeline state that also runs the vision pipeline.
+  Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model,
+                           DeviceSpan<int32_t> sequence_lengths,
+                           const GeneratorParams& params);
+  // Forwards to the base class, then runs the vision pipeline once on pixel_values (throws std::runtime_error if they are missing).
+  void SetExtraInputs(const std::vector<ExtraInput>& extra_inputs) override;
+
+ protected:
+  void OnStageComplete(size_t stage_id, DeviceSpan<int32_t>& next_tokens) override;  // injects vision features after stage 0 once vision has run
+
+ private:
+  void InjectVisionEmbeddings(const std::string& embeddings_output_name,
+                              DeviceSpan<int32_t>& input_token_ids);  // input_token_ids is currently unused; token ids are read from input_ids_
+
+  const Qwen2_5_VL_PipelineModel& vl_model_;  // model that owns the shared vision pipeline; must outlive this state
+  bool vision_ran_{false};                    // set once SetExtraInputs has produced image features
+  std::unique_ptr<OrtValue> image_features_value_;  // rank-2 float tensor created over image_features_buffer_
+  std::vector<float> image_features_buffer_;  // backing storage for OrtValue
+  size_t image_embed_consumed_{0};            // Track how many vision embeddings we've injected
+};
+
+}  // namespace Generators
diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp
new file mode 100644
index 0000000000..ca267f00e5
--- /dev/null
+++ b/src/models/qwen_vl_vision.cpp
@@ -0,0 +1,295 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+// Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage.
+
+#include "qwen_vl_vision.h"
+#include "../generators.h"
+
+#include <fstream>
+#include <stdexcept>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+
+namespace Generators {
+
+// Creates the three ONNX Runtime sessions of the vision pipeline. patch_embed and patch_merger are created
+// without session options; vision_attn is too unless use_qnn_attn is set, in which case QNN is attached to it.
+// Throws std::runtime_error when QNN is requested but is neither an available provider nor a registered EP device.
+QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env,
+                                       const std::string& patch_embed_model,
+                                       const std::string& vision_attn_model,
+                                       const std::string& patch_merger_model, int64_t spatial_merge_size,
+                                       bool use_qnn_attn, const std::string& qnn_backend_path,
+                                       int64_t patch_size, int64_t window_size)
+    // Match declaration order to avoid MSVC C5038 warning-as-error
+    : use_qnn_attn_(use_qnn_attn),
+      qnn_backend_path_(qnn_backend_path),
+      spatial_merge_size_(spatial_merge_size),
+      patch_size_(patch_size),
+      window_size_(window_size),
+      env_(env) {
+  // ORT takes ORTCHAR_T paths (char or wchar_t): convert each narrow path character by character.
+  auto widen_path = [](const std::string& narrow) {
+    return std::basic_string<ORTCHAR_T>(narrow.begin(), narrow.end());
+  };
+  const auto patch_embed_path = widen_path(patch_embed_model);
+  const auto vision_attn_path = widen_path(vision_attn_model);
+  const auto patch_merger_path = widen_path(patch_merger_model);
+
+  // Patch embed and patch merger sessions (CPU for now)
+  patch_embed_session_ = OrtSession::Create(env_, patch_embed_path.c_str(), nullptr);
+  patch_merger_session_ = OrtSession::Create(env_, patch_merger_path.c_str(), nullptr);
+
+  // Without QNN the attention stage is created exactly like the other two stages.
+  if (!use_qnn_attn_) {
+    vision_attn_session_ = OrtSession::Create(env_, vision_attn_path.c_str(), nullptr);
+    return;
+  }
+
+  // QNN path: dedicated session options for the attention stage.
+  auto session_options = OrtSessionOptions::Create();
+  session_options->SetIntraOpNumThreads(2).SetInterOpNumThreads(1);
+
+  auto providers = Ort::GetAvailableProviders();
+  const bool qnn_is_available =
+      std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end();
+  if (qnn_is_available) {
+    // QNN is listed among the available providers: append it with parallel key/value option arrays.
+    const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"};
+    const char* values[] = {qnn_backend_path_.c_str(), "burst", "3", "60"};
+    session_options->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4);
+  } else {
+    // Use registered QNN EP - use GenAI wrapper APIs
+    auto ep_devices = GetOrtEnv().GetEpDevices();
+    std::vector<const OrtEpDevice*> qnn_devices;
+    qnn_devices.reserve(ep_devices.size());
+    for (const auto* device : ep_devices) {
+      if (device->Name() == "QNNExecutionProvider") qnn_devices.push_back(device);
+    }
+
+    if (qnn_devices.empty()) {
+      throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered.");
+    }
+
+    // Same QNN provider options as in the branch above, in the map form taken by the V2 call.
+    std::unordered_map<std::string, std::string> qnn_options = {
+        {"backend_path", qnn_backend_path_},
+        {"htp_performance_mode", "burst"},
+        {"htp_graph_finalization_optimization_mode", "3"},
+        {"soc_model", "60"}};
+    session_options->AppendExecutionProvider_V2(GetOrtEnv(), qnn_devices, qnn_options);
+  }
+
+  vision_attn_session_ = OrtSession::Create(env_, vision_attn_path.c_str(), session_options.get());
+}
+
+std::unique_ptr<OrtValue> QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const {  // CPU float tensor over `count` floats at `data`
+  std::span<const int64_t> dims(shape.data(), shape.size());
+  std::span<float> values(const_cast<float*>(data), count);  // CreateTensor is given a mutable span, hence the cast
+  auto cpu_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
+  return OrtValue::CreateTensor<float>(*cpu_info, values, dims);
+}
+
+// Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store).
+
+// Runs patch_embed -> optional window reorder -> vision_attn -> patch_merger -> optional reverse reorder.
+//   pixel_data, pixel_shape: float32 pixel values and their shape; pixel_shape[1] is read as the patch count.
+//   grid_thw: optional [t, h, w]; window reordering is applied only when it holds exactly three values.
+// Returns a copy of the merged embeddings, row-major [num_image_tokens, 3584]; see GetLastOutputShape().
+// Throws std::runtime_error for missing sessions, unusable pixel input or an inconsistent window configuration.
+// NOTE(review): hidden sizes 1280 and 3584 are hard-coded - presumably tied to one model export; confirm.
+std::vector<float> QwenVisionPipeline::Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape,
+                                           const std::vector<int64_t>& grid_thw) {
+  if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) {
+    throw std::runtime_error("Vision pipeline sessions not initialized");
+  }
+  // pixel_shape[1] is read below, so reject inputs that lack it or carry no patches.
+  if (pixel_data == nullptr || pixel_shape.size() < 2 || pixel_shape[1] <= 0) {
+    throw std::runtime_error("Vision pipeline: pixel_values must be non-null with a positive patch dimension");
+  }
+
+  // Window indices describe one image only. Reset them on every call so that a run without grid_thw
+  // cannot silently reuse the ordering computed for a previous image.
+  wnd_idx_.clear();
+  rev_idx_.clear();
+  if (grid_thw.size() == 3) {
+    wnd_idx_ = CalculateWindowIndex(grid_thw[0], grid_thw[1], grid_thw[2]);
+    // Build reverse index (argsort)
+    rev_idx_.resize(wnd_idx_.size());
+    std::vector<std::pair<int64_t, size_t>> pairs;
+    pairs.reserve(wnd_idx_.size());
+    for (size_t i = 0; i < wnd_idx_.size(); ++i) pairs.emplace_back(wnd_idx_[i], i);
+    std::sort(pairs.begin(), pairs.end(), [](const auto& a, const auto& b) { return a.first < b.first; });
+    for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast<int64_t>(pairs[i].second);
+  }
+
+  // Stage 1: patch embedding; the session output is bound to the pre-sized pe_out_buf_.
+  size_t pixel_count = 1;
+  for (auto d : pixel_shape) pixel_count *= static_cast<size_t>(d);
+  auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape);
+  auto pe_in_name = patch_embed_session_->GetInputName(0);
+  const char* pe_input_names[] = {pe_in_name.c_str()};
+  OrtValue* pe_inputs[] = {pixel_tensor.get()};
+  const int64_t num_patches = pixel_shape[1];
+  const int64_t hidden_dim = 1280;
+  std::vector<int64_t> pe_out_shape{num_patches, hidden_dim};
+  pe_out_buf_.resize(num_patches * hidden_dim);
+  auto pe_out_tensor = CreateTensor(pe_out_buf_.data(), pe_out_buf_.size(), pe_out_shape);
+  auto pe_out_name = patch_embed_session_->GetOutputName(0);
+  const char* pe_output_names[] = {pe_out_name.c_str()};
+  OrtValue* pe_outputs[] = {pe_out_tensor.get()};
+  patch_embed_session_->Run(nullptr, pe_input_names, pe_inputs, 1, pe_output_names, pe_outputs, 1);
+
+  const int64_t seq_len = num_patches;
+  const int64_t window_area = spatial_merge_size_ * spatial_merge_size_;
+  const int64_t num_windows = seq_len / window_area;
+
+  // Apply window reordering if indices available
+  reordered_buf_.resize(seq_len * hidden_dim);
+  if (!wnd_idx_.empty()) {
+    // Validate window configuration
+    if (seq_len % window_area != 0 || static_cast<int64_t>(wnd_idx_.size()) != num_windows) {
+      throw std::runtime_error("Invalid window configuration for vision pipeline");
+    }
+    // Apply window reordering
+    for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+      int64_t src_w = wnd_idx_[dst_w];
+      if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range");
+      size_t offset_size = window_area * hidden_dim;
+      std::memcpy(reordered_buf_.data() + dst_w * offset_size, pe_out_buf_.data() + src_w * offset_size,
+                  offset_size * sizeof(float));
+    }
+  } else {
+    // No window reordering - use sequential order
+    std::memcpy(reordered_buf_.data(), pe_out_buf_.data(), seq_len * hidden_dim * sizeof(float));
+  }
+
+  // Check if vision_attn session expects a different sequence length (fixed shape model)
+  auto attn_input_info = vision_attn_session_->GetInputTypeInfo(0);
+  auto& attn_input_tensor_info = attn_input_info->GetTensorTypeAndShapeInfo();
+  auto attn_expected_shape = attn_input_tensor_info.GetShape();
+  int64_t expected_seq_len = (attn_expected_shape.size() >= 2 && attn_expected_shape[0] > 0) ? attn_expected_shape[0] : seq_len;
+  int64_t actual_seq_len = seq_len;  // Mutable copy for padding adjustments
+  if (expected_seq_len != seq_len) {
+    // Model expects fixed sequence length - need to pad or error
+    if (expected_seq_len > seq_len) {
+      // Pad the reordered buffer with zeros to match model's expected size
+      reordered_buf_.resize(expected_seq_len * hidden_dim, 0.0f);
+      actual_seq_len = expected_seq_len;  // Update actual_seq_len for subsequent operations
+    } else {
+      // Model expects smaller input - this is an error (image too large for fixed-shape model)
+      throw std::runtime_error("Vision attention model input size mismatch");
+    }
+  }
+
+  // Stage 2: vision attention over the (possibly padded) reordered patches.
+  std::vector<int64_t> attn_shape{actual_seq_len, hidden_dim};
+  auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape);
+  auto attn_in_name = vision_attn_session_->GetInputName(0);
+  const char* attn_input_names[] = {attn_in_name.c_str()};
+  OrtValue* attn_inputs[] = {attn_in_tensor.get()};
+  attn_out_buf_.resize(actual_seq_len * hidden_dim);
+  auto attn_out_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
+  auto attn_out_name = vision_attn_session_->GetOutputName(0);
+  const char* attn_output_names[] = {attn_out_name.c_str()};
+  OrtValue* attn_outputs[] = {attn_out_tensor.get()};
+  vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1);
+
+  // Stage 3: patch merger, which turns every window_area patches into one output token.
+  auto merger_in_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape);
+  auto merger_in_name = patch_merger_session_->GetInputName(0);
+  const char* merger_input_names[] = {merger_in_name.c_str()};
+  OrtValue* merger_inputs[] = {merger_in_tensor.get()};
+  const int64_t merged_seq_len = actual_seq_len / window_area;  // One token per window after merging
+  const int64_t merged_hidden = 3584;
+  std::vector<int64_t> merger_shape{merged_seq_len, merged_hidden};
+  merger_out_buf_.resize(merged_seq_len * merged_hidden);
+  auto merger_out_tensor = CreateTensor(merger_out_buf_.data(), merger_out_buf_.size(), merger_shape);
+  auto merger_out_name = patch_merger_session_->GetOutputName(0);
+  const char* merger_output_names[] = {merger_out_name.c_str()};
+  OrtValue* merger_outputs[] = {merger_out_tensor.get()};
+  patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1);
+
+  // Only the first num_windows merged rows come from real patches; rows produced from zero padding
+  // (fixed-shape attention model) are dropped instead of being returned as image tokens.
+  final_embeddings_buf_.resize(static_cast<size_t>(num_windows * merged_hidden));
+  if (!rev_idx_.empty()) {
+    // Apply reverse reordering
+    if (static_cast<int64_t>(rev_idx_.size()) != num_windows) {
+      throw std::runtime_error("Vision pipeline reverse index size mismatch");
+    }
+    for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) {
+      std::memcpy(final_embeddings_buf_.data() + dst_w * merged_hidden,
+                  merger_out_buf_.data() + rev_idx_[dst_w] * merged_hidden, merged_hidden * sizeof(float));
+    }
+  } else {
+    // No reverse reordering - use sequential order
+    std::memcpy(final_embeddings_buf_.data(), merger_out_buf_.data(), final_embeddings_buf_.size() * sizeof(float));
+  }
+  last_seq_len_ = num_windows;
+  last_hidden_size_ = merged_hidden;
+  return final_embeddings_buf_;
+}
+
+// Calculate window indices dynamically based on grid dimensions
+// Matches HuggingFace transformers implementation:
+// https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L367
+//   grid_t, grid_h, grid_w: temporal / height / width extents of the patch grid before spatial merging.
+// Returns the merged-grid token indices listed window by window: frames in order, windows row-major within
+// a frame, tokens row-major within a window. Positions that exist only because the grid is rounded up to
+// whole windows are skipped, so every real token index appears exactly once. An empty grid gives an empty result.
+// Throws std::runtime_error when spatial_merge_size_, patch_size_ or the derived window size is not positive.
+std::vector<int64_t> QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w) {
+  // Both values are used as divisors below.
+  if (spatial_merge_size_ <= 0 || patch_size_ <= 0) {
+    throw std::runtime_error("Vision pipeline: spatial_merge_size and patch_size must be positive");
+  }
+
+  // Calculate LLM grid dimensions after spatial merging
+  const int64_t llm_grid_h = grid_h / spatial_merge_size_;
+  const int64_t llm_grid_w = grid_w / spatial_merge_size_;
+
+  // Calculate window size at the merged resolution
+  const int64_t vit_merger_window_size = window_size_ / spatial_merge_size_ / patch_size_;
+  if (vit_merger_window_size <= 0) {
+    // The window arithmetic below divides by this value; zero would be undefined behaviour.
+    throw std::runtime_error("Vision pipeline: window_size is too small for spatial_merge_size * patch_size");
+  }
+
+  std::vector<int64_t> window_index;
+  // An empty (or negative) grid has no tokens to index.
+  if (grid_t <= 0 || llm_grid_h <= 0 || llm_grid_w <= 0) return window_index;
+  window_index.reserve(static_cast<size_t>(grid_t * llm_grid_h * llm_grid_w));
+
+  // Number of windows per axis, rounding the grid up to whole windows
+  const int64_t num_windows_h = (llm_grid_h + vit_merger_window_size - 1) / vit_merger_window_size;
+  const int64_t num_windows_w = (llm_grid_w + vit_merger_window_size - 1) / vit_merger_window_size;
+
+  // Tokens in one temporal frame of the merged grid
+  const int64_t tokens_per_frame = llm_grid_h * llm_grid_w;
+
+  // Walk (frame, window row, window column, row in window, column in window). This is the order produced by
+  // reshaping to (grid_t, num_windows_h, window_size, num_windows_w, window_size) and permuting to
+  // (grid_t, num_windows_h, num_windows_w, window_size, window_size), without materializing a padded grid.
+  for (int64_t t = 0; t < grid_t; ++t) {
+    for (int64_t wh = 0; wh < num_windows_h; ++wh) {
+      for (int64_t ww = 0; ww < num_windows_w; ++ww) {
+        for (int64_t ph = 0; ph < vit_merger_window_size; ++ph) {
+          const int64_t h = wh * vit_merger_window_size + ph;
+          if (h >= llm_grid_h) break;  // remaining rows of this window are padding
+          for (int64_t pw = 0; pw < vit_merger_window_size; ++pw) {
+            const int64_t w = ww * vit_merger_window_size + pw;
+            if (w >= llm_grid_w) break;  // remaining columns of this row are padding
+            window_index.push_back(t * tokens_per_frame + h * llm_grid_w + w);
+          }
+        }
+      }
+    }
+  }
+
+  return window_index;
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h
new file mode 100644
index 0000000000..5db7c17e95
--- /dev/null
+++ b/src/models/qwen_vl_vision.h
@@ -0,0 +1,75 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cstdint>
+
+#include "onnxruntime_api.h"
+
+namespace Generators {
+
+// Internal vision pipeline (no external DLL interface required after Python binding removal).
+struct QwenVisionPipeline {  // Chains three ONNX Runtime sessions: patch embed -> vision attention -> patch merger.
+  QwenVisionPipeline(OrtEnv& env,
+                     const std::string& patch_embed_model,
+                     const std::string& vision_attn_model,
+                     const std::string& patch_merger_model,
+                     int64_t spatial_merge_size,
+                     bool use_qnn_attn = false,
+                     const std::string& qnn_backend_path = "QnnHtp.dll",
+                     int64_t patch_size = 14,
+                     int64_t window_size = 56);
+  bool use_qnn_attn_{};             // when true, the vision attention session is created with the QNN execution provider
+  std::string qnn_backend_path_{};  // forwarded to QNN as its "backend_path" option
+  // Not copyable: copy construction and copy assignment are deleted.
+  QwenVisionPipeline(const QwenVisionPipeline&) = delete;
+  QwenVisionPipeline& operator=(const QwenVisionPipeline&) = delete;
+
+  // Run vision pipeline.
+  // pixel_data / pixel_shape: float32 pixel values and their shape; Run() reads pixel_shape[1] as the number of patches.
+  // grid_thw: optional grid dimensions [temporal, height, width] for dynamic window indexing
+  // The ONNX model is assumed to accept the provided shape directly as 'pixel_values'.
+  // Returns final merged embeddings (shape: [num_image_tokens, hidden_size]).
+  std::vector<float> Run(const float* pixel_data, const std::vector<int64_t>& pixel_shape,
+                         const std::vector<int64_t>& grid_thw = {});
+  // NOTE(review): Run() writes the member buffers and window indices below, so concurrent calls look unsafe - confirm.
+  // Shape info from last Run (seq_len, hidden_size). Returns empty vector if Run not called yet.
+  std::vector<int64_t> GetLastOutputShape() const {
+    if (last_seq_len_ <= 0 || last_hidden_size_ <= 0) return {};
+    return {last_seq_len_, last_hidden_size_};
+  }
+
+ private:
+  // Internal helpers
+  std::unique_ptr<OrtValue> CreateTensor(const float* data, size_t count, const std::vector<int64_t>& shape) const;
+  // (CreateTensor builds a CPU float tensor of `shape` over the `count` floats at `data`.)
+  // Calculate window indices dynamically based on grid dimensions
+  // Returns window_index (reordering indices for windowing)
+  std::vector<int64_t> CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w);
+
+  std::unique_ptr<OrtSession> patch_embed_session_;
+  std::unique_ptr<OrtSession> vision_attn_session_;
+  std::unique_ptr<OrtSession> patch_merger_session_;
+
+  std::vector<int64_t> wnd_idx_;  // window reordering indices (computed dynamically)
+  std::vector<int64_t> rev_idx_;  // reverse ordering indices (argsort of wnd_idx)
+  int64_t spatial_merge_size_{};  // Run() merges spatial_merge_size_^2 patches into one output token
+  int64_t patch_size_{14};   // Vision patch size (typically 14)
+  int64_t window_size_{56};  // Window size for attention (typically 56)
+  OrtEnv& env_;
+  int64_t last_seq_len_{0};      // row count of the last Run() result (0 before the first run)
+  int64_t last_hidden_size_{0};  // row width of the last Run() result (0 before the first run)
+  // NOTE: the constructor's initializer list follows this declaration order (MSVC C5038); reorder both together.
+  // Reusable buffers to avoid repeated allocation/deallocation
+  mutable std::vector<float> pe_out_buf_;
+  mutable std::vector<float> reordered_buf_;
+  mutable std::vector<float> attn_out_buf_;
+  mutable std::vector<float> merger_out_buf_;
+  mutable std::vector<float> final_embeddings_buf_;
+};
+
+}  // namespace Generators