From 2dbe74192790e8f5f30ca7fcc4c6182504f3c786 Mon Sep 17 00:00:00 2001
From: apsonawane <asonawane@microsoft.com>
Date: Thu, 13 Nov 2025 02:25:43 +0000
Subject: [PATCH 1/7] get_rope_index changes for qwen model

---
 src/models/model_type.h        |   7 +-
 src/models/multi_modal.cpp     |   6 +-
 src/models/multi_modal.h       |   2 +-
 src/models/position_inputs.cpp | 226 +++++++++++++++++++++++++++++++++
 src/models/position_inputs.h   |  49 +++++++
 5 files changed, 285 insertions(+), 5 deletions(-)
diff --git a/src/models/model_type.h b/src/models/model_type.h
index c7c4d2f691..77d8cba665 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,10 +18,15 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 2> VLM = {"gemma3", "phi3v"};
+    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 
+  inline static bool IsQwen2VL(const std::string& model_type) {
+    // Qwen2-VL specific check for 3D position IDs
+    return model_type == "qwen2vl";
+  }
+
   inline static bool IsALM(const std::string& model_type) {
     // Audio-language model (ALM)
     static constexpr std::array<std::string_view, 1> ALM = {"whisper"};
diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp
index 56e55b1552..48ad8d5fcf 100644
--- a/src/models/multi_modal.cpp
+++ b/src/models/multi_modal.cpp
@@ -181,9 +181,9 @@ DeviceSpan<float> EmbeddingState::Run(int current_length, DeviceSpan<int32_t>& n
 DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params)
     : State{params, model},
       model_{model},
-      position_inputs_{model, *this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask} {
+      position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {  // Factory selects the PositionInputs implementation from the model config
   inputs_embeds_.Add();
-  position_inputs_.Add();
+  position_inputs_->Add();  // position_inputs_ is now a std::unique_ptr<PositionInputs>
   logits_.Add();
   kv_cache_.Add();
 }
@@ -201,7 +201,7 @@ DeviceSpan<float> DecoderState::Run(int current_length, DeviceSpan<int32_t>& nex
 void DecoderState::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int total_length, DeviceSpan<int32_t> beam_indices) {
   int batch_size = static_cast<int>(inputs_embeds_.GetShape()[0]);
   size_t new_length = next_tokens.size() / batch_size;
-  position_inputs_.Update(next_tokens, total_length, static_cast<int>(new_length));
+  position_inputs_->Update(next_tokens, total_length, static_cast<int>(new_length));
   kv_cache_.Update(beam_indices, total_length);
   logits_.Update(next_tokens, new_length);
   inputs_embeds_.UpdateSequenceLength(new_length);
diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h
index 206fc3850b..0cbe4e527b 100644
--- a/src/models/multi_modal.h
+++ b/src/models/multi_modal.h
@@ -105,7 +105,7 @@ struct DecoderState : State {
   const MultiModalLanguageModel& model_;
   Embeddings inputs_embeds_{*this, Embeddings::Mode::Input,  // Model input
                             model_.config_->model.decoder.inputs.embeddings};
-  DefaultPositionInputs position_inputs_;  // Model input
+  std::unique_ptr<PositionInputs> position_inputs_;  // Model input
   DefaultKeyValueCache kv_cache_{*this};   // Model input
   Logits logits_{*this};                   // Model output
 };
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index d87a8c6b64..daccf9932d 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -1,6 +1,7 @@
 #include "../generators.h"
 #include "model.h"
 #include "position_inputs.h"
+#include "model_type.h"
 
 namespace Generators {
 
@@ -477,7 +478,232 @@ void WindowedPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_l
   window_index_++;
 }
 
+// Qwen2VLPositionInputs: position_ids / attention_mask decoder inputs for Qwen2-VL style models.
+Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan<int32_t> sequence_lengths_unk)
+    : model_{model},
+      state_{state} {  // NOTE(review): sequence_lengths_unk is not read anywhere in this constructor
+  has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask);
+  has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids);
+
+  type_ = Ort::TypeToTensorType<int32_t>;
+  if (has_mask_input_) {
+    type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask);
+  }
+  if (has_posid_input_) {
+    if (has_mask_input_) {
+      if (model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids) != type_) {
+        throw std::runtime_error("position_ids & attention_mask must have the same data type");
+      }
+    }
+    // Set up 3D position IDs shape: [4, batch_size, sequence_length]
+    position_ids_shape_[0] = 4;  // 4 dimensions: text + 3D vision (temporal, height, width)
+    position_ids_shape_[1] = state_.params_->search.batch_size;
+    position_ids_shape_[2] = 0;  // Will be set during first update
+
+    position_ids_ = std::make_unique<Tensor>();
+    position_ids_next_ = std::make_unique<Tensor>();
+  }
+  if (has_mask_input_) {
+    attention_mask_shape_[0] = state_.params_->search.batch_size;
+    attention_mask_shape_[1] = 0;  // Will be set during first update
+    attention_mask_ = std::make_unique<Tensor>();
+    attention_mask_next_ = std::make_unique<Tensor>();
+  }
+}
+
+void Qwen2VLPositionInputs::Add() {  // Registers only the inputs the decoder session declares
+  if (has_posid_input_) {  // Flag set in the constructor from session_info_.HasInput(position_ids)
+    AddPositionIDs();
+  }
+  if (has_mask_input_) {  // Flag set in the constructor from session_info_.HasInput(attention_mask)
+    AddAttentionMask();
+  }
+}
+
+void Qwen2VLPositionInputs::AddPositionIDs() {
+  posid_input_index_ = state_.inputs_.size();  // Remember the slot so later updates can re-point it at a new tensor
+  state_.inputs_.push_back(position_ids_->GetOrtTensor());
+  state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str());  // Name pushed in step with the tensor above
+}
+
+void Qwen2VLPositionInputs::AddAttentionMask() {
+  mask_input_index_ = state_.inputs_.size();  // Remember the slot so later updates can re-point it at a new tensor
+  state_.inputs_.push_back(attention_mask_->GetOrtTensor());
+  state_.input_names_.push_back(model_.config_->model.decoder.inputs.attention_mask.c_str());  // Name pushed in step with the tensor above
+}
+
+template <typename T>
+void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape) {
+  // For Qwen2-VL, in the prefill stage, position_ids are [4, batch_size, seq_len]
+  // During generation, they remain [4, batch_size, 1]
+  // The 4 dimensions are: [text_positions, temporal_positions, height_positions, width_positions]
+  
+  auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
+  auto* position_data = position_ids->GetTensorMutableData<T>();
+  
+  auto position_ids_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array<int64_t, 3>{shape[0], shape[1], 1}, type_);
+  auto* position_data_next = position_ids_next->GetTensorMutableData<T>();
+
+  // Initialize position IDs
+  // For text-only content (no vision), all 4 dimensions have the same position values
+  // This matches the behavior in transformers where text positions are replicated across dimensions
+  if (shape[1] == 1) {
+    // Single batch, simple case
+    for (int64_t dim = 0; dim < 4; ++dim) {
+      for (int64_t i = 0; i < shape[2]; ++i) {
+        position_data[dim * shape[1] * shape[2] + i] = static_cast<T>(i);
+      }
+    }
+    // Initialize next tensor with the last position + 1
+    for (int64_t dim = 0; dim < 4; ++dim) {
+      position_data_next[dim * shape[1] + 0] = static_cast<T>(shape[2]);
+    }
+  } else {
+    // Multiple batches - initialize with simple ascending values
+    // In practice, vision-specific positions would be computed by the model's get_rope_index logic
+    for (int64_t dim = 0; dim < 4; ++dim) {
+      for (int64_t batch = 0; batch < shape[1]; ++batch) {
+        for (int64_t pos = 0; pos < shape[2]; ++pos) {
+          position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
+        }
+        position_data_next[dim * shape[1] + batch] = static_cast<T>(shape[2]);
+      }
+    }
+  }
+  // NOTE(review): next_tokens is not read in this function; every batch row gets plain ascending positions.
+  // Move tensors to appropriate device and expand by num_beams
+  position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams);
+  position_ids_next_->ort_tensor_ = model_.ExpandInputs(position_ids_next, state_.params_->search.num_beams);
+  if (state_.params_->use_graph_capture)
+    position_ids_next_->MakeStatic();
+  position_ids_shape_[1] *= state_.params_->search.num_beams;  // Cached batch dimension now counts beams as well
+  state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();  // Re-point the slot registered in AddPositionIDs()
+}
+
+template <typename T>
+void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape) {
+  // Builds the prefill mask [shape[0], shape[1]] and a pre-sized [shape[0], shape[1] + 1] mask for the following step
+  auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
+  auto* mask_data = attention_mask->GetTensorMutableData<T>();
+  
+  auto attention_mask_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array<int64_t, 2>{shape[0], shape[1] + 1}, type_);
+  auto* mask_data_next = attention_mask_next->GetTensorMutableData<T>();
+
+  // Every element of both masks is set to 1. NOTE(review): next_tokens is not inspected, so padded prompts are not masked out
+  std::fill_n(mask_data, shape[0] * shape[1], static_cast<T>(1));
+  std::fill_n(mask_data_next, shape[0] * (shape[1] + 1), static_cast<T>(1));
+
+  // Move tensors to device and expand by num_beams
+  attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams);
+  attention_mask_next_->ort_tensor_ = model_.ExpandInputs(attention_mask_next, state_.params_->search.num_beams);
+  if (state_.params_->use_graph_capture)
+    attention_mask_next_->MakeStatic();
+  attention_mask_shape_[0] *= state_.params_->search.num_beams;  // Cached batch dimension now counts beams as well
+  state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();  // Re-point the slot registered in AddAttentionMask()
+}
+
+void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) {
+  // Reuse the tensor pre-built at prefill when the cached shape is [*, >1, 1]; otherwise (re)create from position_ids_shape_
+  if (position_ids_next_ && position_ids_shape_[1] > 1 && position_ids_shape_[2] == 1) {
+    position_ids_ = std::move(position_ids_next_);
+    position_ids_next_ = nullptr;  // Already null after the unique_ptr move; kept explicit
+  } else {  // NOTE(review): position_ids_shape_[2] is not set to new_length anywhere in this function - confirm
+    position_ids_->CreateTensor(position_ids_shape_, state_.params_->use_graph_capture && position_ids_shape_[2] == 1);
+  }
+
+  // Update position values for generation phase
+  // During generation, we increment all 4 dimensions uniformly for text generation
+  if (type_ == Ort::TypeToTensorType<int32_t>) {
+    auto* data = position_ids_->GetTensorMutableData<int32_t>();
+    for (int64_t dim = 0; dim < 4; ++dim) {
+      for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
+        for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
+          data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
+            static_cast<int32_t>(total_length - new_length + pos);
+        }
+      }
+    }
+  } else {
+    auto* data = position_ids_->GetTensorMutableData<int64_t>();
+    for (int64_t dim = 0; dim < 4; ++dim) {
+      for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
+        for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
+          data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
+            static_cast<int64_t>(total_length - new_length + pos);
+        }
+      }
+    }
+  }
+
+  state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();  // Re-point the slot registered in AddPositionIDs()
+}
+
+void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) {  // new_length is not read in this function
+  if (attention_mask_next_ && attention_mask_shape_[1] == total_length - 1) {  // Matches the [batch, seq_len + 1] mask pre-built at prefill
+    attention_mask_ = std::move(attention_mask_next_);
+    attention_mask_next_ = nullptr;  // Already null after the unique_ptr move; kept explicit
+  } else {  // NOTE(review): attention_mask_shape_[1] is not advanced to total_length anywhere in this function - confirm
+    attention_mask_->CreateTensor(attention_mask_shape_, state_.params_->use_graph_capture && attention_mask_shape_[1] == 1);
+  }
+
+  if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) {
+    // Update attention mask - typically all 1s during generation
+    if (type_ == Ort::TypeToTensorType<int32_t>) {
+      auto* mask_data = attention_mask_->GetTensorMutableData<int32_t>();
+      std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int32_t>(1));
+    } else {
+      auto* mask_data = attention_mask_->GetTensorMutableData<int64_t>();
+      std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int64_t>(1));
+    }
+  }
+
+  state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();  // Re-point the slot registered in AddAttentionMask()
+}
+
+void Qwen2VLPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_length, int new_length) {
+  if (has_posid_input_) {
+    if (is_first_update_) {  // First call: size the sequence dimension and build the tensors
+      position_ids_shape_[2] = new_length;
+      if (type_ == Ort::TypeToTensorType<int32_t>)  // NOTE(review): type_ is read from attention_mask (or defaulted), not from position_ids - confirm they agree
+        CreateAndInitialize3DPositionIDs<int32_t>(next_tokens, position_ids_shape_);
+      else
+        CreateAndInitialize3DPositionIDs<int64_t>(next_tokens, position_ids_shape_);
+    } else {  // Later calls refresh the existing tensors
+      Update3DPositionIDs(total_length, new_length);
+    }
+  }
+
+  if (has_mask_input_) {
+    if (is_first_update_) {  // First call: size the sequence dimension and build the masks
+      attention_mask_shape_[1] = new_length;
+      if (type_ == Ort::TypeToTensorType<int32_t>)
+        CreateAndInitializeAttentionMask<int32_t>(next_tokens, attention_mask_shape_);
+      else
+        CreateAndInitializeAttentionMask<int64_t>(next_tokens, attention_mask_shape_);
+    } else {
+      UpdateAttentionMask(total_length, new_length);
+    }
+  }
+  
+  is_first_update_ = false;  // Cleared only after both inputs have seen the first call
+}
+
+void Qwen2VLPositionInputs::RewindTo(size_t index) {
+  // Records index as the sequence length in both cached shapes; no tensor is reallocated or rewritten here
+  if (has_posid_input_) {
+    position_ids_shape_[2] = static_cast<int64_t>(index);
+  }
+  if (has_mask_input_) {
+    attention_mask_shape_[1] = static_cast<int64_t>(index);
+  }
+}
+
 std::unique_ptr<PositionInputs> CreatePositionInputs(State& state, DeviceSpan<int32_t> sequence_lengths, const std::string& attention_mask_name) {
+  // Check for Qwen2-VL model type which requires 3D position IDs
+  if (ModelType::IsQwen2VL(state.model_.config_->model.type)) {
+    return std::make_unique<Qwen2VLPositionInputs>(state.model_, state, sequence_lengths);
+  }
+  
   if (state.model_.config_->model.decoder.sliding_window.has_value() && state.model_.config_->model.decoder.sliding_window->slide_inputs) {
     return std::make_unique<WindowedPositionInputs>(state);
   } else {
diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h
index 5095bf40fc..644f0f3a63 100644
--- a/src/models/position_inputs.h
+++ b/src/models/position_inputs.h
@@ -109,6 +109,55 @@ struct WindowedPositionInputs : PositionInputs {
   size_t window_index_{};
 };
 
+// Qwen2-VL uses 3D rotary position embeddings for multimodal (vision + text) content.
+// position_ids is a rank-3 tensor [rope_dims, batch_size, seq_len]; rope_dims is position_ids_shape_[0],
+// which the constructor assigns. attention_mask is a rank-2 [batch_size, seq_len] tensor.
+// NOTE(review): rope_deltas_ is declared below but the implementation in this patch never reads or writes it;
+// positions are currently plain ascending indices replicated across the leading dimension.
+struct Qwen2VLPositionInputs : PositionInputs {
+  Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan<int32_t> sequence_lengths_unk);
+  Qwen2VLPositionInputs(const Qwen2VLPositionInputs&) = delete;
+  Qwen2VLPositionInputs& operator=(const Qwen2VLPositionInputs&) = delete;
+
+  void Add() override;
+  void Update(DeviceSpan<int32_t> next_tokens, int total_length, int new_length) override;
+  void RewindTo(size_t index) override;
+
+ private:
+  void AddPositionIDs();
+  void AddAttentionMask();
+  
+  template <typename T>
+  void CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape);
+  void Update3DPositionIDs(int total_length, int new_length);
+  
+  template <typename T>
+  void CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape);
+  void UpdateAttentionMask(int total_length, int new_length);
+
+  const Model& model_;
+  State& state_;
+
+  size_t mask_input_index_{~0U};   // Overwritten by AddAttentionMask() with the slot in state_.inputs_
+  size_t posid_input_index_{~0U};  // Overwritten by AddPositionIDs() with the slot in state_.inputs_
+
+  ONNXTensorElementDataType type_;  // Common type for position_ids and attention_mask
+
+  bool has_mask_input_{false};
+  bool has_posid_input_{false};
+
+  std::array<int64_t, 3> position_ids_shape_{};  // {rope_dims, batch_size, sequence_length}; [0] is assigned in the constructor
+  std::unique_ptr<Tensor> position_ids_;
+  std::unique_ptr<Tensor> position_ids_next_;  // Replaces position_ids_ after the first Run() call
+  
+  std::array<int64_t, 2> attention_mask_shape_{};  // {batch_size, sequence_length}
+  std::unique_ptr<Tensor> attention_mask_;
+  std::unique_ptr<Tensor> attention_mask_next_;  // Replaces attention_mask_ after each run
+  
+  std::unique_ptr<Tensor> rope_deltas_;  // NOTE(review): not read or written by the implementation in this patch
+  bool is_first_update_{true};
+};
+
 std::unique_ptr<PositionInputs> CreatePositionInputs(State& state, DeviceSpan<int32_t> sequence_lengths, const std::string& attention_mask_name);
 
 }  // namespace Generators

From be5efbb98f8a10e6b255dfc9cd7aaca1fa2fcdeb Mon Sep 17 00:00:00 2001
From: apsonawane <asonawane@microsoft.com>
Date: Thu, 13 Nov 2025 23:46:01 +0000
Subject: [PATCH 2/7] Add text only support

---
 src/config.cpp                      |   2 +
 src/config.h                        |   2 +
 src/models/model.cpp                |   4 +-
 src/models/model.h                  |   1 +
 src/models/model_type.h             |   4 +-
 src/models/position_inputs.cpp      |  78 +++++++-----
 src/models/qwen_image_processor.cpp | 189 ++++++++++++++++++++++++++++
 src/models/qwen_image_processor.h   |  20 +++
 8 files changed, 263 insertions(+), 37 deletions(-)
 create mode 100644 src/models/qwen_image_processor.cpp
 create mode 100644 src/models/qwen_image_processor.h

diff --git a/src/config.cpp b/src/config.cpp
index e17e9c21f2..9a9d48ab9b 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -615,6 +615,8 @@ struct VisionInputs_Element : JSON::Element {
       v_.pixel_values = JSON::Get<std::string_view>(value);
     } else if (name == "image_sizes") {
       v_.image_sizes = JSON::Get<std::string_view>(value);
+    } else if (name == "image_grid_thw") {
+      v_.image_grid_thw = JSON::Get<std::string_view>(value);
     } else if (name == "attention_mask") {
       v_.attention_mask = JSON::Get<std::string_view>(value);
     } else {
diff --git a/src/config.h b/src/config.h
index 507d7c80c1..76b3d4c241 100644
--- a/src/config.h
+++ b/src/config.h
@@ -38,6 +38,7 @@ struct Config {
     // Vision encoder names
     static constexpr std::string_view PixelValuesName = "pixel_values";
     static constexpr std::string_view ImageSizesName = "image_sizes";
+    static constexpr std::string_view ImageGridThwName = "image_grid_thw";
     static constexpr std::string_view ImageAttentionMaskName = "image_attention_mask";
     static constexpr std::string_view ImageFeaturesName = "image_features";
     static constexpr std::string_view NumImageTokens = "num_image_tokens";
@@ -162,6 +163,7 @@ struct Config {
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
+        std::string image_grid_thw{Defaults::ImageGridThwName};
         std::string attention_mask{Defaults::ImageAttentionMaskName};  // image attention mask
       } inputs;
 
diff --git a/src/models/model.cpp b/src/models/model.cpp
index 92206aff3b..97e5e7cc5e 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -1286,7 +1286,9 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
           {"phi3v", Processor::Create<PhiImageProcessor>},
           {"whisper", Processor::Create<WhisperProcessor>},
           {"phi4mm", Processor::Create<PhiMultiModalProcessor>},
-          {"gemma3", Processor::Create<GemmaImageProcessor>}} {
+          {"gemma3", Processor::Create<GemmaImageProcessor>},
+          {"qwen2vl", Processor::Create<QwenImageProcessor>},
+          {"qwen2_5_vl", Processor::Create<QwenImageProcessor>}} {
   auto processor = processor_factory_.find(config.model.type);
   if (processor != processor_factory_.end()) {
     processor_ = processor->second(config, session_info);
diff --git a/src/models/model.h b/src/models/model.h
index 7faa1000fe..0e50059702 100644
--- a/src/models/model.h
+++ b/src/models/model.h
@@ -9,6 +9,7 @@
 #include "whisper_processor.h"
 #include "phi_multimodal_processor.h"
 #include "gemma_image_processor.h"
+#include "qwen_image_processor.h"
 #include "adapters.h"
 #include "extra_outputs.h"
 
diff --git a/src/models/model_type.h b/src/models/model_type.h
index 77d8cba665..0094ab2a11 100644
--- a/src/models/model_type.h
+++ b/src/models/model_type.h
@@ -18,13 +18,13 @@ struct ModelType {
 
   inline static bool IsVLM(const std::string& model_type) {
     // Vision-language model (VLM)
-    static constexpr std::array<std::string_view, 3> VLM = {"gemma3", "phi3v", "qwen2vl"};
+    static constexpr std::array<std::string_view, 4> VLM = {"gemma3", "phi3v", "qwen2vl", "qwen2_5_vl"};
     return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
   }
 
   inline static bool IsQwen2VL(const std::string& model_type) {
     // Qwen2-VL specific check for 3D position IDs
-    return model_type == "qwen2vl";
+    return model_type == "qwen2vl" || model_type == "qwen2_5_vl";
   }
 
   inline static bool IsALM(const std::string& model_type) {
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index daccf9932d..336a51647a 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -485,29 +485,29 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D
   has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask);
   has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids);
 
-  type_ = Ort::TypeToTensorType<int32_t>;
+  type_ = Ort::TypeToTensorType<int64_t>;  // Default to int64 for Qwen2VL
   if (has_mask_input_) {
     type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask);
   }
+  
+  ONNXTensorElementDataType posid_type = type_;
   if (has_posid_input_) {
-    if (has_mask_input_) {
-      if (model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids) != type_) {
-        throw std::runtime_error("position_ids & attention_mask must have the same data type");
-      }
-    }
-    // Set up 3D position IDs shape: [4, batch_size, sequence_length]
-    position_ids_shape_[0] = 4;  // 4 dimensions: text + 3D vision (temporal, height, width)
+    posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids);
+    
+    // Set up 3D position IDs shape: [3, batch_size, sequence_length]
+    // The 3 dimensions represent temporal, height, and width for mrope
+    position_ids_shape_[0] = 3;
     position_ids_shape_[1] = state_.params_->search.batch_size;
     position_ids_shape_[2] = 0;  // Will be set during first update
 
-    position_ids_ = std::make_unique<Tensor>();
-    position_ids_next_ = std::make_unique<Tensor>();
+    position_ids_ = std::make_unique<Tensor>(model_.p_device_inputs_, posid_type);
+    position_ids_next_ = std::make_unique<Tensor>(model_.p_device_inputs_, posid_type);
   }
   if (has_mask_input_) {
     attention_mask_shape_[0] = state_.params_->search.batch_size;
     attention_mask_shape_[1] = 0;  // Will be set during first update
-    attention_mask_ = std::make_unique<Tensor>();
-    attention_mask_next_ = std::make_unique<Tensor>();
+    attention_mask_ = std::make_unique<Tensor>(model_.p_device_inputs_, type_);
+    attention_mask_next_ = std::make_unique<Tensor>(model_.p_device_inputs_, type_);
   }
 }
 
@@ -534,9 +534,9 @@ void Qwen2VLPositionInputs::AddAttentionMask() {
 
 template <typename T>
 void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape) {
-  // For Qwen2-VL, in the prefill stage, position_ids are [4, batch_size, seq_len]
-  // During generation, they remain [4, batch_size, 1]
-  // The 4 dimensions are: [text_positions, temporal_positions, height_positions, width_positions]
+  // For Qwen2-VL, in the prefill stage, position_ids are [3, batch_size, seq_len]
+  // During generation, they remain [3, batch_size, 1]
+  // The 3 dimensions represent: [temporal, height, width] for mrope
   
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
   auto* position_data = position_ids->GetTensorMutableData<T>();
@@ -545,23 +545,33 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t>
   auto* position_data_next = position_ids_next->GetTensorMutableData<T>();
 
   // Initialize position IDs
-  // For text-only content (no vision), all 4 dimensions have the same position values
-  // This matches the behavior in transformers where text positions are replicated across dimensions
-  if (shape[1] == 1) {
-    // Single batch, simple case
-    for (int64_t dim = 0; dim < 4; ++dim) {
-      for (int64_t i = 0; i < shape[2]; ++i) {
-        position_data[dim * shape[1] * shape[2] + i] = static_cast<T>(i);
+  // For text-only content (no vision), all 3 dimensions have the same position values
+  // This matches the PyTorch get_rope_index behavior where text positions are [0,1,2,...]
+  // replicated across all 3 mrope dimensions
+  
+  // Fill position_ids: shape is [3, batch_size, seq_len]
+  for (int64_t dim = 0; dim < 3; ++dim) {
+    for (int64_t batch = 0; batch < shape[1]; ++batch) {
+      for (int64_t pos = 0; pos < shape[2]; ++pos) {
+        // All 3 dimensions get the same sequential position values for text
+        position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
       }
     }
-    // Initialize next tensor with the last position + 1
-    for (int64_t dim = 0; dim < 4; ++dim) {
-      position_data_next[dim * shape[1] + 0] = static_cast<T>(shape[2]);
+  }
+  
+  // Fill position_ids_next for generation: shape is [3, batch_size, 1]
+  for (int64_t dim = 0; dim < 3; ++dim) {
+    for (int64_t batch = 0; batch < shape[1]; ++batch) {
+      // Next position is seq_len (continuing from last position)
+      position_data_next[dim * shape[1] + batch] = static_cast<T>(shape[2]);
     }
-  } else {
+  }
+  
+  // Old multi-batch code is disabled by the if (false) below (not yet deleted); the loops above already cover every batch size
+  if (false) {
     // Multiple batches - initialize with simple ascending values
     // In practice, vision-specific positions would be computed by the model's get_rope_index logic
-    for (int64_t dim = 0; dim < 4; ++dim) {
+    for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < shape[1]; ++batch) {
         for (int64_t pos = 0; pos < shape[2]; ++pos) {
           position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
@@ -612,10 +622,10 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length
   }
 
   // Update position values for generation phase
-  // During generation, we increment all 4 dimensions uniformly for text generation
+  // During generation, we increment all 3 dimensions uniformly for text generation
   if (type_ == Ort::TypeToTensorType<int32_t>) {
-    auto* data = position_ids_->GetTensorMutableData<int32_t>();
-    for (int64_t dim = 0; dim < 4; ++dim) {
+    auto* data = position_ids_->GetMutableData<int32_t>();
+    for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
         for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
           data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
@@ -624,8 +634,8 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length
       }
     }
   } else {
-    auto* data = position_ids_->GetTensorMutableData<int64_t>();
-    for (int64_t dim = 0; dim < 4; ++dim) {
+    auto* data = position_ids_->GetMutableData<int64_t>();
+    for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
         for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
           data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
@@ -649,10 +659,10 @@ void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length
   if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) {
     // Update attention mask - typically all 1s during generation
     if (type_ == Ort::TypeToTensorType<int32_t>) {
-      auto* mask_data = attention_mask_->GetTensorMutableData<int32_t>();
+      auto* mask_data = attention_mask_->GetMutableData<int32_t>();
       std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int32_t>(1));
     } else {
-      auto* mask_data = attention_mask_->GetTensorMutableData<int64_t>();
+      auto* mask_data = attention_mask_->GetMutableData<int64_t>();
       std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int64_t>(1));
     }
   }
diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
new file mode 100644
index 0000000000..75c46790b1
--- /dev/null
+++ b/src/models/qwen_image_processor.cpp
@@ -0,0 +1,189 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "../generators.h"
+#include "model.h"
+
+#include <regex>
+
+namespace Generators {
+
+namespace {
+
+std::tuple<std::unique_ptr<OrtValue>, std::unique_ptr<OrtValue>>
+ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt,
+                   OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, Ort::Allocator& allocator) {
+  constexpr char vision_start_token[] = "<|vision_start|>";
+  constexpr char vision_end_token[] = "<|vision_end|>";
+  constexpr char image_pad_token[] = "<|image_pad|>";
+
+  int64_t num_images = 0;
+  int64_t total_image_tokens = 0;
+  
+  if (pixel_values && image_grid_thw) {
+    const float* pixel_values_data{};
+    const int64_t* pixel_values_shape{};
+    size_t pixel_values_num_dims;
+    CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
+                                  &pixel_values_shape, &pixel_values_num_dims));
+    
+    const int64_t* image_grid_thw_data{};
+    const int64_t* image_grid_thw_shape{};
+    size_t image_grid_thw_num_dims;
+    CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast<const void**>(&image_grid_thw_data),
+                                  &image_grid_thw_shape, &image_grid_thw_num_dims));
+    
+    num_images = image_grid_thw_shape[0];
+    
+    // Calculate total image tokens based on grid dimensions
+    // For each image: (temporal * height * width) / (merge_size^2)
+    constexpr int64_t merge_size = 2;
+    for (int64_t i = 0; i < num_images; ++i) {
+      int64_t t = image_grid_thw_data[i * 3 + 0];
+      int64_t h = image_grid_thw_data[i * 3 + 1];
+      int64_t w = image_grid_thw_data[i * 3 + 2];
+      total_image_tokens += (t * h * w) / (merge_size * merge_size);
+    }
+  }
+
+  // Generate input_ids with vision tokens
+  std::string text = prompt;
+  
+  // If prompt is empty, add vision markers for each image
+  if (text.empty()) {
+    for (int64_t i = 0; i < num_images; ++i) {
+      text += std::string(vision_start_token) + " " + std::string(vision_end_token);
+      if (i < num_images - 1) {
+        text += " ";
+      }
+    }
+  }
+
+  // Count the number of vision_start tokens and make sure it matches the number of images
+  const std::regex vision_start_regex{std::string(vision_start_token)};
+  const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex);
+  const auto vision_start_end = std::sregex_iterator();
+  const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end);
+  
+  if (num_images != vision_start_tokens) {
+    throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) + 
+                           " vision_start tokens but received " + std::to_string(num_images) + " images.");
+  }
+
+  // For Qwen2-VL, we need to replace vision markers with image_pad tokens
+  // The number of image_pad tokens for each image depends on the image dimensions
+  if (num_images > 0 && image_grid_thw) {
+    const int64_t* image_grid_thw_data{};
+    const int64_t* image_grid_thw_shape{};
+    size_t image_grid_thw_num_dims;
+    CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast<const void**>(&image_grid_thw_data),
+                                  &image_grid_thw_shape, &image_grid_thw_num_dims));
+    
+    constexpr int64_t merge_size = 2;
+    std::string modified_text;
+    size_t last_pos = 0;
+    size_t image_idx = 0;
+    
+    std::smatch match;
+    std::string temp_text = text;
+    while (std::regex_search(temp_text, match, vision_start_regex)) {
+      // Add text before the vision_start token
+      modified_text += text.substr(last_pos, match.position() - (last_pos - (text.size() - temp_text.size())));
+      
+      // Calculate number of image_pad tokens for this image
+      int64_t t = image_grid_thw_data[image_idx * 3 + 0];
+      int64_t h = image_grid_thw_data[image_idx * 3 + 1];
+      int64_t w = image_grid_thw_data[image_idx * 3 + 2];
+      int64_t num_pads = (t * h * w) / (merge_size * merge_size);
+      
+      // Add vision_start, image_pad tokens, and vision_end
+      modified_text += vision_start_token;
+      for (int64_t i = 0; i < num_pads; ++i) {
+        modified_text += image_pad_token;
+      }
+      modified_text += vision_end_token;
+      
+      last_pos = match.position() + match.length() + (text.size() - temp_text.size());
+      
+      // Find and skip vision_end token
+      size_t vision_end_pos = text.find(vision_end_token, last_pos);
+      if (vision_end_pos != std::string::npos) {
+        last_pos = vision_end_pos + strlen(vision_end_token);
+      }
+      
+      temp_text = match.suffix();
+      image_idx++;
+    }
+    modified_text += text.substr(last_pos);
+    text = modified_text;
+  }
+
+  const std::vector<int32_t> input_ids = tokenizer.Encode(text.c_str());
+
+  std::unique_ptr<OrtValue> input_ids_value = OrtValue::CreateTensor<int32_t>(
+      allocator, std::vector<int64_t>{1, static_cast<int64_t>(input_ids.size())});
+  std::copy(input_ids.begin(), input_ids.end(), input_ids_value->GetTensorMutableData<int32_t>());
+
+  std::unique_ptr<OrtValue> num_img_tokens = OrtValue::CreateTensor<int64_t>(
+      allocator, std::vector<int64_t>{1});
+  num_img_tokens->GetTensorMutableData<int64_t>()[0] = total_image_tokens;
+
+  return {std::move(input_ids_value), std::move(num_img_tokens)};
+}
+
+}  // namespace
+
+QwenImageProcessor::QwenImageProcessor(Config& config, const SessionInfo& session_info)
+    : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)} {
+  const auto processor_config = (config.config_path / fs::path(config.model.vision.config_filename)).string();
+  CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str()));
+
+  config.AddMapping(std::string(Config::Defaults::InputIdsName), config.model.embedding.inputs.input_ids);
+  config.AddMapping(std::string(Config::Defaults::PixelValuesName), config.model.vision.inputs.pixel_values);
+}
+
+std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const {
+  std::string prompt = std::string(payload.prompt);
+  const Images* images = payload.images;
+  Ort::Allocator& allocator{Ort::Allocator::GetWithDefaultOptions()};
+  auto named_tensors = std::make_unique<NamedTensors>();
+
+  if (!images) {
+    [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, allocator);
+    named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared<Tensor>(std::move(input_ids)));
+    return named_tensors;
+  }
+
+  ort_extensions::OrtxObjectPtr<OrtxTensorResult> result;
+  CheckResult(OrtxImagePreProcess(processor_.get(), images->images_.get(), result.ToBeAssigned()));
+
+  OrtxTensor* pixel_values = nullptr;
+  CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
+
+  OrtxTensor* image_grid_thw = nullptr;
+  CheckResult(OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw));
+
+  auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, image_grid_thw, allocator);
+  named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared<Tensor>(std::move(input_ids)));
+
+  if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+    named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                           std::make_shared<Tensor>(ProcessTensor<float>(pixel_values, allocator)));
+  } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
+    named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                           std::make_shared<Tensor>(ProcessTensor<Ort::BFloat16_t>(pixel_values, allocator)));
+  } else {
+    named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                           std::make_shared<Tensor>(ProcessTensor<Ort::Float16_t>(pixel_values, allocator)));
+  }
+
+  // Add image_grid_thw tensor
+  named_tensors->emplace("image_grid_thw",
+                         std::make_shared<Tensor>(ProcessTensor<int64_t>(image_grid_thw, allocator)));
+
+  named_tensors->emplace(std::string(Config::Defaults::NumImageTokens), std::make_shared<Tensor>(std::move(num_img_tokens)));
+
+  return named_tensors;
+}
+
+}  // namespace Generators
diff --git a/src/models/qwen_image_processor.h b/src/models/qwen_image_processor.h
new file mode 100644
index 0000000000..a116a2c67c
--- /dev/null
+++ b/src/models/qwen_image_processor.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "processor.h"
+
+namespace Generators {
+
+struct QwenImageProcessor : Processor {
+  QwenImageProcessor(Config& config, const SessionInfo& session_info);
+
+  virtual std::unique_ptr<NamedTensors> Process(const Tokenizer& tokenizer, const Payload& payload) const override;
+
+ private:
+  ort_extensions::OrtxObjectPtr<OrtxProcessor> processor_;
+
+  ONNXTensorElementDataType pixel_values_type_;
+};
+
+}  // namespace Generators

From 8aceabd3cec0172ec1c343c9d8ce46246f17c6af Mon Sep 17 00:00:00 2001
From: apsonawane <asonawane@microsoft.com>
Date: Sat, 15 Nov 2025 01:40:50 +0000
Subject: [PATCH 3/7] Running pipeline

---
 cmake/deps.txt                      |   2 +-
 src/models/position_inputs.cpp      |  43 ++++---
 src/models/qwen_image_processor.cpp | 171 ++++++++++++++++++++++++----
 3 files changed, 168 insertions(+), 48 deletions(-)

diff --git a/cmake/deps.txt b/cmake/deps.txt
index 8e4f97ebb8..fc0530b187 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;97083215f9c84189ad2484d5c933cc06086e9073
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;301b442d8f903daba129e825cd446755b840abb0
 
 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index 336a51647a..0787e34691 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -613,18 +613,13 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t>
 }
 
 void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) {
-  // After first update, we use the cached position_ids_next tensor
-  if (position_ids_next_ && position_ids_shape_[1] > 1 && position_ids_shape_[2] == 1) {
-    position_ids_ = std::move(position_ids_next_);
-    position_ids_next_ = nullptr;
-  } else {
-    position_ids_->CreateTensor(position_ids_shape_, state_.params_->use_graph_capture && position_ids_shape_[2] == 1);
-  }
+  // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs)
+  auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_);
 
   // Update position values for generation phase
   // During generation, we increment all 3 dimensions uniformly for text generation
   if (type_ == Ort::TypeToTensorType<int32_t>) {
-    auto* data = position_ids_->GetMutableData<int32_t>();
+    auto* data = position_ids->GetTensorMutableData<int32_t>();
     for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
         for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
@@ -634,7 +629,7 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length
       }
     }
   } else {
-    auto* data = position_ids_->GetMutableData<int64_t>();
+    auto* data = position_ids->GetTensorMutableData<int64_t>();
     for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
         for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
@@ -645,27 +640,26 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length
     }
   }
 
+  // Move to GPU if needed
+  position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1);
   state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();
 }
 
 void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) {
-  if (attention_mask_next_ && attention_mask_shape_[1] == total_length - 1) {
-    attention_mask_ = std::move(attention_mask_next_);
-    attention_mask_next_ = nullptr;
+  // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs)
+  auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, attention_mask_shape_, type_);
+
+  // Update attention mask - typically all 1s during generation
+  if (type_ == Ort::TypeToTensorType<int32_t>) {
+    auto* mask_data = attention_mask->GetTensorMutableData<int32_t>();
+    std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int32_t>(1));
   } else {
-    attention_mask_->CreateTensor(attention_mask_shape_, state_.params_->use_graph_capture && attention_mask_shape_[1] == 1);
+    auto* mask_data = attention_mask->GetTensorMutableData<int64_t>();
+    std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int64_t>(1));
   }
 
-  if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) {
-    // Update attention mask - typically all 1s during generation
-    if (type_ == Ort::TypeToTensorType<int32_t>) {
-      auto* mask_data = attention_mask_->GetMutableData<int32_t>();
-      std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int32_t>(1));
-    } else {
-      auto* mask_data = attention_mask_->GetMutableData<int64_t>();
-      std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int64_t>(1));
-    }
-  }
+  // Move to GPU if needed
+  attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, 1);
 
   state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();
 }
@@ -679,6 +673,7 @@ void Qwen2VLPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_le
       else
         CreateAndInitialize3DPositionIDs<int64_t>(next_tokens, position_ids_shape_);
     } else {
+      position_ids_shape_[2] = new_length;  // Update shape before Update3DPositionIDs
       Update3DPositionIDs(total_length, new_length);
     }
   }
@@ -691,7 +686,9 @@ void Qwen2VLPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_le
       else
         CreateAndInitializeAttentionMask<int64_t>(next_tokens, attention_mask_shape_);
     } else {
+      // UpdateAttentionMask checks old shape, then we update it
       UpdateAttentionMask(total_length, new_length);
+      attention_mask_shape_[1] = total_length;  // Update to current total length
     }
   }
   
diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
index 75c46790b1..2831f9cbc1 100644
--- a/src/models/qwen_image_processor.cpp
+++ b/src/models/qwen_image_processor.cpp
@@ -12,28 +12,35 @@ namespace {
 
 std::tuple<std::unique_ptr<OrtValue>, std::unique_ptr<OrtValue>>
 ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt,
-                   OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, Ort::Allocator& allocator) {
+                   OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, 
+                   const int64_t* computed_grid_data, int64_t computed_grid_num_images,
+                   Ort::Allocator& allocator) {
   constexpr char vision_start_token[] = "<|vision_start|>";
   constexpr char vision_end_token[] = "<|vision_end|>";
   constexpr char image_pad_token[] = "<|image_pad|>";
 
   int64_t num_images = 0;
   int64_t total_image_tokens = 0;
+  const int64_t* image_grid_thw_data = nullptr;
   
-  if (pixel_values && image_grid_thw) {
+  if (pixel_values) {
     const float* pixel_values_data{};
     const int64_t* pixel_values_shape{};
     size_t pixel_values_num_dims;
     CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
                                   &pixel_values_shape, &pixel_values_num_dims));
     
-    const int64_t* image_grid_thw_data{};
-    const int64_t* image_grid_thw_shape{};
-    size_t image_grid_thw_num_dims;
-    CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast<const void**>(&image_grid_thw_data),
-                                  &image_grid_thw_shape, &image_grid_thw_num_dims));
-    
-    num_images = image_grid_thw_shape[0];
+    // Get image_grid_thw data from either processor output or computed value
+    if (image_grid_thw) {
+      const int64_t* image_grid_thw_shape{};
+      size_t image_grid_thw_num_dims;
+      CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast<const void**>(&image_grid_thw_data),
+                                    &image_grid_thw_shape, &image_grid_thw_num_dims));
+      num_images = image_grid_thw_shape[0];
+    } else if (computed_grid_data) {
+      image_grid_thw_data = computed_grid_data;
+      num_images = computed_grid_num_images;
+    }
     
     // Calculate total image tokens based on grid dimensions
     // For each image: (temporal * height * width) / (merge_size^2)
@@ -60,7 +67,8 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
   }
 
   // Count the number of vision_start tokens and make sure it matches the number of images
-  const std::regex vision_start_regex{std::string(vision_start_token)};
+  // Need to escape special regex characters in the token
+  const std::regex vision_start_regex{R"(<\|vision_start\|>)"};
   const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex);
   const auto vision_start_end = std::sregex_iterator();
   const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end);
@@ -72,13 +80,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
 
   // For Qwen2-VL, we need to replace vision markers with image_pad tokens
   // The number of image_pad tokens for each image depends on the image dimensions
-  if (num_images > 0 && image_grid_thw) {
-    const int64_t* image_grid_thw_data{};
-    const int64_t* image_grid_thw_shape{};
-    size_t image_grid_thw_num_dims;
-    CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast<const void**>(&image_grid_thw_data),
-                                  &image_grid_thw_shape, &image_grid_thw_num_dims));
-    
+  if (num_images > 0 && image_grid_thw_data) {
     constexpr int64_t merge_size = 2;
     std::string modified_text;
     size_t last_pos = 0;
@@ -149,7 +151,7 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   auto named_tensors = std::make_unique<NamedTensors>();
 
   if (!images) {
-    [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, allocator);
+    [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator);
     named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared<Tensor>(std::move(input_ids)));
     return named_tensors;
   }
@@ -161,12 +163,128 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values));
 
   OrtxTensor* image_grid_thw = nullptr;
-  CheckResult(OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw));
+  // Try to get image_grid_thw from processor (second output)
+  auto status = OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw);
+  
+  // Get pixel_values data and shape
+  const float* pixel_values_data{};
+  const int64_t* pixel_values_shape{};
+  size_t pixel_values_num_dims;
+  CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
+                                &pixel_values_shape, &pixel_values_num_dims));
+  
+  std::cerr << "DEBUG: pixel_values_num_dims=" << pixel_values_num_dims << " shape=[";
+  for (size_t i = 0; i < pixel_values_num_dims; ++i) {
+    if (i > 0) std::cerr << ", ";
+    std::cerr << pixel_values_shape[i];
+  }
+  std::cerr << "]" << std::endl;
+  
+  // If processor doesn't provide image_grid_thw or patched pixel_values, compute them
+  std::unique_ptr<OrtValue> computed_image_grid_thw;
+  std::unique_ptr<OrtValue> patched_pixel_values;
+  const int64_t* computed_grid_data = nullptr;
+  int64_t computed_grid_num_images = 0;
+  
+  // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format)
+  if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) {
+    constexpr int64_t patch_size = 14;
+    constexpr int64_t temporal_patch_size = 2;
+    
+    int64_t height = pixel_values_shape[1];      // HWC: [batch, height, width, channels]
+    int64_t width = pixel_values_shape[2];
+    int64_t channels = pixel_values_shape[3];
+    
+    int64_t height_patches = height / patch_size;
+    int64_t width_patches = width / patch_size;
+    int64_t total_patches = height_patches * width_patches;
+    int64_t patch_dim = channels * temporal_patch_size * patch_size * patch_size;  // 3*2*14*14 = 1176
+    
+    // Create patched pixel_values: [total_patches, patch_dim]
+    patched_pixel_values = OrtValue::CreateTensor<float>(
+        allocator, std::vector<int64_t>{total_patches, patch_dim});
+    auto* patched_data = patched_pixel_values->GetTensorMutableData<float>();
+    
+    // Extract patches from single image in HWC format
+    // Each spatial patch is replicated temporal_patch_size times
+    int64_t patch_idx = 0;
+    for (int64_t ph = 0; ph < height_patches; ++ph) {
+      for (int64_t pw = 0; pw < width_patches; ++pw) {
+        int64_t h_start = ph * patch_size;
+        int64_t w_start = pw * patch_size;
+        
+        int64_t write_idx = patch_idx * patch_dim;
+        
+        // Repeat the same spatial patch temporal_patch_size times
+        // Output: [temporal, channels, patch_h, patch_w]
+        for (int64_t t = 0; t < temporal_patch_size; ++t) {
+          for (int64_t c = 0; c < channels; ++c) {
+            for (int64_t h = 0; h < patch_size; ++h) {
+              for (int64_t w = 0; w < patch_size; ++w) {
+                // HWC format: pixel_values[height][width][channels]
+                int64_t src_idx = (h_start + h) * width * channels + (w_start + w) * channels + c;
+                patched_data[write_idx++] = pixel_values_data[src_idx];
+              }
+            }
+          }
+        }
+        patch_idx++;
+      }
+    }
+    
+    // Create image_grid_thw: [1, 3] for single image
+    if (status != kOrtxOK || !image_grid_thw) {
+      computed_image_grid_thw = OrtValue::CreateTensor<int64_t>(
+          allocator, std::vector<int64_t>{1, 3});
+      auto* grid_data = computed_image_grid_thw->GetTensorMutableData<int64_t>();
+      
+      // For a single image: T=1 (one frame), H=height_patches, W=width_patches
+      // The temporal_patch_size is embedded in the patch dimension (1176 = 3*2*14*14)
+      grid_data[0] = 1;  // Single temporal frame for images
+      grid_data[1] = height_patches;
+      grid_data[2] = width_patches;
+      
+      computed_grid_data = grid_data;
+      computed_grid_num_images = 1;
+    }
+  }
 
-  auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, image_grid_thw, allocator);
+  auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, 
+                                                          image_grid_thw, computed_grid_data, computed_grid_num_images, allocator);
   named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared<Tensor>(std::move(input_ids)));
 
-  if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+  // Use patched pixel_values if we computed it, otherwise use processor output
+  if (patched_pixel_values) {
+    // Convert to the correct type if needed
+    if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
+      // Convert float to bfloat16
+      auto shape_vec = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetShape();
+      auto bf16_tensor = OrtValue::CreateTensor<Ort::BFloat16_t>(allocator, shape_vec);
+      const float* src = patched_pixel_values->GetTensorData<float>();
+      auto* dst = static_cast<uint16_t*>(bf16_tensor->GetTensorMutableData<void>());
+      size_t count = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetElementCount();
+      for (size_t i = 0; i < count; ++i) {
+        dst[i] = Float32ToBFloat16(src[i]);
+      }
+      named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                             std::make_shared<Tensor>(std::move(bf16_tensor)));
+    } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
+      // Convert float to float16
+      auto shape_vec = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetShape();
+      auto fp16_tensor = OrtValue::CreateTensor<Ort::Float16_t>(allocator, shape_vec);
+      const float* src = patched_pixel_values->GetTensorData<float>();
+      auto* dst = static_cast<uint16_t*>(fp16_tensor->GetTensorMutableData<void>());
+      size_t count = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetElementCount();
+      for (size_t i = 0; i < count; ++i) {
+        dst[i] = FastFloat32ToFloat16(src[i]);
+      }
+      named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                             std::make_shared<Tensor>(std::move(fp16_tensor)));
+    } else {
+      named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
+                             std::make_shared<Tensor>(std::move(patched_pixel_values)));
+    }
+  } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
     named_tensors->emplace(std::string(Config::Defaults::PixelValuesName),
                            std::make_shared<Tensor>(ProcessTensor<float>(pixel_values, allocator)));
   } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) {
@@ -177,9 +295,14 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
                            std::make_shared<Tensor>(ProcessTensor<Ort::Float16_t>(pixel_values, allocator)));
   }
 
-  // Add image_grid_thw tensor
-  named_tensors->emplace("image_grid_thw",
-                         std::make_shared<Tensor>(ProcessTensor<int64_t>(image_grid_thw, allocator)));
+  // Add image_grid_thw tensor (either from processor or computed)
+  if (image_grid_thw) {
+    named_tensors->emplace("image_grid_thw",
+                           std::make_shared<Tensor>(ProcessTensor<int64_t>(image_grid_thw, allocator)));
+  } else if (computed_image_grid_thw) {
+    named_tensors->emplace("image_grid_thw",
+                           std::make_shared<Tensor>(std::move(computed_image_grid_thw)));
+  }
 
   named_tensors->emplace(std::string(Config::Defaults::NumImageTokens), std::make_shared<Tensor>(std::move(num_img_tokens)));
 

From ecc42044317f9473295d45ec1b1006dbbcdee40a Mon Sep 17 00:00:00 2001
From: apsonawane <asonawane@microsoft.com>
Date: Sat, 15 Nov 2025 01:48:55 +0000
Subject: [PATCH 4/7] Remove debug prints

---
 src/models/qwen_image_processor.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
index 2831f9cbc1..3df26425c8 100644
--- a/src/models/qwen_image_processor.cpp
+++ b/src/models/qwen_image_processor.cpp
@@ -173,13 +173,6 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
                                 &pixel_values_shape, &pixel_values_num_dims));
   
-  std::cerr << "DEBUG: pixel_values_num_dims=" << pixel_values_num_dims << " shape=[";
-  for (size_t i = 0; i < pixel_values_num_dims; ++i) {
-    if (i > 0) std::cerr << ", ";
-    std::cerr << pixel_values_shape[i];
-  }
-  std::cerr << "]" << std::endl;
-  
   // If processor doesn't provide image_grid_thw or patched pixel_values, compute them
   std::unique_ptr<OrtValue> computed_image_grid_thw;
   std::unique_ptr<OrtValue> patched_pixel_values;

From df1d9630e2f77813b10e1e97f726a6a385f2a84b Mon Sep 17 00:00:00 2001
From: apsonawane <asonawane@microsoft.com>
Date: Sat, 15 Nov 2025 02:24:28 +0000
Subject: [PATCH 5/7] Cleanup

---
 src/models/position_inputs.cpp      | 128 ++++++++--------------------
 src/models/position_inputs.h        |   9 +-
 src/models/qwen_image_processor.cpp |  35 ++++----
 3 files changed, 55 insertions(+), 117 deletions(-)

diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index 0787e34691..585ff1cc7f 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -5,6 +5,15 @@
 
 namespace Generators {
 
+// Helper to dispatch type-specific tensor operations
+template<typename Func>
+void DispatchOnType(ONNXTensorElementDataType type, Func&& func) {
+  if (type == Ort::TypeToTensorType<int32_t>)
+    func.template operator()<int32_t>();
+  else
+    func.template operator()<int64_t>();
+}
+
 DefaultPositionInputs::DefaultPositionInputs(const Model& model, State& state, DeviceSpan<int32_t> sequence_lengths_unk, const std::string& attention_mask_name)
     : model_{model},
       state_{state},
@@ -501,13 +510,11 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D
     position_ids_shape_[2] = 0;  // Will be set during first update
 
     position_ids_ = std::make_unique<Tensor>(model_.p_device_inputs_, posid_type);
-    position_ids_next_ = std::make_unique<Tensor>(model_.p_device_inputs_, posid_type);
   }
   if (has_mask_input_) {
     attention_mask_shape_[0] = state_.params_->search.batch_size;
     attention_mask_shape_[1] = 0;  // Will be set during first update
     attention_mask_ = std::make_unique<Tensor>(model_.p_device_inputs_, type_);
-    attention_mask_next_ = std::make_unique<Tensor>(model_.p_device_inputs_, type_);
   }
 }
 
@@ -534,161 +541,94 @@ void Qwen2VLPositionInputs::AddAttentionMask() {
 
 template <typename T>
 void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape) {
-  // For Qwen2-VL, in the prefill stage, position_ids are [3, batch_size, seq_len]
-  // During generation, they remain [3, batch_size, 1]
+  // For Qwen2-VL, position_ids are [3, batch_size, seq_len]
   // The 3 dimensions represent: [temporal, height, width] for mrope
+  // For text-only content, all 3 dimensions have the same position values [0,1,2,...]
   
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
   auto* position_data = position_ids->GetTensorMutableData<T>();
-  
-  auto position_ids_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array<int64_t, 3>{shape[0], shape[1], 1}, type_);
-  auto* position_data_next = position_ids_next->GetTensorMutableData<T>();
 
-  // Initialize position IDs
-  // For text-only content (no vision), all 3 dimensions have the same position values
-  // This matches the PyTorch get_rope_index behavior where text positions are [0,1,2,...]
-  // replicated across all 3 mrope dimensions
-  
   // Fill position_ids: shape is [3, batch_size, seq_len]
   for (int64_t dim = 0; dim < 3; ++dim) {
     for (int64_t batch = 0; batch < shape[1]; ++batch) {
       for (int64_t pos = 0; pos < shape[2]; ++pos) {
-        // All 3 dimensions get the same sequential position values for text
         position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
       }
     }
   }
-  
-  // Fill position_ids_next for generation: shape is [3, batch_size, 1]
-  for (int64_t dim = 0; dim < 3; ++dim) {
-    for (int64_t batch = 0; batch < shape[1]; ++batch) {
-      // Next position is seq_len (continuing from last position)
-      position_data_next[dim * shape[1] + batch] = static_cast<T>(shape[2]);
-    }
-  }
-  
-  // Old multi-batch code removed since we simplified to match PyTorch logic
-  if (false) {
-    // Multiple batches - initialize with simple ascending values
-    // In practice, vision-specific positions would be computed by the model's get_rope_index logic
-    for (int64_t dim = 0; dim < 3; ++dim) {
-      for (int64_t batch = 0; batch < shape[1]; ++batch) {
-        for (int64_t pos = 0; pos < shape[2]; ++pos) {
-          position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
-        }
-        position_data_next[dim * shape[1] + batch] = static_cast<T>(shape[2]);
-      }
-    }
-  }
 
-  // Move tensors to appropriate device and expand by num_beams
+  // Move tensor to GPU and expand by num_beams
   position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams);
-  position_ids_next_->ort_tensor_ = model_.ExpandInputs(position_ids_next, state_.params_->search.num_beams);
-  if (state_.params_->use_graph_capture)
-    position_ids_next_->MakeStatic();
   position_ids_shape_[1] *= state_.params_->search.num_beams;
   state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();
 }
 
 template <typename T>
 void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape) {
-  // Standard 2D attention mask initialization
   auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
   auto* mask_data = attention_mask->GetTensorMutableData<T>();
-  
-  auto attention_mask_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array<int64_t, 2>{shape[0], shape[1] + 1}, type_);
-  auto* mask_data_next = attention_mask_next->GetTensorMutableData<T>();
 
-  // Set mask to 1 for all positions (assuming no padding in first iteration)
+  // Set mask to 1 for all positions (no padding)
   std::fill_n(mask_data, shape[0] * shape[1], static_cast<T>(1));
-  std::fill_n(mask_data_next, shape[0] * (shape[1] + 1), static_cast<T>(1));
 
-  // Move tensors to device and expand by num_beams
+  // Move tensor to GPU and expand by num_beams
   attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams);
-  attention_mask_next_->ort_tensor_ = model_.ExpandInputs(attention_mask_next, state_.params_->search.num_beams);
-  if (state_.params_->use_graph_capture)
-    attention_mask_next_->MakeStatic();
   attention_mask_shape_[0] *= state_.params_->search.num_beams;
   state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();
 }
 
-void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) {
-  // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs)
+void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) {
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_);
 
-  // Update position values for generation phase
-  // During generation, we increment all 3 dimensions uniformly for text generation
-  if (type_ == Ort::TypeToTensorType<int32_t>) {
-    auto* data = position_ids->GetTensorMutableData<int32_t>();
+  DispatchOnType(type_, [&]<typename T>() {
+    auto* data = position_ids->GetTensorMutableData<T>();
     for (int64_t dim = 0; dim < 3; ++dim) {
       for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
         for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
           data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
-            static_cast<int32_t>(total_length - new_length + pos);
+            static_cast<T>(base_pos + pos);
         }
       }
     }
-  } else {
-    auto* data = position_ids->GetTensorMutableData<int64_t>();
-    for (int64_t dim = 0; dim < 3; ++dim) {
-      for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
-        for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
-          data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
-            static_cast<int64_t>(total_length - new_length + pos);
-        }
-      }
-    }
-  }
+  });
 
-  // Move to GPU if needed
   position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1);
   state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();
 }
 
-void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) {
-  // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs)
+void Qwen2VLPositionInputs::UpdateAttentionMask() {
   auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, attention_mask_shape_, type_);
 
-  // Update attention mask - typically all 1s during generation
-  if (type_ == Ort::TypeToTensorType<int32_t>) {
-    auto* mask_data = attention_mask->GetTensorMutableData<int32_t>();
-    std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int32_t>(1));
-  } else {
-    auto* mask_data = attention_mask->GetTensorMutableData<int64_t>();
-    std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<int64_t>(1));
-  }
+  DispatchOnType(type_, [&]<typename T>() {
+    auto* mask_data = attention_mask->GetTensorMutableData<T>();
+    std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast<T>(1));
+  });
 
-  // Move to GPU if needed
   attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, 1);
-
   state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();
 }
 
 void Qwen2VLPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_length, int new_length) {
   if (has_posid_input_) {
+    position_ids_shape_[2] = new_length;
     if (is_first_update_) {
-      position_ids_shape_[2] = new_length;
-      if (type_ == Ort::TypeToTensorType<int32_t>)
-        CreateAndInitialize3DPositionIDs<int32_t>(next_tokens, position_ids_shape_);
-      else
-        CreateAndInitialize3DPositionIDs<int64_t>(next_tokens, position_ids_shape_);
+      DispatchOnType(type_, [&]<typename T>() {
+        CreateAndInitialize3DPositionIDs<T>(next_tokens, position_ids_shape_);
+      });
     } else {
-      position_ids_shape_[2] = new_length;  // Update shape before Update3DPositionIDs
-      Update3DPositionIDs(total_length, new_length);
+      Update3DPositionIDs(total_length - new_length);
     }
   }
 
   if (has_mask_input_) {
     if (is_first_update_) {
       attention_mask_shape_[1] = new_length;
-      if (type_ == Ort::TypeToTensorType<int32_t>)
-        CreateAndInitializeAttentionMask<int32_t>(next_tokens, attention_mask_shape_);
-      else
-        CreateAndInitializeAttentionMask<int64_t>(next_tokens, attention_mask_shape_);
+      DispatchOnType(type_, [&]<typename T>() {
+        CreateAndInitializeAttentionMask<T>(next_tokens, attention_mask_shape_);
+      });
     } else {
-      // UpdateAttentionMask checks old shape, then we update it
-      UpdateAttentionMask(total_length, new_length);
-      attention_mask_shape_[1] = total_length;  // Update to current total length
+      attention_mask_shape_[1] = total_length;
+      UpdateAttentionMask();
     }
   }
   
diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h
index 644f0f3a63..867196e008 100644
--- a/src/models/position_inputs.h
+++ b/src/models/position_inputs.h
@@ -129,11 +129,11 @@ struct Qwen2VLPositionInputs : PositionInputs {
   
   template <typename T>
   void CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape);
-  void Update3DPositionIDs(int total_length, int new_length);
+  void Update3DPositionIDs(int base_pos);
   
   template <typename T>
   void CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape);
-  void UpdateAttentionMask(int total_length, int new_length);
+  void UpdateAttentionMask();
 
   const Model& model_;
   State& state_;
@@ -146,15 +146,12 @@ struct Qwen2VLPositionInputs : PositionInputs {
   bool has_mask_input_{false};
   bool has_posid_input_{false};
 
-  std::array<int64_t, 3> position_ids_shape_{};  // {4, batch_size, sequence_length} for 3D positions
+  std::array<int64_t, 3> position_ids_shape_{};  // {3, batch_size, sequence_length} for 3D positions
   std::unique_ptr<Tensor> position_ids_;
-  std::unique_ptr<Tensor> position_ids_next_;  // Replaces position_ids_ after the first Run() call
   
   std::array<int64_t, 2> attention_mask_shape_{};  // {batch_size, sequence_length}
   std::unique_ptr<Tensor> attention_mask_;
-  std::unique_ptr<Tensor> attention_mask_next_;  // Replaces attention_mask_ after each run
   
-  std::unique_ptr<Tensor> rope_deltas_;  // Cached rope deltas for position calculation
   bool is_first_update_{true};
 };
 
diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
index 3df26425c8..198d087488 100644
--- a/src/models/qwen_image_processor.cpp
+++ b/src/models/qwen_image_processor.cpp
@@ -10,6 +10,8 @@ namespace Generators {
 
 namespace {
 
+constexpr int64_t kMergeSize = 2;  // Qwen2-VL merge size for vision tokens
+
 std::tuple<std::unique_ptr<OrtValue>, std::unique_ptr<OrtValue>>
 ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt,
                    OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, 
@@ -44,12 +46,11 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
     
     // Calculate total image tokens based on grid dimensions
     // For each image: (temporal * height * width) / (merge_size^2)
-    constexpr int64_t merge_size = 2;
     for (int64_t i = 0; i < num_images; ++i) {
       int64_t t = image_grid_thw_data[i * 3 + 0];
       int64_t h = image_grid_thw_data[i * 3 + 1];
       int64_t w = image_grid_thw_data[i * 3 + 2];
-      total_image_tokens += (t * h * w) / (merge_size * merge_size);
+      total_image_tokens += (t * h * w) / (kMergeSize * kMergeSize);
     }
   }
 
@@ -81,7 +82,6 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
   // For Qwen2-VL, we need to replace vision markers with image_pad tokens
   // The number of image_pad tokens for each image depends on the image dimensions
   if (num_images > 0 && image_grid_thw_data) {
-    constexpr int64_t merge_size = 2;
     std::string modified_text;
     size_t last_pos = 0;
     size_t image_idx = 0;
@@ -96,7 +96,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
       int64_t t = image_grid_thw_data[image_idx * 3 + 0];
       int64_t h = image_grid_thw_data[image_idx * 3 + 1];
       int64_t w = image_grid_thw_data[image_idx * 3 + 2];
-      int64_t num_pads = (t * h * w) / (merge_size * merge_size);
+      int64_t num_pads = (t * h * w) / (kMergeSize * kMergeSize);
       
       // Add vision_start, image_pad tokens, and vision_end
       modified_text += vision_start_token;
@@ -181,17 +181,18 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   
   // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format)
   if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) {
-    constexpr int64_t patch_size = 14;
-    constexpr int64_t temporal_patch_size = 2;
+    constexpr int64_t kPatchSize = 14;
+    constexpr int64_t kTemporalPatchSize = 2;
+    constexpr int64_t kChannels = 3;
     
     int64_t height = pixel_values_shape[1];      // HWC: [batch, height, width, channels]
     int64_t width = pixel_values_shape[2];
     int64_t channels = pixel_values_shape[3];
     
-    int64_t height_patches = height / patch_size;
-    int64_t width_patches = width / patch_size;
+    int64_t height_patches = height / kPatchSize;
+    int64_t width_patches = width / kPatchSize;
     int64_t total_patches = height_patches * width_patches;
-    int64_t patch_dim = channels * temporal_patch_size * patch_size * patch_size;  // 3*2*14*14 = 1176
+    int64_t patch_dim = channels * kTemporalPatchSize * kPatchSize * kPatchSize;
     
     // Create patched pixel_values: [total_patches, patch_dim]
     patched_pixel_values = OrtValue::CreateTensor<float>(
@@ -199,21 +200,21 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
     auto* patched_data = patched_pixel_values->GetTensorMutableData<float>();
     
     // Extract patches from single image in HWC format
-    // Each spatial patch is replicated temporal_patch_size times
+    // Each spatial patch is replicated kTemporalPatchSize times
     int64_t patch_idx = 0;
     for (int64_t ph = 0; ph < height_patches; ++ph) {
       for (int64_t pw = 0; pw < width_patches; ++pw) {
-        int64_t h_start = ph * patch_size;
-        int64_t w_start = pw * patch_size;
+        int64_t h_start = ph * kPatchSize;
+        int64_t w_start = pw * kPatchSize;
         
         int64_t write_idx = patch_idx * patch_dim;
         
-        // Repeat the same spatial patch temporal_patch_size times
+        // Repeat the same spatial patch kTemporalPatchSize times
         // Output: [temporal, channels, patch_h, patch_w]
-        for (int64_t t = 0; t < temporal_patch_size; ++t) {
+        for (int64_t t = 0; t < kTemporalPatchSize; ++t) {
           for (int64_t c = 0; c < channels; ++c) {
-            for (int64_t h = 0; h < patch_size; ++h) {
-              for (int64_t w = 0; w < patch_size; ++w) {
+            for (int64_t h = 0; h < kPatchSize; ++h) {
+              for (int64_t w = 0; w < kPatchSize; ++w) {
                 // HWC format: pixel_values[height][width][channels]
                 int64_t src_idx = (h_start + h) * width * channels + (w_start + w) * channels + c;
                 patched_data[write_idx++] = pixel_values_data[src_idx];
@@ -232,7 +233,7 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
       auto* grid_data = computed_image_grid_thw->GetTensorMutableData<int64_t>();
       
       // For a single image: T=1 (one frame), H=height_patches, W=width_patches
-      // The temporal_patch_size is embedded in the patch dimension (1176 = 3*2*14*14)
+      // The kTemporalPatchSize is embedded in the patch dimension
       grid_data[0] = 1;  // Single temporal frame for images
       grid_data[1] = height_patches;
       grid_data[2] = width_patches;

From fa991b59a32808c8dba68b9109568b2ea1758311 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 25 Nov 2025 01:16:41 +0000
Subject: [PATCH 6/7] update position ids for mrope

---
 src/config.cpp                      |  10 ++
 src/config.h                        |  10 ++
 src/models/model.h                  |   2 +-
 src/models/multi_modal.cpp          |  35 +++-
 src/models/multi_modal.h            |   7 +-
 src/models/position_inputs.cpp      | 269 +++++++++++++++++++++++++---
 src/models/position_inputs.h        |  41 +++--
 src/models/qwen_image_processor.cpp |  16 +-
 src/models/qwen_image_processor.h   |   3 +-
 9 files changed, 340 insertions(+), 53 deletions(-)

diff --git a/src/config.cpp b/src/config.cpp
index 9a9d48ab9b..3f46b990f8 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -653,6 +653,10 @@ struct Vision_Element : JSON::Element {
       v_.config_filename = JSON::Get<std::string_view>(value);
     } else if (name == "adapter_filename") {
       v_.adapter_filename = JSON::Get<std::string_view>(value);
+    } else if (name == "spatial_merge_size") {
+      v_.spatial_merge_size = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "tokens_per_second") {
+      v_.tokens_per_second = static_cast<float>(JSON::Get<double>(value));
     } else {
       throw JSON::unknown_value_error{};
     }
@@ -858,6 +862,12 @@ struct Model_Element : JSON::Element {
       v_.decoder_start_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "sep_token_id") {
       v_.sep_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "image_token_id") {
+      v_.image_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "video_token_id") {
+      v_.video_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "vision_start_token_id") {
+      v_.vision_start_token_id = static_cast<int>(JSON::Get<double>(value));
     } else {
       throw JSON::unknown_value_error{};
     }
diff --git a/src/config.h b/src/config.h
index 76b3d4c241..c03a1d1860 100644
--- a/src/config.h
+++ b/src/config.h
@@ -107,6 +107,12 @@ struct Config {
     int bos_token_id{};             // The id of the beginning-of-stream token.
     int sep_token_id{};             // The id of the separation token.
     int decoder_start_token_id{};   // If an encoder-decoder model starts decoding with a different token than bos, the id of that token.
+
+    // Qwen2-VL specific token IDs
+    int image_token_id{};
+    int video_token_id{};
+    int vision_start_token_id{};
+
     int vocab_size{};
     int context_length{};
 
@@ -160,6 +166,10 @@ struct Config {
       std::string config_filename{"processor_config.json"};
       std::optional<std::string> adapter_filename{};
 
+      // Qwen2-VL specific vision config values
+      int spatial_merge_size{2};
+      float tokens_per_second{2.0f};
+
       struct Inputs {
         std::string pixel_values{Defaults::PixelValuesName};
         std::string image_sizes{Defaults::ImageSizesName};
diff --git a/src/models/model.h b/src/models/model.h
index 0e50059702..62034bb1a3 100644
--- a/src/models/model.h
+++ b/src/models/model.h
@@ -177,4 +177,4 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model>, External
   std::map<std::string, std::unique_ptr<OrtSessionOptions>> pipeline_session_options_;
 };
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file
diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp
index 48ad8d5fcf..113b70b1e4 100644
--- a/src/models/multi_modal.cpp
+++ b/src/models/multi_modal.cpp
@@ -3,6 +3,7 @@
 
 #include "../generators.h"
 #include "multi_modal.h"
+#include <numeric>
 
 namespace Generators {
 
@@ -178,10 +179,12 @@ DeviceSpan<float> EmbeddingState::Run(int current_length, DeviceSpan<int32_t>& n
   return {};
 }
 
-DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params)
+DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths,
+                           const GeneratorParams& params)
     : State{params, model},
       model_{model},
-      position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {
+      position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)}
+{
   inputs_embeds_.Add();
   position_inputs_->Add();
   logits_.Add();
@@ -207,6 +210,13 @@ void DecoderState::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int tot
   inputs_embeds_.UpdateSequenceLength(new_length);
 }
 
+// Overload for pipeline to call
+void DecoderState::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int total_length, DeviceSpan<int32_t> beam_indices, size_t new_length) {
+  kv_cache_.Update(beam_indices, total_length);
+  logits_.Update(next_tokens, new_length);
+  inputs_embeds_.UpdateSequenceLength(new_length);
+}
+
 MultiModalPipelineState::MultiModalPipelineState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params)
     : State{params, model},
       model_{model},
@@ -243,6 +253,25 @@ void MultiModalPipelineState::SetExtraInputs(const std::vector<ExtraInput>& extr
     speech_state_->SetExtraInputs(extra_inputs, num_audio_tokens_);
   }
   embedding_state_->SetExtraInputs(num_images_, num_image_tokens_, num_audio_tokens_);
+
+  // Set the grid tensors for Qwen2-VL if present
+  if (auto* qwen_pos_inputs = dynamic_cast<Qwen2VLPositionInputs*>(decoder_state_->position_inputs_.get())) {
+    std::shared_ptr<Tensor> img_grid, vid_grid, sec_grid;
+
+    for (const auto& input : extra_inputs) {
+      if (input.name == Config::Defaults::ImageGridThwName) {
+        img_grid = input.tensor;
+      } else if (input.name == "video_grid_thw") {
+        vid_grid = input.tensor;
+      } else if (input.name == "second_per_grid_ts") {
+        sec_grid = input.tensor;
+      }
+    }
+
+    if (img_grid || vid_grid) {
+      qwen_pos_inputs->SetGridTensors(img_grid, vid_grid, sec_grid);
+    }
+  }
 }
 
 DeviceSpan<float> MultiModalPipelineState::Run(int current_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
@@ -357,4 +386,4 @@ OrtValue* MultiModalPipelineState::GetOutput(const char* name) {
   return State::GetOutput(name);
 };
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file
diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h
index 0cbe4e527b..8f17b004b6 100644
--- a/src/models/multi_modal.h
+++ b/src/models/multi_modal.h
@@ -18,7 +18,7 @@ struct MultiModalLanguageModel : Model {
   MultiModalLanguageModel(const MultiModalLanguageModel&) = delete;
   MultiModalLanguageModel& operator=(const MultiModalLanguageModel&) = delete;
 
-  std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params) const;
+  std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params) const override;
 
   std::unique_ptr<OrtSession> vision_session_;     // pixel_values, [image_attention_mask], image_sizes -> image_features
   std::unique_ptr<OrtSession> speech_session_;     // audio_embeds, audio_sizes, audio_projection_mode -> audio_features
@@ -96,11 +96,12 @@ struct DecoderState : State {
   DecoderState& operator=(const DecoderState&) = delete;
 
   DeviceSpan<float> Run(int current_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) override;
+  void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices);
 
  private:
   friend struct MultiModalPipelineState;
 
-  void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices);
+  void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices, size_t new_length);
 
   const MultiModalLanguageModel& model_;
   Embeddings inputs_embeds_{*this, Embeddings::Mode::Input,  // Model input
@@ -144,4 +145,4 @@ struct MultiModalPipelineState : State {
   const std::string speech_adapter_name_{"speech"};
 };
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index 585ff1cc7f..5ed02b751e 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -2,11 +2,14 @@
 #include "model.h"
 #include "position_inputs.h"
 #include "model_type.h"
+#include <vector>
+#include <numeric>
+#include <cmath> // For std::round
 
 namespace Generators {
 
 // Helper to dispatch type-specific tensor operations
-template<typename Func>
+template <typename Func>
 void DispatchOnType(ONNXTensorElementDataType type, Func&& func) {
   if (type == Ort::TypeToTensorType<int32_t>)
     func.template operator()<int32_t>();
@@ -490,7 +493,12 @@ void WindowedPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_l
 // Qwen2VLPositionInputs implementation
 Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan<int32_t> sequence_lengths_unk)
     : model_{model},
-      state_{state} {
+      state_{state},
+      image_token_id_{model.config_->model.image_token_id},
+      video_token_id_{model.config_->model.video_token_id},
+      vision_start_token_id_{model.config_->model.vision_start_token_id},
+      tokens_per_second_{model.config_->model.vision.tokens_per_second},
+      spatial_merge_size_{model.config_->model.vision.spatial_merge_size} {
   has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask);
   has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids);
 
@@ -498,11 +506,10 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D
   if (has_mask_input_) {
     type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask);
   }
-  
-  ONNXTensorElementDataType posid_type = type_;
+
   if (has_posid_input_) {
-    posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids);
-    
+    ONNXTensorElementDataType posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids);
+
     // Set up 3D position IDs shape: [3, batch_size, sequence_length]
     // The 3 dimensions represent temporal, height, and width for mrope
     position_ids_shape_[0] = 3;
@@ -518,6 +525,14 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D
   }
 }
 
+void Qwen2VLPositionInputs::SetGridTensors(const std::shared_ptr<Tensor>& image_grid_thw,
+                                           const std::shared_ptr<Tensor>& video_grid_thw,
+                                           const std::shared_ptr<Tensor>& second_per_grid_ts) {
+  image_grid_thw_ = image_grid_thw;
+  video_grid_thw_ = video_grid_thw;
+  second_per_grid_ts_ = second_per_grid_ts;
+}
+
 void Qwen2VLPositionInputs::Add() {
   if (has_posid_input_) {
     AddPositionIDs();
@@ -541,35 +556,221 @@ void Qwen2VLPositionInputs::AddAttentionMask() {
 
 template <typename T>
 void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape) {
-  // For Qwen2-VL, position_ids are [3, batch_size, seq_len]
-  // The 3 dimensions represent: [temporal, height, width] for mrope
-  // For text-only content, all 3 dimensions have the same position values [0,1,2,...]
-  
+  // Builds the prefill mrope position_ids and per-row rope_deltas_, replicating HuggingFace's `get_rope_index`.
+  // `shape` is [3, batch_size, seq_len] (before beam expansion); `next_tokens` is [batch_size, seq_len].
+  // Throws std::runtime_error when the grid_thw data does not match the vision tokens found in the prompt.
+  int64_t batch_size = shape[1];
+  int64_t seq_len = shape[2];
+  const auto pad_token_id = model_.config_->model.pad_token_id;  // Hoisted: compared against every text token
+
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
   auto* position_data = position_ids->GetTensorMutableData<T>();
 
-  // Fill position_ids: shape is [3, batch_size, seq_len]
-  for (int64_t dim = 0; dim < 3; ++dim) {
-    for (int64_t batch = 0; batch < shape[1]; ++batch) {
-      for (int64_t pos = 0; pos < shape[2]; ++pos) {
-        position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast<T>(pos);
+  // Get spans for grid_thw tensors (on CPU)
+  std::span<const int64_t> image_grid_thw_span;
+  if (image_grid_thw_) {
+    image_grid_thw_span = std::span(image_grid_thw_->GetData<int64_t>(), image_grid_thw_->GetElementCount());
+  }
+
+  std::span<const int64_t> video_grid_thw_span;
+  if (video_grid_thw_) {
+    video_grid_thw_span = std::span(video_grid_thw_->GetData<int64_t>(), video_grid_thw_->GetElementCount())  ;
+  }
+
+  std::span<const float> second_per_grid_ts_span;
+  if (second_per_grid_ts_) {
+    // Qwen 2.5 processor outputs float32 for this
+    if (second_per_grid_ts_->GetType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)
+      throw std::runtime_error("second_per_grid_ts must be float32.");
+    second_per_grid_ts_span = std::span(second_per_grid_ts_->GetData<float>(), second_per_grid_ts_->GetElementCount());
+  }
+  
+  auto input_ids_span = next_tokens.CpuSpan();
+  int image_index = 0;
+  int video_index = 0;
+  rope_deltas_.clear();
+
+  for (int64_t b = 0; b < batch_size; ++b) {
+    auto input_ids = input_ids_span.subspan(b * seq_len, seq_len);
+
+    int64_t image_nums = 0;
+    int64_t video_nums = 0;
+
+    // Count images/videos for this batch item by checking the token *after* vision_start_token_id
+    for (int64_t s = 0; s + 1 < seq_len; ++s) {  // Signed bound: seq_len == 0 must not wrap around
+      if (input_ids[s] == vision_start_token_id_) {
+        if (input_ids[s + 1] == image_token_id_) {
+          image_nums++;
+        } else if (input_ids[s + 1] == video_token_id_) {
+          video_nums++;
+        }
+      }
+    }
+
+    int64_t st = 0;
+    int64_t remain_images = image_nums;
+    int64_t remain_videos = video_nums;
+    T st_idx = 0;
+    T max_pos_for_batch = 0;
+
+    for (int64_t k = 0; k < image_nums + video_nums; ++k) {
+      int64_t ed_image = seq_len + 1;
+      int64_t ed_video = seq_len + 1;
+
+      // Find next image_token_id (after a vision_start_token_id)
+      if (remain_images > 0) {
+        for (int64_t s = st; s < seq_len - 1; ++s) {
+          if (input_ids[s] == vision_start_token_id_ && input_ids[s + 1] == image_token_id_) {
+            ed_image = s + 1;  // Point to the image_token_id
+            break;
+          }
+        }
+      }
+      // Find next video_token_id (after a vision_start_token_id)
+      if (remain_videos > 0) {
+        for (int64_t s = st; s < seq_len - 1; ++s) {
+          if (input_ids[s] == vision_start_token_id_ && input_ids[s + 1] == video_token_id_) {
+            ed_video = s + 1;  // Point to the video_token_id
+            break;
+          }
+        }
+      }
+
+      int64_t ed;
+      int64_t t, h, w;
+      float second_per_grid_t = 0.0f;
+
+      if (ed_image < ed_video) {
+        // Process image
+        if (image_index * 3 + 2 >= image_grid_thw_span.size())
+          throw std::runtime_error("Not enough image_grid_thw data for image tokens.");
+        t = image_grid_thw_span[image_index * 3 + 0];
+        h = image_grid_thw_span[image_index * 3 + 1];
+        w = image_grid_thw_span[image_index * 3 + 2];
+        second_per_grid_t = 0.0f;  // Images have 0 time delta
+        image_index++;
+        remain_images--;
+        ed = ed_image;
+      } else {
+        // Process video
+        if (video_index * 3 + 2 >= video_grid_thw_span.size())
+          throw std::runtime_error("Not enough video_grid_thw data for video tokens.");
+        t = video_grid_thw_span[video_index * 3 + 0];
+        h = video_grid_thw_span[video_index * 3 + 1];
+        w = video_grid_thw_span[video_index * 3 + 2];
+        if (second_per_grid_ts_span.empty() || video_index >= second_per_grid_ts_span.size()) {
+          second_per_grid_t = 1.0f;  // Default from Python
+        } else {
+          second_per_grid_t = second_per_grid_ts_span[video_index];
+        }
+        video_index++;
+        remain_videos--;
+        ed = ed_video;
+      }
+
+      int64_t llm_grid_t = t;
+      int64_t llm_grid_h = h / spatial_merge_size_;
+      int64_t llm_grid_w = w / spatial_merge_size_;
+      int64_t vision_len = llm_grid_t * llm_grid_h * llm_grid_w;
+      if (vision_len < 0 || ed + vision_len > seq_len)  // Also catches a marker that was not found (ed == seq_len + 1)
+        throw std::runtime_error("Vision tokens in the prompt do not match the provided grid_thw data.");
+
+      // 1. Fill Text Part
+      // Text runs from `st` up to `ed-1` (which is the <|vision_start|> token)
+      int64_t text_len = ed - st;
+      st_idx = (k > 0) ? max_pos_for_batch + 1 : 0;  // Positions restart at 0 for every batch row
+      T current_pos = st_idx;
+
+      for (int64_t s = 0; s < text_len; ++s) {
+        int64_t current_token_idx = st + s;
+        if (input_ids[current_token_idx] == pad_token_id) {
+          position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+          position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+          position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+        } else {
+          position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          max_pos_for_batch = current_pos;
+          current_pos++;  // Only increment position for non-pad tokens
+        }
+      }
+
+      // 2. Fill Vision Part
+      st_idx = max_pos_for_batch + 1;
+      for (int64_t s = 0; s < vision_len; ++s) {
+        int64_t gt = s / (llm_grid_h * llm_grid_w);
+        int64_t gh = (s / llm_grid_w) % llm_grid_h;
+        int64_t gw = s % llm_grid_w;
+
+        // Temporal position is rounded to nearest; HuggingFace truncates (time_tensor.long()), so parity comparisons may deviate slightly.
+        T t_pos = static_cast<T>(std::round(gt * second_per_grid_t * tokens_per_second_)) + st_idx;
+        T h_pos = static_cast<T>(gh) + st_idx;
+        T w_pos = static_cast<T>(gw) + st_idx;
+
+        // Vision tokens are guaranteed not to be padding
+        position_data[0 * batch_size * seq_len + b * seq_len + (ed + s)] = t_pos;
+        position_data[1 * batch_size * seq_len + b * seq_len + (ed + s)] = h_pos;
+        position_data[2 * batch_size * seq_len + b * seq_len + (ed + s)] = w_pos;
+        max_pos_for_batch = std::max({max_pos_for_batch, t_pos, h_pos, w_pos});
+      }
+      st = ed + vision_len;  // New start is after the vision tokens
+    }
+
+    // 3. Fill Remaining Text Part
+    if (st < seq_len) {
+      st_idx = (max_pos_for_batch == 0 && st == 0) ? 0 : max_pos_for_batch + 1;
+      int64_t text_len = seq_len - st;
+      T current_pos = st_idx;
+      for (int64_t s = 0; s < text_len; ++s) {
+        int64_t current_token_idx = st + s;
+        if (input_ids[current_token_idx] == pad_token_id) {
+          position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+          position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+          position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = 0;
+        } else {
+          position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos;
+          max_pos_for_batch = current_pos;
+          current_pos++;  // Only increment position for non-pad tokens
+        }
       }
     }
+    rope_deltas_.push_back(max_pos_for_batch + 1 - seq_len);
   }
 
   // Move tensor to GPU and expand by num_beams
   position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams);
   position_ids_shape_[1] *= state_.params_->search.num_beams;
   state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();
+
+  // Expand rope_deltas_
+  std::vector<int64_t> expanded_deltas;
+  for (int64_t delta : rope_deltas_) {
+    for (int b = 0; b < state_.params_->search.num_beams; ++b) {
+      expanded_deltas.push_back(delta);
+    }
+  }
+  rope_deltas_ = std::move(expanded_deltas);
 }
 
 template <typename T>
 void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape) {
   auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_);
   auto* mask_data = attention_mask->GetTensorMutableData<T>();
-
-  // Set mask to 1 for all positions (no padding)
-  std::fill_n(mask_data, shape[0] * shape[1], static_cast<T>(1));
+  auto input_ids_span = next_tokens.CpuSpan();
+  int64_t batch_size = shape[0];
+  int64_t seq_len = shape[1];
+
+  for (int64_t b = 0; b < batch_size; ++b) {
+    for (int64_t s = 0; s < seq_len; ++s) {
+      int64_t current_token_idx = b * seq_len + s;
+      mask_data[current_token_idx] = (input_ids_span[current_token_idx] == model_.config_->model.pad_token_id)
+                                          ? static_cast<T>(0)
+                                          : static_cast<T>(1);
+    }
+  }
 
   // Move tensor to GPU and expand by num_beams
   attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams);
@@ -578,21 +779,34 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t>
 }
 
 void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) {
+  // Generation (decode) step: every mrope dimension of row b gets base_pos + rope_deltas_[b] + offset.
+  // base_pos is cache_position[0]
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_);
+  int64_t batch_size = position_ids_shape_[1];  // This is already expanded (batch*beams)
+  int64_t seq_len = position_ids_shape_[2];    // This will be 1 for generation
+
+  if (rope_deltas_.size() != batch_size) {
+    throw std::runtime_error("rope_deltas size mismatch with batch_size * num_beams.");
+  }
 
   DispatchOnType(type_, [&]<typename T>() {
     auto* data = position_ids->GetTensorMutableData<T>();
     for (int64_t dim = 0; dim < 3; ++dim) {
-      for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) {
-        for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) {
-          data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = 
-            static_cast<T>(base_pos + pos);
+      for (int64_t b = 0; b < batch_size; ++b) {
+        // From Python: delta = (cache_position[0] + self.rope_deltas)
+        // cache_position[0] is `base_pos`; the delta depends only on the row, so compute it once per row.
+        const T delta = static_cast<T>(base_pos + rope_deltas_[b]);
+        // Python: position_ids = position_ids + delta
+        // `position_ids` for new token is just [0, 1, ...]
+        // The same value is written for all 3 dimensions because generated tokens are text.
+        for (int64_t s = 0; s < seq_len; ++s) {
+          data[dim * batch_size * seq_len + b * seq_len + s] = delta + static_cast<T>(s);
         }
       }
     }
   });
 
-  position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1);
+  position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1);  // No beam expansion needed, already expanded
   state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor();
 }
 
@@ -631,12 +845,15 @@ void Qwen2VLPositionInputs::Update(DeviceSpan<int32_t> next_tokens, int total_le
       UpdateAttentionMask();
     }
   }
-  
+
   is_first_update_ = false;
 }
 
 void Qwen2VLPositionInputs::RewindTo(size_t index) {
   // For Qwen2-VL, we need to handle rewinding for beam search
+  // This is a simplified rewind, just updating the shape.
+  // A full rewind would require re-calculating rope_deltas if we rewound into the prompt.
+  // For now, we assume rewind only happens during generation.
   if (has_posid_input_) {
     position_ids_shape_[2] = static_cast<int64_t>(index);
   }
@@ -650,7 +867,7 @@ std::unique_ptr<PositionInputs> CreatePositionInputs(State& state, DeviceSpan<in
   if (ModelType::IsQwen2VL(state.model_.config_->model.type)) {
     return std::make_unique<Qwen2VLPositionInputs>(state.model_, state, sequence_lengths);
   }
-  
+
   if (state.model_.config_->model.decoder.sliding_window.has_value() && state.model_.config_->model.decoder.sliding_window->slide_inputs) {
     return std::make_unique<WindowedPositionInputs>(state);
   } else {
@@ -658,4 +875,4 @@ std::unique_ptr<PositionInputs> CreatePositionInputs(State& state, DeviceSpan<in
   }
 }
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file
diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h
index 867196e008..6bb4d9110d 100644
--- a/src/models/position_inputs.h
+++ b/src/models/position_inputs.h
@@ -109,11 +109,13 @@ struct WindowedPositionInputs : PositionInputs {
   size_t window_index_{};
 };
 
-// Qwen2-VL uses 3D rotary position embeddings for multimodal (vision + text) content.
-// Position IDs have shape [4, batch_size, seq_len] where:
-//   - Dimension 0: Text-only positions
-//   - Dimensions 1-3: Vision positions (temporal, height, width)
-// This class manages rope_deltas caching to maintain correct positional encoding across generation steps.
+// Qwen2-VL uses 3D rotary position embeddings (mrope) for multimodal (vision + text) content.
+// Position IDs have shape [3, batch_size, seq_len] where the 3 dimensions represent:
+//   - Dimension 0: Temporal position
+//   - Dimension 1: Height position
+//   - Dimension 2: Width position
+// For text, all 3 dimensions are identical. For vision, they are distinct.
+// This class implements the logic from `get_rope_index` to build these 3D IDs.
 struct Qwen2VLPositionInputs : PositionInputs {
   Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan<int32_t> sequence_lengths_unk);
   Qwen2VLPositionInputs(const Qwen2VLPositionInputs&) = delete;
@@ -123,14 +125,18 @@ struct Qwen2VLPositionInputs : PositionInputs {
   void Update(DeviceSpan<int32_t> next_tokens, int total_length, int new_length) override;
   void RewindTo(size_t index) override;
 
+  void SetGridTensors(const std::shared_ptr<Tensor>& image_grid_thw,
+                      const std::shared_ptr<Tensor>& video_grid_thw,
+                      const std::shared_ptr<Tensor>& second_per_grid_ts);
+
  private:
   void AddPositionIDs();
   void AddAttentionMask();
-  
+
   template <typename T>
   void CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 3> shape);
   void Update3DPositionIDs(int base_pos);
-  
+
   template <typename T>
   void CreateAndInitializeAttentionMask(DeviceSpan<int32_t> next_tokens, std::array<int64_t, 2> shape);
   void UpdateAttentionMask();
@@ -141,20 +147,33 @@ struct Qwen2VLPositionInputs : PositionInputs {
   size_t mask_input_index_{~0U};
   size_t posid_input_index_{~0U};
 
-  ONNXTensorElementDataType type_;  // Common type for position_ids and attention_mask
+  ONNXTensorElementDataType type_;
 
   bool has_mask_input_{false};
   bool has_posid_input_{false};
 
   std::array<int64_t, 3> position_ids_shape_{};  // {3, batch_size, sequence_length} for 3D positions
   std::unique_ptr<Tensor> position_ids_;
-  
+
   std::array<int64_t, 2> attention_mask_shape_{};  // {batch_size, sequence_length}
   std::unique_ptr<Tensor> attention_mask_;
-  
+
   bool is_first_update_{true};
+
+  // Cached data from processor
+  std::shared_ptr<Tensor> image_grid_thw_;
+  std::shared_ptr<Tensor> video_grid_thw_;
+  std::shared_ptr<Tensor> second_per_grid_ts_;
+  std::vector<int64_t> rope_deltas_;
+
+  // Config values initialized from model.config_ in constructor
+  const int32_t image_token_id_;
+  const int32_t video_token_id_;
+  const int32_t vision_start_token_id_;
+  const float tokens_per_second_;
+  const int32_t spatial_merge_size_;
 };
 
 std::unique_ptr<PositionInputs> CreatePositionInputs(State& state, DeviceSpan<int32_t> sequence_lengths, const std::string& attention_mask_name);
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file
diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
index 198d087488..3cd0190aff 100644
--- a/src/models/qwen_image_processor.cpp
+++ b/src/models/qwen_image_processor.cpp
@@ -10,13 +10,12 @@ namespace Generators {
 
 namespace {
 
-constexpr int64_t kMergeSize = 2;  // Qwen2-VL merge size for vision tokens
-
+// constexpr int64_t kMergeSize = 2;  // Qwen2-VL merge size for vision tokens
 std::tuple<std::unique_ptr<OrtValue>, std::unique_ptr<OrtValue>>
 ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt,
                    OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, 
                    const int64_t* computed_grid_data, int64_t computed_grid_num_images,
-                   Ort::Allocator& allocator) {
+                   Ort::Allocator& allocator, int64_t spatial_merge_size) {
   constexpr char vision_start_token[] = "<|vision_start|>";
   constexpr char vision_end_token[] = "<|vision_end|>";
   constexpr char image_pad_token[] = "<|image_pad|>";
@@ -50,7 +49,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
       int64_t t = image_grid_thw_data[i * 3 + 0];
       int64_t h = image_grid_thw_data[i * 3 + 1];
       int64_t w = image_grid_thw_data[i * 3 + 2];
-      total_image_tokens += (t * h * w) / (kMergeSize * kMergeSize);
+      total_image_tokens += (t * h * w) / (spatial_merge_size * spatial_merge_size);
     }
   }
 
@@ -96,7 +95,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
       int64_t t = image_grid_thw_data[image_idx * 3 + 0];
       int64_t h = image_grid_thw_data[image_idx * 3 + 1];
       int64_t w = image_grid_thw_data[image_idx * 3 + 2];
-      int64_t num_pads = (t * h * w) / (kMergeSize * kMergeSize);
+      int64_t num_pads = (t * h * w) / (spatial_merge_size * spatial_merge_size);
       
       // Add vision_start, image_pad tokens, and vision_end
       modified_text += vision_start_token;
@@ -136,7 +135,8 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
 }  // namespace
 
 QwenImageProcessor::QwenImageProcessor(Config& config, const SessionInfo& session_info)
-    : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)} {
+    : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)},
+      spatial_merge_size_{config.model.vision.spatial_merge_size} {
   const auto processor_config = (config.config_path / fs::path(config.model.vision.config_filename)).string();
   CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str()));
 
@@ -151,7 +151,7 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   auto named_tensors = std::make_unique<NamedTensors>();
 
   if (!images) {
-    [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator);
+    [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator, spatial_merge_size_);
     named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared<Tensor>(std::move(input_ids)));
     return named_tensors;
   }
@@ -244,7 +244,7 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   }
 
   auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, 
-                                                          image_grid_thw, computed_grid_data, computed_grid_num_images, allocator);
+                                                          image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_);
   named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared<Tensor>(std::move(input_ids)));
 
   // Use patched pixel_values if we computed it, otherwise use processor output
diff --git a/src/models/qwen_image_processor.h b/src/models/qwen_image_processor.h
index a116a2c67c..ce1ba26f0b 100644
--- a/src/models/qwen_image_processor.h
+++ b/src/models/qwen_image_processor.h
@@ -15,6 +15,7 @@ struct QwenImageProcessor : Processor {
   ort_extensions::OrtxObjectPtr<OrtxProcessor> processor_;
 
   ONNXTensorElementDataType pixel_values_type_;
+  int64_t spatial_merge_size_;
 };
 
-}  // namespace Generators
+}  // namespace Generators
\ No newline at end of file

From a75bb8b16801aa0ff83c4e5df6a1ace2753d3b11 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 25 Nov 2025 19:44:37 +0000
Subject: [PATCH 7/7] format

---
 src/models/multi_modal.cpp          |  3 +-
 src/models/multi_modal.h            |  4 +-
 src/models/position_inputs.cpp      | 12 +++---
 src/models/qwen_image_processor.cpp | 58 ++++++++++++++---------------
 4 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp
index 113b70b1e4..5abb9f4c2a 100644
--- a/src/models/multi_modal.cpp
+++ b/src/models/multi_modal.cpp
@@ -183,8 +183,7 @@ DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int3
                            const GeneratorParams& params)
     : State{params, model},
       model_{model},
-      position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)}
-{
+      position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {
   inputs_embeds_.Add();
   position_inputs_->Add();
   logits_.Add();
diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h
index 8f17b004b6..771f5be36d 100644
--- a/src/models/multi_modal.h
+++ b/src/models/multi_modal.h
@@ -107,8 +107,8 @@ struct DecoderState : State {
   Embeddings inputs_embeds_{*this, Embeddings::Mode::Input,  // Model input
                             model_.config_->model.decoder.inputs.embeddings};
   std::unique_ptr<PositionInputs> position_inputs_;  // Model input
-  DefaultKeyValueCache kv_cache_{*this};   // Model input
-  Logits logits_{*this};                   // Model output
+  DefaultKeyValueCache kv_cache_{*this};             // Model input
+  Logits logits_{*this};                             // Model output
 };
 
 struct MultiModalPipelineState : State {
diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp
index 5ed02b751e..4e53b27a08 100644
--- a/src/models/position_inputs.cpp
+++ b/src/models/position_inputs.cpp
@@ -4,7 +4,7 @@
 #include "model_type.h"
 #include <vector>
 #include <numeric>
-#include <cmath> // For std::round
+#include <cmath>  // For std::round
 
 namespace Generators {
 
@@ -574,7 +574,7 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t>
 
   std::span<const int64_t> video_grid_thw_span;
   if (video_grid_thw_) {
-    video_grid_thw_span = std::span(video_grid_thw_->GetData<int64_t>(), video_grid_thw_->GetElementCount())  ;
+    video_grid_thw_span = std::span(video_grid_thw_->GetData<int64_t>(), video_grid_thw_->GetElementCount());
   }
 
   std::span<const float> second_per_grid_ts_span;
@@ -584,7 +584,7 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan<int32_t>
       throw std::runtime_error("second_per_grid_ts must be float32.");
     second_per_grid_ts_span = std::span(second_per_grid_ts_->GetData<float>(), second_per_grid_ts_->GetElementCount());
   }
-  
+
   auto input_ids_span = next_tokens.CpuSpan();
   int image_index = 0;
   int video_index = 0;
@@ -767,8 +767,8 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t>
     for (int64_t s = 0; s < seq_len; ++s) {
       int64_t current_token_idx = b * seq_len + s;
       mask_data[current_token_idx] = (input_ids_span[current_token_idx] == model_.config_->model.pad_token_id)
-                                          ? static_cast<T>(0)
-                                          : static_cast<T>(1);
+                                         ? static_cast<T>(0)
+                                         : static_cast<T>(1);
     }
   }
 
@@ -783,7 +783,7 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) {
   // base_pos is cache_position[0]
   auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_);
   int64_t batch_size = position_ids_shape_[1];  // This is already expanded (batch*beams)
-  int64_t seq_len = position_ids_shape_[2];    // This will be 1 for generation
+  int64_t seq_len = position_ids_shape_[2];     // This will be 1 for generation
 
   if (rope_deltas_.size() != batch_size) {
     throw std::runtime_error("rope_deltas size mismatch with batch_size * num_beams.");
diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp
index 3cd0190aff..32a4ca56e1 100644
--- a/src/models/qwen_image_processor.cpp
+++ b/src/models/qwen_image_processor.cpp
@@ -13,7 +13,7 @@ namespace {
 // constexpr int64_t kMergeSize = 2;  // Qwen2-VL merge size for vision tokens
 std::tuple<std::unique_ptr<OrtValue>, std::unique_ptr<OrtValue>>
 ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt,
-                   OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, 
+                   OrtxTensor* pixel_values, OrtxTensor* image_grid_thw,
                    const int64_t* computed_grid_data, int64_t computed_grid_num_images,
                    Ort::Allocator& allocator, int64_t spatial_merge_size) {
   constexpr char vision_start_token[] = "<|vision_start|>";
@@ -23,14 +23,14 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
   int64_t num_images = 0;
   int64_t total_image_tokens = 0;
   const int64_t* image_grid_thw_data = nullptr;
-  
+
   if (pixel_values) {
     const float* pixel_values_data{};
     const int64_t* pixel_values_shape{};
     size_t pixel_values_num_dims;
     CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
                                   &pixel_values_shape, &pixel_values_num_dims));
-    
+
     // Get image_grid_thw data from either processor output or computed value
     if (image_grid_thw) {
       const int64_t* image_grid_thw_shape{};
@@ -42,7 +42,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
       image_grid_thw_data = computed_grid_data;
       num_images = computed_grid_num_images;
     }
-    
+
     // Calculate total image tokens based on grid dimensions
     // For each image: (temporal * height * width) / (merge_size^2)
     for (int64_t i = 0; i < num_images; ++i) {
@@ -55,7 +55,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
 
   // Generate input_ids with vision tokens
   std::string text = prompt;
-  
+
   // If prompt is empty, add vision markers for each image
   if (text.empty()) {
     for (int64_t i = 0; i < num_images; ++i) {
@@ -72,10 +72,10 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
   const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex);
   const auto vision_start_end = std::sregex_iterator();
   const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end);
-  
+
   if (num_images != vision_start_tokens) {
-    throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) + 
-                           " vision_start tokens but received " + std::to_string(num_images) + " images.");
+    throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) +
+                             " vision_start tokens but received " + std::to_string(num_images) + " images.");
   }
 
   // For Qwen2-VL, we need to replace vision markers with image_pad tokens
@@ -84,34 +84,34 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
     std::string modified_text;
     size_t last_pos = 0;
     size_t image_idx = 0;
-    
+
     std::smatch match;
     std::string temp_text = text;
     while (std::regex_search(temp_text, match, vision_start_regex)) {
       // Add text before the vision_start token
       modified_text += text.substr(last_pos, match.position() - (last_pos - (text.size() - temp_text.size())));
-      
+
       // Calculate number of image_pad tokens for this image
       int64_t t = image_grid_thw_data[image_idx * 3 + 0];
       int64_t h = image_grid_thw_data[image_idx * 3 + 1];
       int64_t w = image_grid_thw_data[image_idx * 3 + 2];
       int64_t num_pads = (t * h * w) / (spatial_merge_size * spatial_merge_size);
-      
+
       // Add vision_start, image_pad tokens, and vision_end
       modified_text += vision_start_token;
       for (int64_t i = 0; i < num_pads; ++i) {
         modified_text += image_pad_token;
       }
       modified_text += vision_end_token;
-      
+
       last_pos = match.position() + match.length() + (text.size() - temp_text.size());
-      
+
       // Find and skip vision_end token
       size_t vision_end_pos = text.find(vision_end_token, last_pos);
       if (vision_end_pos != std::string::npos) {
         last_pos = vision_end_pos + strlen(vision_end_token);
       }
-      
+
       temp_text = match.suffix();
       image_idx++;
     }
@@ -165,40 +165,40 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
   OrtxTensor* image_grid_thw = nullptr;
   // Try to get image_grid_thw from processor (second output)
   auto status = OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw);
-  
+
   // Get pixel_values data and shape
   const float* pixel_values_data{};
   const int64_t* pixel_values_shape{};
   size_t pixel_values_num_dims;
   CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast<const void**>(&pixel_values_data),
                                 &pixel_values_shape, &pixel_values_num_dims));
-  
+
   // If processor doesn't provide image_grid_thw or patched pixel_values, compute them
   std::unique_ptr<OrtValue> computed_image_grid_thw;
   std::unique_ptr<OrtValue> patched_pixel_values;
   const int64_t* computed_grid_data = nullptr;
   int64_t computed_grid_num_images = 0;
-  
+
   // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format)
   if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) {
     constexpr int64_t kPatchSize = 14;
     constexpr int64_t kTemporalPatchSize = 2;
     constexpr int64_t kChannels = 3;
-    
-    int64_t height = pixel_values_shape[1];      // HWC: [batch, height, width, channels]
+
+    int64_t height = pixel_values_shape[1];  // HWC: [batch, height, width, channels]
     int64_t width = pixel_values_shape[2];
     int64_t channels = pixel_values_shape[3];
-    
+
     int64_t height_patches = height / kPatchSize;
     int64_t width_patches = width / kPatchSize;
     int64_t total_patches = height_patches * width_patches;
     int64_t patch_dim = channels * kTemporalPatchSize * kPatchSize * kPatchSize;
-    
+
     // Create patched pixel_values: [total_patches, patch_dim]
     patched_pixel_values = OrtValue::CreateTensor<float>(
         allocator, std::vector<int64_t>{total_patches, patch_dim});
     auto* patched_data = patched_pixel_values->GetTensorMutableData<float>();
-    
+
     // Extract patches from single image in HWC format
     // Each spatial patch is replicated kTemporalPatchSize times
     int64_t patch_idx = 0;
@@ -206,9 +206,9 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
       for (int64_t pw = 0; pw < width_patches; ++pw) {
         int64_t h_start = ph * kPatchSize;
         int64_t w_start = pw * kPatchSize;
-        
+
         int64_t write_idx = patch_idx * patch_dim;
-        
+
         // Repeat the same spatial patch kTemporalPatchSize times
         // Output: [temporal, channels, patch_h, patch_w]
         for (int64_t t = 0; t < kTemporalPatchSize; ++t) {
@@ -225,26 +225,26 @@ std::unique_ptr<NamedTensors> QwenImageProcessor::Process(const Tokenizer& token
         patch_idx++;
       }
     }
-    
+
     // Create image_grid_thw: [1, 3] for single image
     if (status != kOrtxOK || !image_grid_thw) {
       computed_image_grid_thw = OrtValue::CreateTensor<int64_t>(
           allocator, std::vector<int64_t>{1, 3});
       auto* grid_data = computed_image_grid_thw->GetTensorMutableData<int64_t>();
-      
+
       // For a single image: T=1 (one frame), H=height_patches, W=width_patches
       // The kTemporalPatchSize is embedded in the patch dimension
       grid_data[0] = 1;  // Single temporal frame for images
       grid_data[1] = height_patches;
       grid_data[2] = width_patches;
-      
+
       computed_grid_data = grid_data;
       computed_grid_num_images = 1;
     }
   }
 
-  auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, 
-                                                          image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_);
+  auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values,
+                                                        image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_);
   named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared<Tensor>(std::move(input_ids)));
 
   // Use patched pixel_values if we computed it, otherwise use processor output