From 2dbe74192790e8f5f30ca7fcc4c6182504f3c786 Mon Sep 17 00:00:00 2001 From: apsonawane Date: Thu, 13 Nov 2025 02:25:43 +0000 Subject: [PATCH 1/7] get_rope_index changes for qwen model --- src/models/model_type.h | 7 +- src/models/multi_modal.cpp | 6 +- src/models/multi_modal.h | 2 +- src/models/position_inputs.cpp | 226 +++++++++++++++++++++++++++++++++ src/models/position_inputs.h | 49 +++++++ 5 files changed, 285 insertions(+), 5 deletions(-) diff --git a/src/models/model_type.h b/src/models/model_type.h index c7c4d2f691..77d8cba665 100644 --- a/src/models/model_type.h +++ b/src/models/model_type.h @@ -18,10 +18,15 @@ struct ModelType { inline static bool IsVLM(const std::string& model_type) { // Vision-language model (VLM) - static constexpr std::array VLM = {"gemma3", "phi3v"}; + static constexpr std::array VLM = {"gemma3", "phi3v", "qwen2vl"}; return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end(); } + inline static bool IsQwen2VL(const std::string& model_type) { + // Qwen2-VL specific check for 3D position IDs + return model_type == "qwen2vl"; + } + inline static bool IsALM(const std::string& model_type) { // Audio-language model (ALM) static constexpr std::array ALM = {"whisper"}; diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp index 56e55b1552..48ad8d5fcf 100644 --- a/src/models/multi_modal.cpp +++ b/src/models/multi_modal.cpp @@ -181,9 +181,9 @@ DeviceSpan EmbeddingState::Run(int current_length, DeviceSpan& n DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan sequence_lengths, const GeneratorParams& params) : State{params, model}, model_{model}, - position_inputs_{model, *this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask} { + position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} { inputs_embeds_.Add(); - position_inputs_.Add(); + position_inputs_->Add(); logits_.Add(); kv_cache_.Add(); } @@ -201,7 +201,7 @@ DeviceSpan DecoderState::Run(int current_length, DeviceSpan& nex void DecoderState::UpdateInputsOutputs(DeviceSpan& next_tokens, int total_length, DeviceSpan beam_indices) { int batch_size = static_cast(inputs_embeds_.GetShape()[0]); size_t new_length = next_tokens.size() / batch_size; - position_inputs_.Update(next_tokens, total_length, static_cast(new_length)); + position_inputs_->Update(next_tokens, total_length, static_cast(new_length)); kv_cache_.Update(beam_indices, total_length); logits_.Update(next_tokens, new_length); inputs_embeds_.UpdateSequenceLength(new_length); diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h index 206fc3850b..0cbe4e527b 100644 --- a/src/models/multi_modal.h +++ b/src/models/multi_modal.h @@ -105,7 +105,7 @@ struct DecoderState : State { const MultiModalLanguageModel& model_; Embeddings inputs_embeds_{*this, Embeddings::Mode::Input, // Model input model_.config_->model.decoder.inputs.embeddings}; - DefaultPositionInputs position_inputs_; // Model input + std::unique_ptr position_inputs_; // Model input DefaultKeyValueCache kv_cache_{*this}; // Model input Logits logits_{*this}; // Model output }; diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index d87a8c6b64..daccf9932d 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -1,6 +1,7 @@ #include "../generators.h" #include "model.h" #include "position_inputs.h" +#include "model_type.h" namespace Generators { @@ -477,7 +478,232 @@ void WindowedPositionInputs::Update(DeviceSpan next_tokens, int total_l window_index_++; } +// Qwen2VLPositionInputs implementation +Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan sequence_lengths_unk) + : model_{model}, + state_{state} { + has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask); + has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids); + + type_ = Ort::TypeToTensorType; + if (has_mask_input_) { + type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask); + } + if (has_posid_input_) { + if (has_mask_input_) { + if (model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids) != type_) { + throw std::runtime_error("position_ids & attention_mask must have the same data type"); + } + } + // Set up 3D position IDs shape: [4, batch_size, sequence_length] + position_ids_shape_[0] = 4; // 4 dimensions: text + 3D vision (temporal, height, width) + position_ids_shape_[1] = state_.params_->search.batch_size; + position_ids_shape_[2] = 0; // Will be set during first update + + position_ids_ = std::make_unique(); + position_ids_next_ = std::make_unique(); + } + if (has_mask_input_) { + attention_mask_shape_[0] = state_.params_->search.batch_size; + attention_mask_shape_[1] = 0; // Will be set during first update + attention_mask_ = std::make_unique(); + attention_mask_next_ = std::make_unique(); + } +} + +void Qwen2VLPositionInputs::Add() { + if (has_posid_input_) { + AddPositionIDs(); + } + if (has_mask_input_) { + AddAttentionMask(); + } +} + +void Qwen2VLPositionInputs::AddPositionIDs() { + posid_input_index_ = state_.inputs_.size(); + state_.inputs_.push_back(position_ids_->GetOrtTensor()); + state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); +} + +void Qwen2VLPositionInputs::AddAttentionMask() { + mask_input_index_ = state_.inputs_.size(); + state_.inputs_.push_back(attention_mask_->GetOrtTensor()); + state_.input_names_.push_back(model_.config_->model.decoder.inputs.attention_mask.c_str()); +} + +template +void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape) { + // For Qwen2-VL, in the prefill stage, position_ids are [4, batch_size, seq_len] + // During generation, they remain [4, batch_size, 1] + // The 4 dimensions are: [text_positions, temporal_positions, height_positions, width_positions] + + auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); + auto* position_data = position_ids->GetTensorMutableData(); + + auto position_ids_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array{shape[0], shape[1], 1}, type_); + auto* position_data_next = position_ids_next->GetTensorMutableData(); + + // Initialize position IDs + // For text-only content (no vision), all 4 dimensions have the same position values + // This matches the behavior in transformers where text positions are replicated across dimensions + if (shape[1] == 1) { + // Single batch, simple case + for (int64_t dim = 0; dim < 4; ++dim) { + for (int64_t i = 0; i < shape[2]; ++i) { + position_data[dim * shape[1] * shape[2] + i] = static_cast(i); + } + } + // Initialize next tensor with the last position + 1 + for (int64_t dim = 0; dim < 4; ++dim) { + position_data_next[dim * shape[1] + 0] = static_cast(shape[2]); + } + } else { + // Multiple batches - initialize with simple ascending values + // In practice, vision-specific positions would be computed by the model's get_rope_index logic + for (int64_t dim = 0; dim < 4; ++dim) { + for (int64_t batch = 0; batch < shape[1]; ++batch) { + for (int64_t pos = 0; pos < shape[2]; ++pos) { + position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); + } + position_data_next[dim * shape[1] + batch] = static_cast(shape[2]); + } + } + } + + // Move tensors to appropriate device and expand by num_beams + position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams); + position_ids_next_->ort_tensor_ = model_.ExpandInputs(position_ids_next, state_.params_->search.num_beams); + if (state_.params_->use_graph_capture) + position_ids_next_->MakeStatic(); + position_ids_shape_[1] *= state_.params_->search.num_beams; + state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); +} + +template +void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape) { + // Standard 2D attention mask initialization + auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); + auto* mask_data = attention_mask->GetTensorMutableData(); + + auto attention_mask_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array{shape[0], shape[1] + 1}, type_); + auto* mask_data_next = attention_mask_next->GetTensorMutableData(); + + // Set mask to 1 for all positions (assuming no padding in first iteration) + std::fill_n(mask_data, shape[0] * shape[1], static_cast(1)); + std::fill_n(mask_data_next, shape[0] * (shape[1] + 1), static_cast(1)); + + // Move tensors to device and expand by num_beams + attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams); + attention_mask_next_->ort_tensor_ = model_.ExpandInputs(attention_mask_next, state_.params_->search.num_beams); + if (state_.params_->use_graph_capture) + attention_mask_next_->MakeStatic(); + attention_mask_shape_[0] *= state_.params_->search.num_beams; + state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor(); +} + +void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) { + // After first update, we use the cached position_ids_next tensor + if (position_ids_next_ && position_ids_shape_[1] > 1 && position_ids_shape_[2] == 1) { + position_ids_ = std::move(position_ids_next_); + position_ids_next_ = nullptr; + } else { + position_ids_->CreateTensor(position_ids_shape_, state_.params_->use_graph_capture && position_ids_shape_[2] == 1); + } + + // Update position values for generation phase + // During generation, we increment all 4 dimensions uniformly for text generation + if (type_ == Ort::TypeToTensorType) { + auto* data = position_ids_->GetTensorMutableData(); + for (int64_t dim = 0; dim < 4; ++dim) { + for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { + for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { + data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = + static_cast(total_length - new_length + pos); + } + } + } + } else { + auto* data = position_ids_->GetTensorMutableData(); + for (int64_t dim = 0; dim < 4; ++dim) { + for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { + for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { + data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = + static_cast(total_length - new_length + pos); + } + } + } + } + + state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); +} + +void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) { + if (attention_mask_next_ && attention_mask_shape_[1] == total_length - 1) { + attention_mask_ = std::move(attention_mask_next_); + attention_mask_next_ = nullptr; + } else { + attention_mask_->CreateTensor(attention_mask_shape_, state_.params_->use_graph_capture && attention_mask_shape_[1] == 1); + } + + if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) { + // Update attention mask - typically all 1s during generation + if (type_ == Ort::TypeToTensorType) { + auto* mask_data = attention_mask_->GetTensorMutableData(); + std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); + } else { + auto* mask_data = attention_mask_->GetTensorMutableData(); + std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); + } + } + + state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor(); +} + +void Qwen2VLPositionInputs::Update(DeviceSpan next_tokens, int total_length, int new_length) { + if (has_posid_input_) { + if (is_first_update_) { + position_ids_shape_[2] = new_length; + if (type_ == Ort::TypeToTensorType) + CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); + else + CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); + } else { + Update3DPositionIDs(total_length, new_length); + } + } + + if (has_mask_input_) { + if (is_first_update_) { + attention_mask_shape_[1] = new_length; + if (type_ == Ort::TypeToTensorType) + CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); + else + CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); + } else { + UpdateAttentionMask(total_length, new_length); + } + } + + is_first_update_ = false; +} + +void Qwen2VLPositionInputs::RewindTo(size_t index) { + // For Qwen2-VL, we need to handle rewinding for beam search + if (has_posid_input_) { + position_ids_shape_[2] = static_cast(index); + } + if (has_mask_input_) { + attention_mask_shape_[1] = static_cast(index); + } +} + std::unique_ptr CreatePositionInputs(State& state, DeviceSpan sequence_lengths, const std::string& attention_mask_name) { + // Check for Qwen2-VL model type which requires 3D position IDs + if (ModelType::IsQwen2VL(state.model_.config_->model.type)) { + return std::make_unique(state.model_, state, sequence_lengths); + } + if (state.model_.config_->model.decoder.sliding_window.has_value() && state.model_.config_->model.decoder.sliding_window->slide_inputs) { return std::make_unique(state); } else { diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h index 5095bf40fc..644f0f3a63 100644 --- a/src/models/position_inputs.h +++ b/src/models/position_inputs.h @@ -109,6 +109,55 @@ struct WindowedPositionInputs : PositionInputs { size_t window_index_{}; }; +// Qwen2-VL uses 3D rotary position embeddings for multimodal (vision + text) content. +// Position IDs have shape [4, batch_size, seq_len] where: +// - Dimension 0: Text-only positions +// - Dimensions 1-3: Vision positions (temporal, height, width) +// This class manages rope_deltas caching to maintain correct positional encoding across generation steps. +struct Qwen2VLPositionInputs : PositionInputs { + Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan sequence_lengths_unk); + Qwen2VLPositionInputs(const Qwen2VLPositionInputs&) = delete; + Qwen2VLPositionInputs& operator=(const Qwen2VLPositionInputs&) = delete; + + void Add() override; + void Update(DeviceSpan next_tokens, int total_length, int new_length) override; + void RewindTo(size_t index) override; + + private: + void AddPositionIDs(); + void AddAttentionMask(); + + template + void CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape); + void Update3DPositionIDs(int total_length, int new_length); + + template + void CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape); + void UpdateAttentionMask(int total_length, int new_length); + + const Model& model_; + State& state_; + + size_t mask_input_index_{~0U}; + size_t posid_input_index_{~0U}; + + ONNXTensorElementDataType type_; // Common type for position_ids and attention_mask + + bool has_mask_input_{false}; + bool has_posid_input_{false}; + + std::array position_ids_shape_{}; // {4, batch_size, sequence_length} for 3D positions + std::unique_ptr position_ids_; + std::unique_ptr position_ids_next_; // Replaces position_ids_ after the first Run() call + + std::array attention_mask_shape_{}; // {batch_size, sequence_length} + std::unique_ptr attention_mask_; + std::unique_ptr attention_mask_next_; // Replaces attention_mask_ after each run + + std::unique_ptr rope_deltas_; // Cached rope deltas for position calculation + bool is_first_update_{true}; +}; + std::unique_ptr CreatePositionInputs(State& state, DeviceSpan sequence_lengths, const std::string& attention_mask_name); } // namespace Generators From be5efbb98f8a10e6b255dfc9cd7aaca1fa2fcdeb Mon Sep 17 00:00:00 2001 From: apsonawane Date: Thu, 13 Nov 2025 23:46:01 +0000 Subject: [PATCH 2/7] Add text only support --- src/config.cpp | 2 + src/config.h | 2 + src/models/model.cpp | 4 +- src/models/model.h | 1 + src/models/model_type.h | 4 +- src/models/position_inputs.cpp | 78 +++++++----- src/models/qwen_image_processor.cpp | 189 ++++++++++++++++++++++++++++ src/models/qwen_image_processor.h | 20 +++ 8 files changed, 263 insertions(+), 37 deletions(-) create mode 100644 src/models/qwen_image_processor.cpp create mode 100644 src/models/qwen_image_processor.h diff --git a/src/config.cpp b/src/config.cpp index e17e9c21f2..9a9d48ab9b 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -615,6 +615,8 @@ struct VisionInputs_Element : JSON::Element { v_.pixel_values = JSON::Get(value); } else if (name == "image_sizes") { v_.image_sizes = JSON::Get(value); + } else if (name == "image_grid_thw") { + v_.image_grid_thw = JSON::Get(value); } else if (name == "attention_mask") { v_.attention_mask = JSON::Get(value); } else { diff --git a/src/config.h b/src/config.h index 507d7c80c1..76b3d4c241 100644 --- a/src/config.h +++ b/src/config.h @@ -38,6 +38,7 @@ struct Config { // Vision encoder names static constexpr std::string_view PixelValuesName = "pixel_values"; static constexpr std::string_view ImageSizesName = "image_sizes"; + static constexpr std::string_view ImageGridThwName = "image_grid_thw"; static constexpr std::string_view ImageAttentionMaskName = "image_attention_mask"; static constexpr std::string_view ImageFeaturesName = "image_features"; static constexpr std::string_view NumImageTokens = "num_image_tokens"; @@ -162,6 +163,7 @@ struct Config { struct Inputs { std::string pixel_values{Defaults::PixelValuesName}; std::string image_sizes{Defaults::ImageSizesName}; + std::string image_grid_thw{Defaults::ImageGridThwName}; std::string attention_mask{Defaults::ImageAttentionMaskName}; // image attention mask } inputs; diff --git a/src/models/model.cpp b/src/models/model.cpp index 92206aff3b..97e5e7cc5e 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -1286,7 +1286,9 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess {"phi3v", Processor::Create}, {"whisper", Processor::Create}, {"phi4mm", Processor::Create}, - {"gemma3", Processor::Create}} { + {"gemma3", Processor::Create}, + {"qwen2vl", Processor::Create}, + {"qwen2_5_vl", Processor::Create}} { auto processor = processor_factory_.find(config.model.type); if (processor != processor_factory_.end()) { processor_ = processor->second(config, session_info); diff --git a/src/models/model.h b/src/models/model.h index 7faa1000fe..0e50059702 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -9,6 +9,7 @@ #include "whisper_processor.h" #include "phi_multimodal_processor.h" #include "gemma_image_processor.h" +#include "qwen_image_processor.h" #include "adapters.h" #include "extra_outputs.h" diff --git a/src/models/model_type.h b/src/models/model_type.h index 77d8cba665..0094ab2a11 100644 --- a/src/models/model_type.h +++ b/src/models/model_type.h @@ -18,13 +18,13 @@ struct ModelType { inline static bool IsVLM(const std::string& model_type) { // Vision-language model (VLM) - static constexpr std::array VLM = {"gemma3", "phi3v", "qwen2vl"}; + static constexpr std::array VLM = {"gemma3", "phi3v", "qwen2vl", "qwen2_5_vl"}; return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end(); } inline static bool IsQwen2VL(const std::string& model_type) { // Qwen2-VL specific check for 3D position IDs - return model_type == "qwen2vl"; + return model_type == "qwen2vl" || model_type == "qwen2_5_vl"; } inline static bool IsALM(const std::string& model_type) { diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index daccf9932d..336a51647a 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -485,29 +485,29 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask); has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids); - type_ = Ort::TypeToTensorType; + type_ = Ort::TypeToTensorType; // Default to int64 for Qwen2VL if (has_mask_input_) { type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask); } + + ONNXTensorElementDataType posid_type = type_; if (has_posid_input_) { - if (has_mask_input_) { - if (model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids) != type_) { - throw std::runtime_error("position_ids & attention_mask must have the same data type"); - } - } - // Set up 3D position IDs shape: [4, batch_size, sequence_length] - position_ids_shape_[0] = 4; // 4 dimensions: text + 3D vision (temporal, height, width) + posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids); + + // Set up 3D position IDs shape: [3, batch_size, sequence_length] + // The 3 dimensions represent temporal, height, and width for mrope + position_ids_shape_[0] = 3; position_ids_shape_[1] = state_.params_->search.batch_size; position_ids_shape_[2] = 0; // Will be set during first update - position_ids_ = std::make_unique(); - position_ids_next_ = std::make_unique(); + position_ids_ = std::make_unique(model_.p_device_inputs_, posid_type); + position_ids_next_ = std::make_unique(model_.p_device_inputs_, posid_type); } if (has_mask_input_) { attention_mask_shape_[0] = state_.params_->search.batch_size; attention_mask_shape_[1] = 0; // Will be set during first update - attention_mask_ = std::make_unique(); - attention_mask_next_ = std::make_unique(); + attention_mask_ = std::make_unique(model_.p_device_inputs_, type_); + attention_mask_next_ = std::make_unique(model_.p_device_inputs_, type_); } } @@ -534,9 +534,9 @@ void Qwen2VLPositionInputs::AddAttentionMask() { template void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape) { - // For Qwen2-VL, in the prefill stage, position_ids are [4, batch_size, seq_len] - // During generation, they remain [4, batch_size, 1] - // The 4 dimensions are: [text_positions, temporal_positions, height_positions, width_positions] + // For Qwen2-VL, in the prefill stage, position_ids are [3, batch_size, seq_len] + // During generation, they remain [3, batch_size, 1] + // The 3 dimensions represent: [temporal, height, width] for mrope auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); auto* position_data = position_ids->GetTensorMutableData(); @@ -545,23 +545,33 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan auto* position_data_next = position_ids_next->GetTensorMutableData(); // Initialize position IDs - // For text-only content (no vision), all 4 dimensions have the same position values - // This matches the behavior in transformers where text positions are replicated across dimensions - if (shape[1] == 1) { - // Single batch, simple case - for (int64_t dim = 0; dim < 4; ++dim) { - for (int64_t i = 0; i < shape[2]; ++i) { - position_data[dim * shape[1] * shape[2] + i] = static_cast(i); + // For text-only content (no vision), all 3 dimensions have the same position values + // This matches the PyTorch get_rope_index behavior where text positions are [0,1,2,...] + // replicated across all 3 mrope dimensions + + // Fill position_ids: shape is [3, batch_size, seq_len] + for (int64_t dim = 0; dim < 3; ++dim) { + for (int64_t batch = 0; batch < shape[1]; ++batch) { + for (int64_t pos = 0; pos < shape[2]; ++pos) { + // All 3 dimensions get the same sequential position values for text + position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); } } - // Initialize next tensor with the last position + 1 - for (int64_t dim = 0; dim < 4; ++dim) { - position_data_next[dim * shape[1] + 0] = static_cast(shape[2]); + } + + // Fill position_ids_next for generation: shape is [3, batch_size, 1] + for (int64_t dim = 0; dim < 3; ++dim) { + for (int64_t batch = 0; batch < shape[1]; ++batch) { + // Next position is seq_len (continuing from last position) + position_data_next[dim * shape[1] + batch] = static_cast(shape[2]); } - } else { + } + + // Old multi-batch code removed since we simplified to match PyTorch logic + if (false) { // Multiple batches - initialize with simple ascending values // In practice, vision-specific positions would be computed by the model's get_rope_index logic - for (int64_t dim = 0; dim < 4; ++dim) { + for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < shape[1]; ++batch) { for (int64_t pos = 0; pos < shape[2]; ++pos) { position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); @@ -612,10 +622,10 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length } // Update position values for generation phase - // During generation, we increment all 4 dimensions uniformly for text generation + // During generation, we increment all 3 dimensions uniformly for text generation if (type_ == Ort::TypeToTensorType) { - auto* data = position_ids_->GetTensorMutableData(); - for (int64_t dim = 0; dim < 4; ++dim) { + auto* data = position_ids_->GetMutableData(); + for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = @@ -624,8 +634,8 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length } } } else { - auto* data = position_ids_->GetTensorMutableData(); - for (int64_t dim = 0; dim < 4; ++dim) { + auto* data = position_ids_->GetMutableData(); + for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = @@ -649,10 +659,10 @@ void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) { // Update attention mask - typically all 1s during generation if (type_ == Ort::TypeToTensorType) { - auto* mask_data = attention_mask_->GetTensorMutableData(); + auto* mask_data = attention_mask_->GetMutableData(); std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); } else { - auto* mask_data = attention_mask_->GetTensorMutableData(); + auto* mask_data = attention_mask_->GetMutableData(); std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); } } diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp new file mode 100644 index 0000000000..75c46790b1 --- /dev/null +++ b/src/models/qwen_image_processor.cpp @@ -0,0 +1,189 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "../generators.h" +#include "model.h" + +#include + +namespace Generators { + +namespace { + +std::tuple, std::unique_ptr> +ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt, + OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, Ort::Allocator& allocator) { + constexpr char vision_start_token[] = "<|vision_start|>"; + constexpr char vision_end_token[] = "<|vision_end|>"; + constexpr char image_pad_token[] = "<|image_pad|>"; + + int64_t num_images = 0; + int64_t total_image_tokens = 0; + + if (pixel_values && image_grid_thw) { + const float* pixel_values_data{}; + const int64_t* pixel_values_shape{}; + size_t pixel_values_num_dims; + CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), + &pixel_values_shape, &pixel_values_num_dims)); + + const int64_t* image_grid_thw_data{}; + const int64_t* image_grid_thw_shape{}; + size_t image_grid_thw_num_dims; + CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast(&image_grid_thw_data), + &image_grid_thw_shape, &image_grid_thw_num_dims)); + + num_images = image_grid_thw_shape[0]; + + // Calculate total image tokens based on grid dimensions + // For each image: (temporal * height * width) / (merge_size^2) + constexpr int64_t merge_size = 2; + for (int64_t i = 0; i < num_images; ++i) { + int64_t t = image_grid_thw_data[i * 3 + 0]; + int64_t h = image_grid_thw_data[i * 3 + 1]; + int64_t w = image_grid_thw_data[i * 3 + 2]; + total_image_tokens += (t * h * w) / (merge_size * merge_size); + } + } + + // Generate input_ids with vision tokens + std::string text = prompt; + + // If prompt is empty, add vision markers for each image + if (text.empty()) { + for (int64_t i = 0; i < num_images; ++i) { + text += std::string(vision_start_token) + " " + std::string(vision_end_token); + if (i < num_images - 1) { + text += " "; + } + } + } + + // Count the number of vision_start tokens and make sure it matches the number of images + const std::regex vision_start_regex{std::string(vision_start_token)}; + const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex); + const auto vision_start_end = std::sregex_iterator(); + const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end); + + if (num_images != vision_start_tokens) { + throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) + + " vision_start tokens but received " + std::to_string(num_images) + " images."); + } + + // For Qwen2-VL, we need to replace vision markers with image_pad tokens + // The number of image_pad tokens for each image depends on the image dimensions + if (num_images > 0 && image_grid_thw) { + const int64_t* image_grid_thw_data{}; + const int64_t* image_grid_thw_shape{}; + size_t image_grid_thw_num_dims; + CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast(&image_grid_thw_data), + &image_grid_thw_shape, &image_grid_thw_num_dims)); + + constexpr int64_t merge_size = 2; + std::string modified_text; + size_t last_pos = 0; + size_t image_idx = 0; + + std::smatch match; + std::string temp_text = text; + while (std::regex_search(temp_text, match, vision_start_regex)) { + // Add text before the vision_start token + modified_text += text.substr(last_pos, match.position() - (last_pos - (text.size() - temp_text.size()))); + + // Calculate number of image_pad tokens for this image + int64_t t = image_grid_thw_data[image_idx * 3 + 0]; + int64_t h = image_grid_thw_data[image_idx * 3 + 1]; + int64_t w = image_grid_thw_data[image_idx * 3 + 2]; + int64_t num_pads = (t * h * w) / (merge_size * merge_size); + + // Add vision_start, image_pad tokens, and vision_end + modified_text += vision_start_token; + for (int64_t i = 0; i < num_pads; ++i) { + modified_text += image_pad_token; + } + modified_text += vision_end_token; + + last_pos = match.position() + match.length() + (text.size() - temp_text.size()); + + // Find and skip vision_end token + size_t vision_end_pos = text.find(vision_end_token, last_pos); + if (vision_end_pos != std::string::npos) { + last_pos = vision_end_pos + strlen(vision_end_token); + } + + temp_text = match.suffix(); + image_idx++; + } + modified_text += text.substr(last_pos); + text = modified_text; + } + + const std::vector input_ids = tokenizer.Encode(text.c_str()); + + std::unique_ptr input_ids_value = OrtValue::CreateTensor( + allocator, std::vector{1, static_cast(input_ids.size())}); + std::copy(input_ids.begin(), input_ids.end(), input_ids_value->GetTensorMutableData()); + + std::unique_ptr num_img_tokens = OrtValue::CreateTensor( + allocator, std::vector{1}); + num_img_tokens->GetTensorMutableData()[0] = total_image_tokens; + + return {std::move(input_ids_value), std::move(num_img_tokens)}; +} + +} // namespace + +QwenImageProcessor::QwenImageProcessor(Config& config, const SessionInfo& session_info) + : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)} { + const auto processor_config = (config.config_path / fs::path(config.model.vision.config_filename)).string(); + CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str())); + + config.AddMapping(std::string(Config::Defaults::InputIdsName), config.model.embedding.inputs.input_ids); + config.AddMapping(std::string(Config::Defaults::PixelValuesName), config.model.vision.inputs.pixel_values); +} + +std::unique_ptr QwenImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const { + std::string prompt = std::string(payload.prompt); + const Images* images = payload.images; + Ort::Allocator& allocator{Ort::Allocator::GetWithDefaultOptions()}; + auto named_tensors = std::make_unique(); + + if (!images) { + [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, allocator); + named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared(std::move(input_ids))); + return named_tensors; + } + + ort_extensions::OrtxObjectPtr result; + CheckResult(OrtxImagePreProcess(processor_.get(), images->images_.get(), result.ToBeAssigned())); + + OrtxTensor* pixel_values = nullptr; + CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values)); + + OrtxTensor* image_grid_thw = nullptr; + CheckResult(OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw)); + + auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, image_grid_thw, allocator); + named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared(std::move(input_ids))); + + if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(ProcessTensor(pixel_values, allocator))); + } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) { + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(ProcessTensor(pixel_values, allocator))); + } else { + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(ProcessTensor(pixel_values, allocator))); + } + + // Add image_grid_thw tensor + named_tensors->emplace("image_grid_thw", + std::make_shared(ProcessTensor(image_grid_thw, allocator))); + + named_tensors->emplace(std::string(Config::Defaults::NumImageTokens), std::make_shared(std::move(num_img_tokens))); + + return named_tensors; +} + +} // namespace Generators diff --git a/src/models/qwen_image_processor.h b/src/models/qwen_image_processor.h new file mode 100644 index 0000000000..a116a2c67c --- /dev/null +++ b/src/models/qwen_image_processor.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#include "processor.h" + +namespace Generators { + +struct QwenImageProcessor : Processor { + QwenImageProcessor(Config& config, const SessionInfo& session_info); + + virtual std::unique_ptr Process(const Tokenizer& tokenizer, const Payload& payload) const override; + + private: + ort_extensions::OrtxObjectPtr processor_; + + ONNXTensorElementDataType pixel_values_type_; +}; + +} // namespace Generators From 8aceabd3cec0172ec1c343c9d8ce46246f17c6af Mon Sep 17 00:00:00 2001 From: apsonawane Date: Sat, 15 Nov 2025 01:40:50 +0000 Subject: [PATCH 3/7] Running pipeline --- cmake/deps.txt | 2 +- src/models/position_inputs.cpp | 43 ++++--- src/models/qwen_image_processor.cpp | 171 ++++++++++++++++++++++++---- 3 files changed, 168 insertions(+), 48 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 8e4f97ebb8..fc0530b187 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;97083215f9c84189ad2484d5c933cc06086e9073 +onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;301b442d8f903daba129e825cd446755b840abb0 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE) llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 336a51647a..0787e34691 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -613,18 +613,13 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan } void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) { - // After first update, we use the cached position_ids_next tensor - if (position_ids_next_ && position_ids_shape_[1] > 1 && position_ids_shape_[2] == 1) { - position_ids_ = std::move(position_ids_next_); - position_ids_next_ = nullptr; - } else { - position_ids_->CreateTensor(position_ids_shape_, state_.params_->use_graph_capture && position_ids_shape_[2] == 1); - } + // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs) + auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_); // Update position values for generation phase // During generation, we increment all 3 dimensions uniformly for text generation if (type_ == Ort::TypeToTensorType) { - auto* data = position_ids_->GetMutableData(); + auto* data = position_ids->GetTensorMutableData(); for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { @@ -634,7 +629,7 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length } } } else { - auto* data = position_ids_->GetMutableData(); + auto* data = position_ids->GetTensorMutableData(); for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { @@ -645,27 +640,26 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length } } + // Move to GPU if needed + position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1); state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); } void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) { - if (attention_mask_next_ && attention_mask_shape_[1] == total_length - 1) { - attention_mask_ = std::move(attention_mask_next_); - attention_mask_next_ = nullptr; + // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs) + auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, attention_mask_shape_, type_); + + // Update attention mask - typically all 1s during generation + if (type_ == Ort::TypeToTensorType) { + auto* mask_data = attention_mask->GetTensorMutableData(); + std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); } else { - attention_mask_->CreateTensor(attention_mask_shape_, state_.params_->use_graph_capture && attention_mask_shape_[1] == 1); + auto* mask_data = attention_mask->GetTensorMutableData(); + std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); } - if (!state_.params_->use_graph_capture || attention_mask_shape_[1] != 1) { - // Update attention mask - typically all 1s during generation - if (type_ == Ort::TypeToTensorType) { - auto* mask_data = attention_mask_->GetMutableData(); - std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); - } else { - auto* mask_data = attention_mask_->GetMutableData(); - std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); - } - } + // Move to GPU if needed + attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, 1); state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor(); } @@ -679,6 +673,7 @@ void Qwen2VLPositionInputs::Update(DeviceSpan next_tokens, int total_le else CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); } else { + position_ids_shape_[2] = new_length; // Update shape before Update3DPositionIDs Update3DPositionIDs(total_length, new_length); } } @@ -691,7 +686,9 @@ void Qwen2VLPositionInputs::Update(DeviceSpan next_tokens, int total_le else CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); } else { + // UpdateAttentionMask checks old shape, then we update it UpdateAttentionMask(total_length, new_length); + attention_mask_shape_[1] = total_length; // Update to current total length } } diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp index 75c46790b1..2831f9cbc1 100644 --- a/src/models/qwen_image_processor.cpp +++ b/src/models/qwen_image_processor.cpp @@ -12,28 +12,35 @@ namespace { std::tuple, std::unique_ptr> ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt, - OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, Ort::Allocator& allocator) { + OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, + const int64_t* computed_grid_data, int64_t computed_grid_num_images, + Ort::Allocator& allocator) { constexpr char vision_start_token[] = "<|vision_start|>"; constexpr char vision_end_token[] = "<|vision_end|>"; constexpr char image_pad_token[] = "<|image_pad|>"; int64_t num_images = 0; int64_t total_image_tokens = 0; + const int64_t* image_grid_thw_data = nullptr; - if (pixel_values && image_grid_thw) { + if (pixel_values) { const float* pixel_values_data{}; const int64_t* pixel_values_shape{}; size_t pixel_values_num_dims; CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), &pixel_values_shape, &pixel_values_num_dims)); - const int64_t* image_grid_thw_data{}; - const int64_t* image_grid_thw_shape{}; - size_t image_grid_thw_num_dims; - CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast(&image_grid_thw_data), - &image_grid_thw_shape, &image_grid_thw_num_dims)); - - num_images = image_grid_thw_shape[0]; + // Get image_grid_thw data from either processor output or computed value + if (image_grid_thw) { + const int64_t* image_grid_thw_shape{}; + size_t image_grid_thw_num_dims; + CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast(&image_grid_thw_data), + &image_grid_thw_shape, &image_grid_thw_num_dims)); + num_images = image_grid_thw_shape[0]; + } else if (computed_grid_data) { + image_grid_thw_data = computed_grid_data; + num_images = computed_grid_num_images; + } // Calculate total image tokens based on grid dimensions // For each image: (temporal * height * width) / (merge_size^2) @@ -60,7 +67,8 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr } // Count the number of vision_start tokens and make sure it matches the number of images - const std::regex vision_start_regex{std::string(vision_start_token)}; + // Need to escape special regex characters in the token + const std::regex vision_start_regex{R"(<\|vision_start\|>)"}; const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex); const auto vision_start_end = std::sregex_iterator(); const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end); @@ -72,13 +80,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr // For Qwen2-VL, we need to replace vision markers with image_pad tokens // The number of image_pad tokens for each image depends on the image dimensions - if (num_images > 0 && image_grid_thw) { - const int64_t* image_grid_thw_data{}; - const int64_t* image_grid_thw_shape{}; - size_t image_grid_thw_num_dims; - CheckResult(OrtxGetTensorData(image_grid_thw, reinterpret_cast(&image_grid_thw_data), - &image_grid_thw_shape, &image_grid_thw_num_dims)); - + if (num_images > 0 && image_grid_thw_data) { constexpr int64_t merge_size = 2; std::string modified_text; size_t last_pos = 0; @@ -149,7 +151,7 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token auto named_tensors = std::make_unique(); if (!images) { - [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, allocator); + [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator); named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared(std::move(input_ids))); return named_tensors; } @@ -161,12 +163,128 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values)); OrtxTensor* image_grid_thw = nullptr; - CheckResult(OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw)); + // Try to get image_grid_thw from processor (second output) + auto status = OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw); + + // Get pixel_values data and shape + const float* pixel_values_data{}; + const int64_t* pixel_values_shape{}; + size_t pixel_values_num_dims; + CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), + &pixel_values_shape, &pixel_values_num_dims)); + + std::cerr << "DEBUG: pixel_values_num_dims=" << pixel_values_num_dims << " shape=["; + for (size_t i = 0; i < pixel_values_num_dims; ++i) { + if (i > 0) std::cerr << ", "; + std::cerr << pixel_values_shape[i]; + } + std::cerr << "]" << std::endl; + + // If processor doesn't provide image_grid_thw or patched pixel_values, compute them + std::unique_ptr computed_image_grid_thw; + std::unique_ptr patched_pixel_values; + const int64_t* computed_grid_data = nullptr; + int64_t computed_grid_num_images = 0; + + // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format) + if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) { + constexpr int64_t patch_size = 14; + constexpr int64_t temporal_patch_size = 2; + + int64_t height = pixel_values_shape[1]; // HWC: [batch, height, width, channels] + int64_t width = pixel_values_shape[2]; + int64_t channels = pixel_values_shape[3]; + + int64_t height_patches = height / patch_size; + int64_t width_patches = width / patch_size; + int64_t total_patches = height_patches * width_patches; + int64_t patch_dim = channels * temporal_patch_size * patch_size * patch_size; // 3*2*14*14 = 1176 + + // Create patched pixel_values: [total_patches, patch_dim] + patched_pixel_values = OrtValue::CreateTensor( + allocator, std::vector{total_patches, patch_dim}); + auto* patched_data = patched_pixel_values->GetTensorMutableData(); + + // Extract patches from single image in HWC format + // Each spatial patch is replicated temporal_patch_size times + int64_t patch_idx = 0; + for (int64_t ph = 0; ph < height_patches; ++ph) { + for (int64_t pw = 0; pw < width_patches; ++pw) { + int64_t h_start = ph * patch_size; + int64_t w_start = pw * patch_size; + + int64_t write_idx = patch_idx * patch_dim; + + // Repeat the same spatial patch temporal_patch_size times + // Output: [temporal, channels, patch_h, patch_w] + for (int64_t t = 0; t < temporal_patch_size; ++t) { + for (int64_t c = 0; c < channels; ++c) { + for (int64_t h = 0; h < patch_size; ++h) { + for (int64_t w = 0; w < patch_size; ++w) { + // HWC format: pixel_values[height][width][channels] + int64_t src_idx = (h_start + h) * width * channels + (w_start + w) * channels + c; + patched_data[write_idx++] = pixel_values_data[src_idx]; + } + } + } + } + patch_idx++; + } + } + + // Create image_grid_thw: [1, 3] for single image + if (status != kOrtxOK || !image_grid_thw) { + computed_image_grid_thw = OrtValue::CreateTensor( + allocator, std::vector{1, 3}); + auto* grid_data = computed_image_grid_thw->GetTensorMutableData(); + + // For a single image: T=1 (one frame), H=height_patches, W=width_patches + // The temporal_patch_size is embedded in the patch dimension (1176 = 3*2*14*14) + grid_data[0] = 1; // Single temporal frame for images + grid_data[1] = height_patches; + grid_data[2] = width_patches; + + computed_grid_data = grid_data; + computed_grid_num_images = 1; + } + } - auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, image_grid_thw, allocator); + auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, + image_grid_thw, computed_grid_data, computed_grid_num_images, allocator); named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared(std::move(input_ids))); - if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + // Use patched pixel_values if we computed it, otherwise use processor output + if (patched_pixel_values) { + // Convert to the correct type if needed + if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) { + // Convert float to bfloat16 + auto shape_vec = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetShape(); + auto bf16_tensor = OrtValue::CreateTensor(allocator, shape_vec); + const float* src = patched_pixel_values->GetTensorData(); + auto* dst = static_cast(bf16_tensor->GetTensorMutableData()); + size_t count = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetElementCount(); + for (size_t i = 0; i < count; ++i) { + dst[i] = Float32ToBFloat16(src[i]); + } + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(std::move(bf16_tensor))); + } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { + // Convert float to float16 + auto shape_vec = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetShape(); + auto fp16_tensor = OrtValue::CreateTensor(allocator, shape_vec); + const float* src = patched_pixel_values->GetTensorData(); + auto* dst = static_cast(fp16_tensor->GetTensorMutableData()); + size_t count = patched_pixel_values->GetTensorTypeAndShapeInfo()->GetElementCount(); + for (size_t i = 0; i < count; ++i) { + dst[i] = FastFloat32ToFloat16(src[i]); + } + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(std::move(fp16_tensor))); + } else { + named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), + std::make_shared(std::move(patched_pixel_values))); + } + } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { named_tensors->emplace(std::string(Config::Defaults::PixelValuesName), std::make_shared(ProcessTensor(pixel_values, allocator))); } else if (pixel_values_type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) { @@ -177,9 +295,14 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token std::make_shared(ProcessTensor(pixel_values, allocator))); } - // Add image_grid_thw tensor - named_tensors->emplace("image_grid_thw", - std::make_shared(ProcessTensor(image_grid_thw, allocator))); + // Add image_grid_thw tensor (either from processor or computed) + if (image_grid_thw) { + named_tensors->emplace("image_grid_thw", + std::make_shared(ProcessTensor(image_grid_thw, allocator))); + } else if (computed_image_grid_thw) { + named_tensors->emplace("image_grid_thw", + std::make_shared(std::move(computed_image_grid_thw))); + } named_tensors->emplace(std::string(Config::Defaults::NumImageTokens), std::make_shared(std::move(num_img_tokens))); From ecc42044317f9473295d45ec1b1006dbbcdee40a Mon Sep 17 00:00:00 2001 From: apsonawane Date: Sat, 15 Nov 2025 01:48:55 +0000 Subject: [PATCH 4/7] Remove debug prints --- src/models/qwen_image_processor.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp index 2831f9cbc1..3df26425c8 100644 --- a/src/models/qwen_image_processor.cpp +++ b/src/models/qwen_image_processor.cpp @@ -173,13 +173,6 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), &pixel_values_shape, &pixel_values_num_dims)); - std::cerr << "DEBUG: pixel_values_num_dims=" << pixel_values_num_dims << " shape=["; - for (size_t i = 0; i < pixel_values_num_dims; ++i) { - if (i > 0) std::cerr << ", "; - std::cerr << pixel_values_shape[i]; - } - std::cerr << "]" << std::endl; - // If processor doesn't provide image_grid_thw or patched pixel_values, compute them std::unique_ptr computed_image_grid_thw; std::unique_ptr patched_pixel_values; From df1d9630e2f77813b10e1e97f726a6a385f2a84b Mon Sep 17 00:00:00 2001 From: apsonawane Date: Sat, 15 Nov 2025 02:24:28 +0000 Subject: [PATCH 5/7] Cleanup --- src/models/position_inputs.cpp | 128 ++++++++-------------------- src/models/position_inputs.h | 9 +- src/models/qwen_image_processor.cpp | 35 ++++---- 3 files changed, 55 insertions(+), 117 deletions(-) diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 0787e34691..585ff1cc7f 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -5,6 +5,15 @@ namespace Generators { +// Helper to dispatch type-specific tensor operations +template +void DispatchOnType(ONNXTensorElementDataType type, Func&& func) { + if (type == Ort::TypeToTensorType) + func.template operator()(); + else + func.template operator()(); +} + DefaultPositionInputs::DefaultPositionInputs(const Model& model, State& state, DeviceSpan sequence_lengths_unk, const std::string& attention_mask_name) : model_{model}, state_{state}, @@ -501,13 +510,11 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D position_ids_shape_[2] = 0; // Will be set during first update position_ids_ = std::make_unique(model_.p_device_inputs_, posid_type); - position_ids_next_ = std::make_unique(model_.p_device_inputs_, posid_type); } if (has_mask_input_) { attention_mask_shape_[0] = state_.params_->search.batch_size; attention_mask_shape_[1] = 0; // Will be set during first update attention_mask_ = std::make_unique(model_.p_device_inputs_, type_); - attention_mask_next_ = std::make_unique(model_.p_device_inputs_, type_); } } @@ -534,161 +541,94 @@ void Qwen2VLPositionInputs::AddAttentionMask() { template void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape) { - // For Qwen2-VL, in the prefill stage, position_ids are [3, batch_size, seq_len] - // During generation, they remain [3, batch_size, 1] + // For Qwen2-VL, position_ids are [3, batch_size, seq_len] // The 3 dimensions represent: [temporal, height, width] for mrope + // For text-only content, all 3 dimensions have the same position values [0,1,2,...] auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); auto* position_data = position_ids->GetTensorMutableData(); - - auto position_ids_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array{shape[0], shape[1], 1}, type_); - auto* position_data_next = position_ids_next->GetTensorMutableData(); - // Initialize position IDs - // For text-only content (no vision), all 3 dimensions have the same position values - // This matches the PyTorch get_rope_index behavior where text positions are [0,1,2,...] - // replicated across all 3 mrope dimensions - // Fill position_ids: shape is [3, batch_size, seq_len] for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < shape[1]; ++batch) { for (int64_t pos = 0; pos < shape[2]; ++pos) { - // All 3 dimensions get the same sequential position values for text position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); } } } - - // Fill position_ids_next for generation: shape is [3, batch_size, 1] - for (int64_t dim = 0; dim < 3; ++dim) { - for (int64_t batch = 0; batch < shape[1]; ++batch) { - // Next position is seq_len (continuing from last position) - position_data_next[dim * shape[1] + batch] = static_cast(shape[2]); - } - } - - // Old multi-batch code removed since we simplified to match PyTorch logic - if (false) { - // Multiple batches - initialize with simple ascending values - // In practice, vision-specific positions would be computed by the model's get_rope_index logic - for (int64_t dim = 0; dim < 3; ++dim) { - for (int64_t batch = 0; batch < shape[1]; ++batch) { - for (int64_t pos = 0; pos < shape[2]; ++pos) { - position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); - } - position_data_next[dim * shape[1] + batch] = static_cast(shape[2]); - } - } - } - // Move tensors to appropriate device and expand by num_beams + // Move tensor to GPU and expand by num_beams position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams); - position_ids_next_->ort_tensor_ = model_.ExpandInputs(position_ids_next, state_.params_->search.num_beams); - if (state_.params_->use_graph_capture) - position_ids_next_->MakeStatic(); position_ids_shape_[1] *= state_.params_->search.num_beams; state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); } template void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape) { - // Standard 2D attention mask initialization auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); auto* mask_data = attention_mask->GetTensorMutableData(); - - auto attention_mask_next = OrtValue::CreateTensor(model_.allocator_cpu_, std::array{shape[0], shape[1] + 1}, type_); - auto* mask_data_next = attention_mask_next->GetTensorMutableData(); - // Set mask to 1 for all positions (assuming no padding in first iteration) + // Set mask to 1 for all positions (no padding) std::fill_n(mask_data, shape[0] * shape[1], static_cast(1)); - std::fill_n(mask_data_next, shape[0] * (shape[1] + 1), static_cast(1)); - // Move tensors to device and expand by num_beams + // Move tensor to GPU and expand by num_beams attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams); - attention_mask_next_->ort_tensor_ = model_.ExpandInputs(attention_mask_next, state_.params_->search.num_beams); - if (state_.params_->use_graph_capture) - attention_mask_next_->MakeStatic(); attention_mask_shape_[0] *= state_.params_->search.num_beams; state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor(); } -void Qwen2VLPositionInputs::Update3DPositionIDs(int total_length, int new_length) { - // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs) +void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) { auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_); - // Update position values for generation phase - // During generation, we increment all 3 dimensions uniformly for text generation - if (type_ == Ort::TypeToTensorType) { - auto* data = position_ids->GetTensorMutableData(); + DispatchOnType(type_, [&]() { + auto* data = position_ids->GetTensorMutableData(); for (int64_t dim = 0; dim < 3; ++dim) { for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = - static_cast(total_length - new_length + pos); + static_cast(base_pos + pos); } } } - } else { - auto* data = position_ids->GetTensorMutableData(); - for (int64_t dim = 0; dim < 3; ++dim) { - for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { - for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { - data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = - static_cast(total_length - new_length + pos); - } - } - } - } + }); - // Move to GPU if needed position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1); state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); } -void Qwen2VLPositionInputs::UpdateAttentionMask(int total_length, int new_length) { - // Create tensor on CPU (like in CreateAndInitialize3DPositionIDs) +void Qwen2VLPositionInputs::UpdateAttentionMask() { auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, attention_mask_shape_, type_); - // Update attention mask - typically all 1s during generation - if (type_ == Ort::TypeToTensorType) { - auto* mask_data = attention_mask->GetTensorMutableData(); - std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); - } else { - auto* mask_data = attention_mask->GetTensorMutableData(); - std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); - } + DispatchOnType(type_, [&]() { + auto* mask_data = attention_mask->GetTensorMutableData(); + std::fill_n(mask_data, attention_mask_shape_[0] * attention_mask_shape_[1], static_cast(1)); + }); - // Move to GPU if needed attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, 1); - state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor(); } void Qwen2VLPositionInputs::Update(DeviceSpan next_tokens, int total_length, int new_length) { if (has_posid_input_) { + position_ids_shape_[2] = new_length; if (is_first_update_) { - position_ids_shape_[2] = new_length; - if (type_ == Ort::TypeToTensorType) - CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); - else - CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); + DispatchOnType(type_, [&]() { + CreateAndInitialize3DPositionIDs(next_tokens, position_ids_shape_); + }); } else { - position_ids_shape_[2] = new_length; // Update shape before Update3DPositionIDs - Update3DPositionIDs(total_length, new_length); + Update3DPositionIDs(total_length - new_length); } } if (has_mask_input_) { if (is_first_update_) { attention_mask_shape_[1] = new_length; - if (type_ == Ort::TypeToTensorType) - CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); - else - CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); + DispatchOnType(type_, [&]() { + CreateAndInitializeAttentionMask(next_tokens, attention_mask_shape_); + }); } else { - // UpdateAttentionMask checks old shape, then we update it - UpdateAttentionMask(total_length, new_length); - attention_mask_shape_[1] = total_length; // Update to current total length + attention_mask_shape_[1] = total_length; + UpdateAttentionMask(); } } diff --git a/src/models/position_inputs.h b/src/models/position_inputs.h index 644f0f3a63..867196e008 100644 --- a/src/models/position_inputs.h +++ b/src/models/position_inputs.h @@ -129,11 +129,11 @@ struct Qwen2VLPositionInputs : PositionInputs { template void CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape); - void Update3DPositionIDs(int total_length, int new_length); + void Update3DPositionIDs(int base_pos); template void CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape); - void UpdateAttentionMask(int total_length, int new_length); + void UpdateAttentionMask(); const Model& model_; State& state_; @@ -146,15 +146,12 @@ struct Qwen2VLPositionInputs : PositionInputs { bool has_mask_input_{false}; bool has_posid_input_{false}; - std::array position_ids_shape_{}; // {4, batch_size, sequence_length} for 3D positions + std::array position_ids_shape_{}; // {3, batch_size, sequence_length} for 3D positions std::unique_ptr position_ids_; - std::unique_ptr position_ids_next_; // Replaces position_ids_ after the first Run() call std::array attention_mask_shape_{}; // {batch_size, sequence_length} std::unique_ptr attention_mask_; - std::unique_ptr attention_mask_next_; // Replaces attention_mask_ after each run - std::unique_ptr rope_deltas_; // Cached rope deltas for position calculation bool is_first_update_{true}; }; diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp index 3df26425c8..198d087488 100644 --- a/src/models/qwen_image_processor.cpp +++ b/src/models/qwen_image_processor.cpp @@ -10,6 +10,8 @@ namespace Generators { namespace { +constexpr int64_t kMergeSize = 2; // Qwen2-VL merge size for vision tokens + std::tuple, std::unique_ptr> ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt, OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, @@ -44,12 +46,11 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr // Calculate total image tokens based on grid dimensions // For each image: (temporal * height * width) / (merge_size^2) - constexpr int64_t merge_size = 2; for (int64_t i = 0; i < num_images; ++i) { int64_t t = image_grid_thw_data[i * 3 + 0]; int64_t h = image_grid_thw_data[i * 3 + 1]; int64_t w = image_grid_thw_data[i * 3 + 2]; - total_image_tokens += (t * h * w) / (merge_size * merge_size); + total_image_tokens += (t * h * w) / (kMergeSize * kMergeSize); } } @@ -81,7 +82,6 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr // For Qwen2-VL, we need to replace vision markers with image_pad tokens // The number of image_pad tokens for each image depends on the image dimensions if (num_images > 0 && image_grid_thw_data) { - constexpr int64_t merge_size = 2; std::string modified_text; size_t last_pos = 0; size_t image_idx = 0; @@ -96,7 +96,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr int64_t t = image_grid_thw_data[image_idx * 3 + 0]; int64_t h = image_grid_thw_data[image_idx * 3 + 1]; int64_t w = image_grid_thw_data[image_idx * 3 + 2]; - int64_t num_pads = (t * h * w) / (merge_size * merge_size); + int64_t num_pads = (t * h * w) / (kMergeSize * kMergeSize); // Add vision_start, image_pad tokens, and vision_end modified_text += vision_start_token; @@ -181,17 +181,18 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format) if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) { - constexpr int64_t patch_size = 14; - constexpr int64_t temporal_patch_size = 2; + constexpr int64_t kPatchSize = 14; + constexpr int64_t kTemporalPatchSize = 2; + constexpr int64_t kChannels = 3; int64_t height = pixel_values_shape[1]; // HWC: [batch, height, width, channels] int64_t width = pixel_values_shape[2]; int64_t channels = pixel_values_shape[3]; - int64_t height_patches = height / patch_size; - int64_t width_patches = width / patch_size; + int64_t height_patches = height / kPatchSize; + int64_t width_patches = width / kPatchSize; int64_t total_patches = height_patches * width_patches; - int64_t patch_dim = channels * temporal_patch_size * patch_size * patch_size; // 3*2*14*14 = 1176 + int64_t patch_dim = channels * kTemporalPatchSize * kPatchSize * kPatchSize; // Create patched pixel_values: [total_patches, patch_dim] patched_pixel_values = OrtValue::CreateTensor( @@ -199,21 +200,21 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token auto* patched_data = patched_pixel_values->GetTensorMutableData(); // Extract patches from single image in HWC format - // Each spatial patch is replicated temporal_patch_size times + // Each spatial patch is replicated kTemporalPatchSize times int64_t patch_idx = 0; for (int64_t ph = 0; ph < height_patches; ++ph) { for (int64_t pw = 0; pw < width_patches; ++pw) { - int64_t h_start = ph * patch_size; - int64_t w_start = pw * patch_size; + int64_t h_start = ph * kPatchSize; + int64_t w_start = pw * kPatchSize; int64_t write_idx = patch_idx * patch_dim; - // Repeat the same spatial patch temporal_patch_size times + // Repeat the same spatial patch kTemporalPatchSize times // Output: [temporal, channels, patch_h, patch_w] - for (int64_t t = 0; t < temporal_patch_size; ++t) { + for (int64_t t = 0; t < kTemporalPatchSize; ++t) { for (int64_t c = 0; c < channels; ++c) { - for (int64_t h = 0; h < patch_size; ++h) { - for (int64_t w = 0; w < patch_size; ++w) { + for (int64_t h = 0; h < kPatchSize; ++h) { + for (int64_t w = 0; w < kPatchSize; ++w) { // HWC format: pixel_values[height][width][channels] int64_t src_idx = (h_start + h) * width * channels + (w_start + w) * channels + c; patched_data[write_idx++] = pixel_values_data[src_idx]; @@ -232,7 +233,7 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token auto* grid_data = computed_image_grid_thw->GetTensorMutableData(); // For a single image: T=1 (one frame), H=height_patches, W=width_patches - // The temporal_patch_size is embedded in the patch dimension (1176 = 3*2*14*14) + // The kTemporalPatchSize is embedded in the patch dimension grid_data[0] = 1; // Single temporal frame for images grid_data[1] = height_patches; grid_data[2] = width_patches; From fa991b59a32808c8dba68b9109568b2ea1758311 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 25 Nov 2025 01:16:41 +0000 Subject: [PATCH 6/7] update position ids for mrope --- src/config.cpp | 10 ++ src/config.h | 10 ++ src/models/model.h | 2 +- src/models/multi_modal.cpp | 35 +++- src/models/multi_modal.h | 7 +- src/models/position_inputs.cpp | 269 +++++++++++++++++++++++++--- src/models/position_inputs.h | 41 +++-- src/models/qwen_image_processor.cpp | 16 +- src/models/qwen_image_processor.h | 3 +- 9 files changed, 340 insertions(+), 53 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 9a9d48ab9b..3f46b990f8 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -653,6 +653,10 @@ struct Vision_Element : JSON::Element { v_.config_filename = JSON::Get(value); } else if (name == "adapter_filename") { v_.adapter_filename = JSON::Get(value); + } else if (name == "spatial_merge_size") { + v_.spatial_merge_size = static_cast(JSON::Get(value)); + } else if (name == "tokens_per_second") { + v_.tokens_per_second = static_cast(JSON::Get(value)); } else { throw JSON::unknown_value_error{}; } @@ -858,6 +862,12 @@ struct Model_Element : JSON::Element { v_.decoder_start_token_id = static_cast(JSON::Get(value)); } else if (name == "sep_token_id") { v_.sep_token_id = static_cast(JSON::Get(value)); + } else if (name == "image_token_id") { + v_.image_token_id = static_cast(JSON::Get(value)); + } else if (name == "video_token_id") { + v_.video_token_id = static_cast(JSON::Get(value)); + } else if (name == "vision_start_token_id") { + v_.vision_start_token_id = static_cast(JSON::Get(value)); } else { throw JSON::unknown_value_error{}; } diff --git a/src/config.h b/src/config.h index 76b3d4c241..c03a1d1860 100644 --- a/src/config.h +++ b/src/config.h @@ -107,6 +107,12 @@ struct Config { int bos_token_id{}; // The id of the beginning-of-stream token. int sep_token_id{}; // The id of the separation token. int decoder_start_token_id{}; // If an encoder-decoder model starts decoding with a different token than bos, the id of that token. + + // Qwen2-VL specific token IDs + int image_token_id{}; + int video_token_id{}; + int vision_start_token_id{}; + int vocab_size{}; int context_length{}; @@ -160,6 +166,10 @@ struct Config { std::string config_filename{"processor_config.json"}; std::optional adapter_filename{}; + // Qwen2-VL specific vision config values + int spatial_merge_size{2}; + float tokens_per_second{2.0f}; + struct Inputs { std::string pixel_values{Defaults::PixelValuesName}; std::string image_sizes{Defaults::ImageSizesName}; diff --git a/src/models/model.h b/src/models/model.h index 0e50059702..62034bb1a3 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -177,4 +177,4 @@ struct Model : std::enable_shared_from_this, LeakChecked, External std::map> pipeline_session_options_; }; -} // namespace Generators +} // namespace Generators \ No newline at end of file diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp index 48ad8d5fcf..113b70b1e4 100644 --- a/src/models/multi_modal.cpp +++ b/src/models/multi_modal.cpp @@ -3,6 +3,7 @@ #include "../generators.h" #include "multi_modal.h" +#include namespace Generators { @@ -178,10 +179,12 @@ DeviceSpan EmbeddingState::Run(int current_length, DeviceSpan& n return {}; } -DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan sequence_lengths, const GeneratorParams& params) +DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan sequence_lengths, + const GeneratorParams& params) : State{params, model}, model_{model}, - position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} { + position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} +{ inputs_embeds_.Add(); position_inputs_->Add(); logits_.Add(); @@ -207,6 +210,13 @@ void DecoderState::UpdateInputsOutputs(DeviceSpan& next_tokens, int tot inputs_embeds_.UpdateSequenceLength(new_length); } +// Overload for pipeline to call +void DecoderState::UpdateInputsOutputs(DeviceSpan& next_tokens, int total_length, DeviceSpan beam_indices, size_t new_length) { + kv_cache_.Update(beam_indices, total_length); + logits_.Update(next_tokens, new_length); + inputs_embeds_.UpdateSequenceLength(new_length); +} + MultiModalPipelineState::MultiModalPipelineState(const MultiModalLanguageModel& model, DeviceSpan sequence_lengths, const GeneratorParams& params) : State{params, model}, model_{model}, @@ -243,6 +253,25 @@ void MultiModalPipelineState::SetExtraInputs(const std::vector& extr speech_state_->SetExtraInputs(extra_inputs, num_audio_tokens_); } embedding_state_->SetExtraInputs(num_images_, num_image_tokens_, num_audio_tokens_); + + // Set the grid tensors for Qwen2-VL if present + if (auto* qwen_pos_inputs = dynamic_cast(decoder_state_->position_inputs_.get())) { + std::shared_ptr img_grid, vid_grid, sec_grid; + + for (const auto& input : extra_inputs) { + if (input.name == Config::Defaults::ImageGridThwName) { + img_grid = input.tensor; + } else if (input.name == "video_grid_thw") { + vid_grid = input.tensor; + } else if (input.name == "second_per_grid_ts") { + sec_grid = input.tensor; + } + } + + if (img_grid || vid_grid) { + qwen_pos_inputs->SetGridTensors(img_grid, vid_grid, sec_grid); + } + } } DeviceSpan MultiModalPipelineState::Run(int current_length, DeviceSpan& next_tokens, DeviceSpan next_indices) { @@ -357,4 +386,4 @@ OrtValue* MultiModalPipelineState::GetOutput(const char* name) { return State::GetOutput(name); }; -} // namespace Generators +} // namespace Generators \ No newline at end of file diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h index 0cbe4e527b..8f17b004b6 100644 --- a/src/models/multi_modal.h +++ b/src/models/multi_modal.h @@ -18,7 +18,7 @@ struct MultiModalLanguageModel : Model { MultiModalLanguageModel(const MultiModalLanguageModel&) = delete; MultiModalLanguageModel& operator=(const MultiModalLanguageModel&) = delete; - std::unique_ptr CreateState(DeviceSpan sequence_lengths, const GeneratorParams& params) const; + std::unique_ptr CreateState(DeviceSpan sequence_lengths, const GeneratorParams& params) const override; std::unique_ptr vision_session_; // pixel_values, [image_attention_mask], image_sizes -> image_features std::unique_ptr speech_session_; // audio_embeds, audio_sizes, audio_projection_mode -> audio_features @@ -96,11 +96,12 @@ struct DecoderState : State { DecoderState& operator=(const DecoderState&) = delete; DeviceSpan Run(int current_length, DeviceSpan& next_tokens, DeviceSpan next_indices) override; + void UpdateInputsOutputs(DeviceSpan& next_tokens, int current_length, DeviceSpan beam_indices); private: friend struct MultiModalPipelineState; - void UpdateInputsOutputs(DeviceSpan& next_tokens, int current_length, DeviceSpan beam_indices); + void UpdateInputsOutputs(DeviceSpan& next_tokens, int current_length, DeviceSpan beam_indices, size_t new_length); const MultiModalLanguageModel& model_; Embeddings inputs_embeds_{*this, Embeddings::Mode::Input, // Model input @@ -144,4 +145,4 @@ struct MultiModalPipelineState : State { const std::string speech_adapter_name_{"speech"}; }; -} // namespace Generators +} // namespace Generators \ No newline at end of file diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 585ff1cc7f..5ed02b751e 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -2,11 +2,14 @@ #include "model.h" #include "position_inputs.h" #include "model_type.h" +#include +#include +#include // For std::round namespace Generators { // Helper to dispatch type-specific tensor operations -template +template void DispatchOnType(ONNXTensorElementDataType type, Func&& func) { if (type == Ort::TypeToTensorType) func.template operator()(); @@ -490,7 +493,12 @@ void WindowedPositionInputs::Update(DeviceSpan next_tokens, int total_l // Qwen2VLPositionInputs implementation Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, DeviceSpan sequence_lengths_unk) : model_{model}, - state_{state} { + state_{state}, + image_token_id_{model.config_->model.image_token_id}, + video_token_id_{model.config_->model.video_token_id}, + vision_start_token_id_{model.config_->model.vision_start_token_id}, + tokens_per_second_{model.config_->model.vision.tokens_per_second}, + spatial_merge_size_{model.config_->model.vision.spatial_merge_size} { has_mask_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.attention_mask); has_posid_input_ = model_.session_info_.HasInput(model_.config_->model.decoder.inputs.position_ids); @@ -498,11 +506,10 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D if (has_mask_input_) { type_ = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.attention_mask); } - - ONNXTensorElementDataType posid_type = type_; + if (has_posid_input_) { - posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids); - + ONNXTensorElementDataType posid_type = model_.session_info_.GetInputDataType(model_.config_->model.decoder.inputs.position_ids); + // Set up 3D position IDs shape: [3, batch_size, sequence_length] // The 3 dimensions represent temporal, height, and width for mrope position_ids_shape_[0] = 3; @@ -518,6 +525,14 @@ Qwen2VLPositionInputs::Qwen2VLPositionInputs(const Model& model, State& state, D } } +void Qwen2VLPositionInputs::SetGridTensors(const std::shared_ptr& image_grid_thw, + const std::shared_ptr& video_grid_thw, + const std::shared_ptr& second_per_grid_ts) { + image_grid_thw_ = image_grid_thw; + video_grid_thw_ = video_grid_thw; + second_per_grid_ts_ = second_per_grid_ts; +} + void Qwen2VLPositionInputs::Add() { if (has_posid_input_) { AddPositionIDs(); @@ -541,35 +556,221 @@ void Qwen2VLPositionInputs::AddAttentionMask() { template void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape) { - // For Qwen2-VL, position_ids are [3, batch_size, seq_len] - // The 3 dimensions represent: [temporal, height, width] for mrope - // For text-only content, all 3 dimensions have the same position values [0,1,2,...] - + // Replicates the logic from HuggingFace's `get_rope_index` + // `shape` is [3, batch_size, seq_len] (before beam expansion) + // `next_tokens` is [batch_size, seq_len] + int64_t num_dims = shape[0]; // Should be 3 + int64_t batch_size = shape[1]; + int64_t seq_len = shape[2]; + auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); auto* position_data = position_ids->GetTensorMutableData(); - // Fill position_ids: shape is [3, batch_size, seq_len] - for (int64_t dim = 0; dim < 3; ++dim) { - for (int64_t batch = 0; batch < shape[1]; ++batch) { - for (int64_t pos = 0; pos < shape[2]; ++pos) { - position_data[dim * shape[1] * shape[2] + batch * shape[2] + pos] = static_cast(pos); + // Get spans for grid_thw tensors (on CPU) + std::span image_grid_thw_span; + if (image_grid_thw_) { + image_grid_thw_span = std::span(image_grid_thw_->GetData(), image_grid_thw_->GetElementCount()); + } + + std::span video_grid_thw_span; + if (video_grid_thw_) { + video_grid_thw_span = std::span(video_grid_thw_->GetData(), video_grid_thw_->GetElementCount()) ; + } + + std::span second_per_grid_ts_span; + if (second_per_grid_ts_) { + // Qwen 2.5 processor outputs float32 for this + if (second_per_grid_ts_->GetType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) + throw std::runtime_error("second_per_grid_ts must be float32."); + second_per_grid_ts_span = std::span(second_per_grid_ts_->GetData(), second_per_grid_ts_->GetElementCount()); + } + + auto input_ids_span = next_tokens.CpuSpan(); + int image_index = 0; + int video_index = 0; + rope_deltas_.clear(); + + for (int64_t b = 0; b < batch_size; ++b) { + auto input_ids = input_ids_span.subspan(b * seq_len, seq_len); + + int64_t image_nums = 0; + int64_t video_nums = 0; + + // Count images/videos for this batch item by checking the token *after* vision_start_token_id + for (size_t s = 0; s < seq_len - 1; ++s) { + if (input_ids[s] == vision_start_token_id_) { + if (input_ids[s + 1] == image_token_id_) { + image_nums++; + } else if (input_ids[s + 1] == video_token_id_) { + video_nums++; + } + } + } + + int64_t st = 0; + int64_t remain_images = image_nums; + int64_t remain_videos = video_nums; + T st_idx = 0; + T max_pos_for_batch = 0; + + for (int64_t k = 0; k < image_nums + video_nums; ++k) { + int64_t ed_image = seq_len + 1; + int64_t ed_video = seq_len + 1; + + // Find next image_token_id (after a vision_start_token_id) + if (remain_images > 0) { + for (int64_t s = st; s < seq_len - 1; ++s) { + if (input_ids[s] == vision_start_token_id_ && input_ids[s + 1] == image_token_id_) { + ed_image = s + 1; // Point to the image_token_id + break; + } + } + } + // Find next video_token_id (after a vision_start_token_id) + if (remain_videos > 0) { + for (int64_t s = st; s < seq_len - 1; ++s) { + if (input_ids[s] == vision_start_token_id_ && input_ids[s + 1] == video_token_id_) { + ed_video = s + 1; // Point to the video_token_id + break; + } + } + } + + int64_t ed; + int64_t t, h, w; + float second_per_grid_t = 0.0f; + + if (ed_image < ed_video) { + // Process image + if (image_index * 3 + 2 >= image_grid_thw_span.size()) + throw std::runtime_error("Not enough image_grid_thw data for image tokens."); + t = image_grid_thw_span[image_index * 3 + 0]; + h = image_grid_thw_span[image_index * 3 + 1]; + w = image_grid_thw_span[image_index * 3 + 2]; + second_per_grid_t = 0.0f; // Images have 0 time delta + image_index++; + remain_images--; + ed = ed_image; + } else { + // Process video + if (video_index * 3 + 2 >= video_grid_thw_span.size()) + throw std::runtime_error("Not enough video_grid_thw data for video tokens."); + t = video_grid_thw_span[video_index * 3 + 0]; + h = video_grid_thw_span[video_index * 3 + 1]; + w = video_grid_thw_span[video_index * 3 + 2]; + if (second_per_grid_ts_span.empty() || video_index >= second_per_grid_ts_span.size()) { + second_per_grid_t = 1.0f; // Default from Python + } else { + second_per_grid_t = second_per_grid_ts_span[video_index]; + } + video_index++; + remain_videos--; + ed = ed_video; + } + + int64_t llm_grid_t = t; + int64_t llm_grid_h = h / spatial_merge_size_; + int64_t llm_grid_w = w / spatial_merge_size_; + + // 1. Fill Text Part + // Text runs from `st` up to `ed-1` (which is the <|vision_start|> token) + int64_t text_len = ed - st; + st_idx = (k > 0 || b > 0) ? max_pos_for_batch + 1 : 0; + T current_pos = st_idx; + + for (int64_t s = 0; s < text_len; ++s) { + int64_t current_token_idx = st + s; + if (input_ids[current_token_idx] == model_.config_->model.pad_token_id) { + position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + } else { + position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + max_pos_for_batch = current_pos; + current_pos++; // Only increment position for non-pad tokens + } + } + + // 2. Fill Vision Part + st_idx = max_pos_for_batch + 1; + int64_t vision_len = llm_grid_t * llm_grid_h * llm_grid_w; + for (int64_t s = 0; s < vision_len; ++s) { + int64_t gt = s / (llm_grid_h * llm_grid_w); + int64_t gh = (s / llm_grid_w) % llm_grid_h; + int64_t gw = s % llm_grid_w; + + // Round to nearest integer for temporal position + // Note: huggingface code use truncation/floor (time_tensor_long = time_tensor.long() when converting time coordinates. + // This will cause slight deviation from the reference during parity comparsion. + T t_pos = static_cast(std::round(gt * second_per_grid_t * tokens_per_second_)) + st_idx; + T h_pos = static_cast(gh) + st_idx; + T w_pos = static_cast(gw) + st_idx; + + // Vision tokens are guaranteed not to be padding + position_data[0 * batch_size * seq_len + b * seq_len + (ed + s)] = t_pos; + position_data[1 * batch_size * seq_len + b * seq_len + (ed + s)] = h_pos; + position_data[2 * batch_size * seq_len + b * seq_len + (ed + s)] = w_pos; + max_pos_for_batch = std::max({max_pos_for_batch, t_pos, h_pos, w_pos}); + } + st = ed + vision_len; // New start is after the vision tokens + } + + // 3. Fill Remaining Text Part + if (st < seq_len) { + st_idx = (max_pos_for_batch == 0 && st == 0) ? 0 : max_pos_for_batch + 1; + int64_t text_len = seq_len - st; + T current_pos = st_idx; + for (int64_t s = 0; s < text_len; ++s) { + int64_t current_token_idx = st + s; + if (input_ids[current_token_idx] == model_.config_->model.pad_token_id) { + position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = 0; + } else { + position_data[0 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + position_data[1 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + position_data[2 * batch_size * seq_len + b * seq_len + current_token_idx] = current_pos; + max_pos_for_batch = current_pos; + current_pos++; // Only increment position for non-pad tokens + } } } + rope_deltas_.push_back(max_pos_for_batch + 1 - seq_len); } // Move tensor to GPU and expand by num_beams position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, state_.params_->search.num_beams); position_ids_shape_[1] *= state_.params_->search.num_beams; state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); + + // Expand rope_deltas_ + std::vector expanded_deltas; + for (int64_t delta : rope_deltas_) { + for (int b = 0; b < state_.params_->search.num_beams; ++b) { + expanded_deltas.push_back(delta); + } + } + rope_deltas_ = std::move(expanded_deltas); } template void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape) { auto attention_mask = OrtValue::CreateTensor(model_.allocator_cpu_, shape, type_); auto* mask_data = attention_mask->GetTensorMutableData(); - - // Set mask to 1 for all positions (no padding) - std::fill_n(mask_data, shape[0] * shape[1], static_cast(1)); + auto input_ids_span = next_tokens.CpuSpan(); + int64_t batch_size = shape[0]; + int64_t seq_len = shape[1]; + + for (int64_t b = 0; b < batch_size; ++b) { + for (int64_t s = 0; s < seq_len; ++s) { + int64_t current_token_idx = b * seq_len + s; + mask_data[current_token_idx] = (input_ids_span[current_token_idx] == model_.config_->model.pad_token_id) + ? static_cast(0) + : static_cast(1); + } + } // Move tensor to GPU and expand by num_beams attention_mask_->ort_tensor_ = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams); @@ -578,21 +779,34 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan } void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) { + // This is the generation step (decode) + // base_pos is cache_position[0] auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_); + int64_t batch_size = position_ids_shape_[1]; // This is already expanded (batch*beams) + int64_t seq_len = position_ids_shape_[2]; // This will be 1 for generation + + if (rope_deltas_.size() != batch_size) { + throw std::runtime_error("rope_deltas size mismatch with batch_size * num_beams."); + } DispatchOnType(type_, [&]() { auto* data = position_ids->GetTensorMutableData(); for (int64_t dim = 0; dim < 3; ++dim) { - for (int64_t batch = 0; batch < position_ids_shape_[1]; ++batch) { - for (int64_t pos = 0; pos < position_ids_shape_[2]; ++pos) { - data[dim * position_ids_shape_[1] * position_ids_shape_[2] + batch * position_ids_shape_[2] + pos] = - static_cast(base_pos + pos); + for (int64_t b = 0; b < batch_size; ++b) { + for (int64_t s = 0; s < seq_len; ++s) { + // From Python: delta = (cache_position[0] + self.rope_deltas) + // cache_position[0] is `base_pos`. + T delta = static_cast(base_pos + rope_deltas_[b]); + // Python: position_ids = position_ids + delta + // `position_ids` for new token is just [0, 1, ...] + T pos = static_cast(s); + data[dim * batch_size * seq_len + b * seq_len + s] = delta + pos; } } } }); - position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1); + position_ids_->ort_tensor_ = model_.ExpandInputs(position_ids, 1); // No beam expansion needed, already expanded state_.inputs_[posid_input_index_] = position_ids_->GetOrtTensor(); } @@ -631,12 +845,15 @@ void Qwen2VLPositionInputs::Update(DeviceSpan next_tokens, int total_le UpdateAttentionMask(); } } - + is_first_update_ = false; } void Qwen2VLPositionInputs::RewindTo(size_t index) { // For Qwen2-VL, we need to handle rewinding for beam search + // This is a simplified rewind, just updating the shape. + // A full rewind would require re-calculating rope_deltas if we rewound into the prompt. + // For now, we assume rewind only happens during generation. if (has_posid_input_) { position_ids_shape_[2] = static_cast(index); } @@ -650,7 +867,7 @@ std::unique_ptr CreatePositionInputs(State& state, DeviceSpanmodel.type)) { return std::make_unique(state.model_, state, sequence_lengths); } - + if (state.model_.config_->model.decoder.sliding_window.has_value() && state.model_.config_->model.decoder.sliding_window->slide_inputs) { return std::make_unique(state); } else { @@ -658,4 +875,4 @@ std::unique_ptr CreatePositionInputs(State& state, DeviceSpan sequence_lengths_unk); Qwen2VLPositionInputs(const Qwen2VLPositionInputs&) = delete; @@ -123,14 +125,18 @@ struct Qwen2VLPositionInputs : PositionInputs { void Update(DeviceSpan next_tokens, int total_length, int new_length) override; void RewindTo(size_t index) override; + void SetGridTensors(const std::shared_ptr& image_grid_thw, + const std::shared_ptr& video_grid_thw, + const std::shared_ptr& second_per_grid_ts); + private: void AddPositionIDs(); void AddAttentionMask(); - + template void CreateAndInitialize3DPositionIDs(DeviceSpan next_tokens, std::array shape); void Update3DPositionIDs(int base_pos); - + template void CreateAndInitializeAttentionMask(DeviceSpan next_tokens, std::array shape); void UpdateAttentionMask(); @@ -141,20 +147,33 @@ struct Qwen2VLPositionInputs : PositionInputs { size_t mask_input_index_{~0U}; size_t posid_input_index_{~0U}; - ONNXTensorElementDataType type_; // Common type for position_ids and attention_mask + ONNXTensorElementDataType type_; bool has_mask_input_{false}; bool has_posid_input_{false}; std::array position_ids_shape_{}; // {3, batch_size, sequence_length} for 3D positions std::unique_ptr position_ids_; - + std::array attention_mask_shape_{}; // {batch_size, sequence_length} std::unique_ptr attention_mask_; - + bool is_first_update_{true}; + + // Cached data from processor + std::shared_ptr image_grid_thw_; + std::shared_ptr video_grid_thw_; + std::shared_ptr second_per_grid_ts_; + std::vector rope_deltas_; + + // Config values initialized from model.config_ in constructor + const int32_t image_token_id_; + const int32_t video_token_id_; + const int32_t vision_start_token_id_; + const float tokens_per_second_; + const int32_t spatial_merge_size_; }; std::unique_ptr CreatePositionInputs(State& state, DeviceSpan sequence_lengths, const std::string& attention_mask_name); -} // namespace Generators +} // namespace Generators \ No newline at end of file diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp index 198d087488..3cd0190aff 100644 --- a/src/models/qwen_image_processor.cpp +++ b/src/models/qwen_image_processor.cpp @@ -10,13 +10,12 @@ namespace Generators { namespace { -constexpr int64_t kMergeSize = 2; // Qwen2-VL merge size for vision tokens - +// constexpr int64_t kMergeSize = 2; // Qwen2-VL merge size for vision tokens std::tuple, std::unique_ptr> ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt, OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, const int64_t* computed_grid_data, int64_t computed_grid_num_images, - Ort::Allocator& allocator) { + Ort::Allocator& allocator, int64_t spatial_merge_size) { constexpr char vision_start_token[] = "<|vision_start|>"; constexpr char vision_end_token[] = "<|vision_end|>"; constexpr char image_pad_token[] = "<|image_pad|>"; @@ -50,7 +49,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr int64_t t = image_grid_thw_data[i * 3 + 0]; int64_t h = image_grid_thw_data[i * 3 + 1]; int64_t w = image_grid_thw_data[i * 3 + 2]; - total_image_tokens += (t * h * w) / (kMergeSize * kMergeSize); + total_image_tokens += (t * h * w) / (spatial_merge_size * spatial_merge_size); } } @@ -96,7 +95,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr int64_t t = image_grid_thw_data[image_idx * 3 + 0]; int64_t h = image_grid_thw_data[image_idx * 3 + 1]; int64_t w = image_grid_thw_data[image_idx * 3 + 2]; - int64_t num_pads = (t * h * w) / (kMergeSize * kMergeSize); + int64_t num_pads = (t * h * w) / (spatial_merge_size * spatial_merge_size); // Add vision_start, image_pad tokens, and vision_end modified_text += vision_start_token; @@ -136,7 +135,8 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr } // namespace QwenImageProcessor::QwenImageProcessor(Config& config, const SessionInfo& session_info) - : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)} { + : pixel_values_type_{session_info.GetInputDataType(config.model.vision.inputs.pixel_values)}, + spatial_merge_size_{config.model.vision.spatial_merge_size} { const auto processor_config = (config.config_path / fs::path(config.model.vision.config_filename)).string(); CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str())); @@ -151,7 +151,7 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token auto named_tensors = std::make_unique(); if (!images) { - [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator); + [[maybe_unused]] auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, nullptr, nullptr, nullptr, 0, allocator, spatial_merge_size_); named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared(std::move(input_ids))); return named_tensors; } @@ -244,7 +244,7 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token } auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, - image_grid_thw, computed_grid_data, computed_grid_num_images, allocator); + image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_); named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared(std::move(input_ids))); // Use patched pixel_values if we computed it, otherwise use processor output diff --git a/src/models/qwen_image_processor.h b/src/models/qwen_image_processor.h index a116a2c67c..ce1ba26f0b 100644 --- a/src/models/qwen_image_processor.h +++ b/src/models/qwen_image_processor.h @@ -15,6 +15,7 @@ struct QwenImageProcessor : Processor { ort_extensions::OrtxObjectPtr processor_; ONNXTensorElementDataType pixel_values_type_; + int64_t spatial_merge_size_; }; -} // namespace Generators +} // namespace Generators \ No newline at end of file From a75bb8b16801aa0ff83c4e5df6a1ace2753d3b11 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 25 Nov 2025 19:44:37 +0000 Subject: [PATCH 7/7] format --- src/models/multi_modal.cpp | 3 +- src/models/multi_modal.h | 4 +- src/models/position_inputs.cpp | 12 +++--- src/models/qwen_image_processor.cpp | 58 ++++++++++++++--------------- 4 files changed, 38 insertions(+), 39 deletions(-) diff --git a/src/models/multi_modal.cpp b/src/models/multi_modal.cpp index 113b70b1e4..5abb9f4c2a 100644 --- a/src/models/multi_modal.cpp +++ b/src/models/multi_modal.cpp @@ -183,8 +183,7 @@ DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpanmodel.decoder.inputs.attention_mask)} -{ + position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} { inputs_embeds_.Add(); position_inputs_->Add(); logits_.Add(); diff --git a/src/models/multi_modal.h b/src/models/multi_modal.h index 8f17b004b6..771f5be36d 100644 --- a/src/models/multi_modal.h +++ b/src/models/multi_modal.h @@ -107,8 +107,8 @@ struct DecoderState : State { Embeddings inputs_embeds_{*this, Embeddings::Mode::Input, // Model input model_.config_->model.decoder.inputs.embeddings}; std::unique_ptr position_inputs_; // Model input - DefaultKeyValueCache kv_cache_{*this}; // Model input - Logits logits_{*this}; // Model output + DefaultKeyValueCache kv_cache_{*this}; // Model input + Logits logits_{*this}; // Model output }; struct MultiModalPipelineState : State { diff --git a/src/models/position_inputs.cpp b/src/models/position_inputs.cpp index 5ed02b751e..4e53b27a08 100644 --- a/src/models/position_inputs.cpp +++ b/src/models/position_inputs.cpp @@ -4,7 +4,7 @@ #include "model_type.h" #include #include -#include // For std::round +#include // For std::round namespace Generators { @@ -574,7 +574,7 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan std::span video_grid_thw_span; if (video_grid_thw_) { - video_grid_thw_span = std::span(video_grid_thw_->GetData(), video_grid_thw_->GetElementCount()) ; + video_grid_thw_span = std::span(video_grid_thw_->GetData(), video_grid_thw_->GetElementCount()); } std::span second_per_grid_ts_span; @@ -584,7 +584,7 @@ void Qwen2VLPositionInputs::CreateAndInitialize3DPositionIDs(DeviceSpan throw std::runtime_error("second_per_grid_ts must be float32."); second_per_grid_ts_span = std::span(second_per_grid_ts_->GetData(), second_per_grid_ts_->GetElementCount()); } - + auto input_ids_span = next_tokens.CpuSpan(); int image_index = 0; int video_index = 0; @@ -767,8 +767,8 @@ void Qwen2VLPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan for (int64_t s = 0; s < seq_len; ++s) { int64_t current_token_idx = b * seq_len + s; mask_data[current_token_idx] = (input_ids_span[current_token_idx] == model_.config_->model.pad_token_id) - ? static_cast(0) - : static_cast(1); + ? static_cast(0) + : static_cast(1); } } @@ -783,7 +783,7 @@ void Qwen2VLPositionInputs::Update3DPositionIDs(int base_pos) { // base_pos is cache_position[0] auto position_ids = OrtValue::CreateTensor(model_.allocator_cpu_, position_ids_shape_, type_); int64_t batch_size = position_ids_shape_[1]; // This is already expanded (batch*beams) - int64_t seq_len = position_ids_shape_[2]; // This will be 1 for generation + int64_t seq_len = position_ids_shape_[2]; // This will be 1 for generation if (rope_deltas_.size() != batch_size) { throw std::runtime_error("rope_deltas size mismatch with batch_size * num_beams."); diff --git a/src/models/qwen_image_processor.cpp b/src/models/qwen_image_processor.cpp index 3cd0190aff..32a4ca56e1 100644 --- a/src/models/qwen_image_processor.cpp +++ b/src/models/qwen_image_processor.cpp @@ -13,7 +13,7 @@ namespace { // constexpr int64_t kMergeSize = 2; // Qwen2-VL merge size for vision tokens std::tuple, std::unique_ptr> ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& prompt, - OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, + OrtxTensor* pixel_values, OrtxTensor* image_grid_thw, const int64_t* computed_grid_data, int64_t computed_grid_num_images, Ort::Allocator& allocator, int64_t spatial_merge_size) { constexpr char vision_start_token[] = "<|vision_start|>"; @@ -23,14 +23,14 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr int64_t num_images = 0; int64_t total_image_tokens = 0; const int64_t* image_grid_thw_data = nullptr; - + if (pixel_values) { const float* pixel_values_data{}; const int64_t* pixel_values_shape{}; size_t pixel_values_num_dims; CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), &pixel_values_shape, &pixel_values_num_dims)); - + // Get image_grid_thw data from either processor output or computed value if (image_grid_thw) { const int64_t* image_grid_thw_shape{}; @@ -42,7 +42,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr image_grid_thw_data = computed_grid_data; num_images = computed_grid_num_images; } - + // Calculate total image tokens based on grid dimensions // For each image: (temporal * height * width) / (merge_size^2) for (int64_t i = 0; i < num_images; ++i) { @@ -55,7 +55,7 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr // Generate input_ids with vision tokens std::string text = prompt; - + // If prompt is empty, add vision markers for each image if (text.empty()) { for (int64_t i = 0; i < num_images; ++i) { @@ -72,10 +72,10 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr const auto vision_start_begin = std::sregex_iterator(text.begin(), text.end(), vision_start_regex); const auto vision_start_end = std::sregex_iterator(); const auto vision_start_tokens = std::distance(vision_start_begin, vision_start_end); - + if (num_images != vision_start_tokens) { - throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) + - " vision_start tokens but received " + std::to_string(num_images) + " images."); + throw std::runtime_error("Prompt contained " + std::to_string(vision_start_tokens) + + " vision_start tokens but received " + std::to_string(num_images) + " images."); } // For Qwen2-VL, we need to replace vision markers with image_pad tokens @@ -84,34 +84,34 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr std::string modified_text; size_t last_pos = 0; size_t image_idx = 0; - + std::smatch match; std::string temp_text = text; while (std::regex_search(temp_text, match, vision_start_regex)) { // Add text before the vision_start token modified_text += text.substr(last_pos, match.position() - (last_pos - (text.size() - temp_text.size()))); - + // Calculate number of image_pad tokens for this image int64_t t = image_grid_thw_data[image_idx * 3 + 0]; int64_t h = image_grid_thw_data[image_idx * 3 + 1]; int64_t w = image_grid_thw_data[image_idx * 3 + 2]; int64_t num_pads = (t * h * w) / (spatial_merge_size * spatial_merge_size); - + // Add vision_start, image_pad tokens, and vision_end modified_text += vision_start_token; for (int64_t i = 0; i < num_pads; ++i) { modified_text += image_pad_token; } modified_text += vision_end_token; - + last_pos = match.position() + match.length() + (text.size() - temp_text.size()); - + // Find and skip vision_end token size_t vision_end_pos = text.find(vision_end_token, last_pos); if (vision_end_pos != std::string::npos) { last_pos = vision_end_pos + strlen(vision_end_token); } - + temp_text = match.suffix(); image_idx++; } @@ -165,40 +165,40 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token OrtxTensor* image_grid_thw = nullptr; // Try to get image_grid_thw from processor (second output) auto status = OrtxTensorResultGetAt(result.get(), 1, &image_grid_thw); - + // Get pixel_values data and shape const float* pixel_values_data{}; const int64_t* pixel_values_shape{}; size_t pixel_values_num_dims; CheckResult(OrtxGetTensorData(pixel_values, reinterpret_cast(&pixel_values_data), &pixel_values_shape, &pixel_values_num_dims)); - + // If processor doesn't provide image_grid_thw or patched pixel_values, compute them std::unique_ptr computed_image_grid_thw; std::unique_ptr patched_pixel_values; const int64_t* computed_grid_data = nullptr; int64_t computed_grid_num_images = 0; - + // Check if pixel_values needs patching (shape should be [1, height, width, channels] in HWC format) if (pixel_values_num_dims == 4 && pixel_values_shape[0] == 1) { constexpr int64_t kPatchSize = 14; constexpr int64_t kTemporalPatchSize = 2; constexpr int64_t kChannels = 3; - - int64_t height = pixel_values_shape[1]; // HWC: [batch, height, width, channels] + + int64_t height = pixel_values_shape[1]; // HWC: [batch, height, width, channels] int64_t width = pixel_values_shape[2]; int64_t channels = pixel_values_shape[3]; - + int64_t height_patches = height / kPatchSize; int64_t width_patches = width / kPatchSize; int64_t total_patches = height_patches * width_patches; int64_t patch_dim = channels * kTemporalPatchSize * kPatchSize * kPatchSize; - + // Create patched pixel_values: [total_patches, patch_dim] patched_pixel_values = OrtValue::CreateTensor( allocator, std::vector{total_patches, patch_dim}); auto* patched_data = patched_pixel_values->GetTensorMutableData(); - + // Extract patches from single image in HWC format // Each spatial patch is replicated kTemporalPatchSize times int64_t patch_idx = 0; @@ -206,9 +206,9 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token for (int64_t pw = 0; pw < width_patches; ++pw) { int64_t h_start = ph * kPatchSize; int64_t w_start = pw * kPatchSize; - + int64_t write_idx = patch_idx * patch_dim; - + // Repeat the same spatial patch kTemporalPatchSize times // Output: [temporal, channels, patch_h, patch_w] for (int64_t t = 0; t < kTemporalPatchSize; ++t) { @@ -225,26 +225,26 @@ std::unique_ptr QwenImageProcessor::Process(const Tokenizer& token patch_idx++; } } - + // Create image_grid_thw: [1, 3] for single image if (status != kOrtxOK || !image_grid_thw) { computed_image_grid_thw = OrtValue::CreateTensor( allocator, std::vector{1, 3}); auto* grid_data = computed_image_grid_thw->GetTensorMutableData(); - + // For a single image: T=1 (one frame), H=height_patches, W=width_patches // The kTemporalPatchSize is embedded in the patch dimension grid_data[0] = 1; // Single temporal frame for images grid_data[1] = height_patches; grid_data[2] = width_patches; - + computed_grid_data = grid_data; computed_grid_num_images = 1; } } - auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, - image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_); + auto [input_ids, num_img_tokens] = ProcessImagePrompt(tokenizer, prompt, pixel_values, + image_grid_thw, computed_grid_data, computed_grid_num_images, allocator, spatial_merge_size_); named_tensors->emplace(std::string(Config::Defaults::InputIdsName), std::make_shared(std::move(input_ids))); // Use patched pixel_values if we computed it, otherwise use processor output