diff --git a/cmake/deps.txt b/cmake/deps.txt index 7e50996352..2606531c85 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;245f6667babf9668b862ac4513c69ea95117c295 +onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9f1f67d6d075793a0828b24e73d50803eb657e9a # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE) llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d diff --git a/examples/python/model-vision.py b/examples/python/model-vision.py index 1fd73a9db2..acd95f1ff1 100644 --- a/examples/python/model-vision.py +++ b/examples/python/model-vision.py @@ -5,6 +5,7 @@ import glob import json import os +import readline import time from pathlib import Path @@ -12,6 +13,20 @@ # og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) +# Tool-calling system prompt for Qwen/Fara models +FARA_SYSTEM_PROMPT = """You are a web agent trying to complete user tasks on websites using function calls. + +The functions at your disposal are: + +{"type": "function", "function": {"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer based on screenshots.\\n- This is an interface to a web browser. You do not have access to a terminal or applications menu, only the browser.\\n- Some pages, etc. may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click a home page icon and a window doesn't change, try wait and taking another screenshot.\\n- Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\\n- If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\\n- Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\\n- When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll().\\nScreen resolution: 1428x896", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\\n* `key`: Press keyboard keys, like \\"Enter\\", \\"Alt\\", \\"Shift\\", \\"Tab\\", \\"Control\\", \\"Backspace\\", \\"Delete\\", \\"Escape\\", etc. Keys are pressed down in the order given, then released in reverse order.\\n* `type`: Type a string of text on the keyboard.\\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\\n* `left_click`: Click the left mouse button.\\n* `scroll`: Performs a scroll of the mouse scroll wheel.\\n* `visit_url`: Visit a specified URL.\\n* `web_search`: Perform a web search with a specified query.\\n* `history_back`: Go back to the previous page in the browser history.\\n* `pause_and_memorize_fact`: Pause and memorize a fact for future reference.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "scroll", "visit_url", "web_search", "history_back", "pause_and_memorize_fact", "wait", "terminate"], "type": "string"}, "keys": {"description": "Keyboard keys to be pressed in order. Required only by `action=key`.", "type": "array"}, "text": {"description": "Text to type. Required only by `action=type`.", "type": "string"}, "press_enter": {"description": "Whether to press the 'Enter' key after typing. Required only by `action=type`.", "type": "boolean"}, "delete_existing_text": {"description": "Whether to delete existing text before typing. Required only by `action=type`.", "type": "boolean"}, "coordinate": {"description": "[x, y]: The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=left_click`, `action=mouse_move`, and `action=type`.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}, "url": {"description": "The URL to visit. Required only by `action=visit_url`.", "type": "string"}, "query": {"description": "The query to search for. Required only by `action=web_search`.", "type": "string"}, "fact": {"description": "The fact to remember for the future. Required only by `action=pause_and_memorize_fact`.", "type": "string"}, "time": {"description": "Number of seconds to wait. Required only by `action=wait`.", "type": "number"}, "status": {"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}}} + + +To make a function call, you should output a json object inside XML tags. The json object must contain the function name and its arguments, like this: + +{\\"name\\": , \\"arguments\\": } + +""" + def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): curr_path = Path(current_dir).absolute() @@ -26,10 +41,20 @@ def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): def _complete(text, state): - return (glob.glob(text + "*") + [None])[state] + return [*glob.glob(text + "*"), None][state] def run(args: argparse.Namespace): + if args.use_winml: + try: + import winml + + print(winml.register_execution_providers(ort=False, ort_genai=True)) + except ImportError: + print("WinML not available, using default execution providers") + except Exception as e: + print(f"Failed to register WinML execution providers: {e}") + print("Loading model...") config = og.Config(args.model_path) if args.execution_provider != "follow_config": @@ -49,8 +74,6 @@ def run(args: argparse.Namespace): while True: if interactive: try: - import readline - readline.set_completer_delims(" \t\n;") readline.parse_and_bind("tab: complete") readline.set_completer(_complete) @@ -80,7 +103,7 @@ def run(args: argparse.Namespace): if len(image_paths) == 0: print("No image provided") else: - for i, image_path in enumerate(image_paths): + for _, image_path in enumerate(image_paths): if not os.path.exists(image_path): raise FileNotFoundError(f"Image file not found: {image_path}") print(f"Using image: {image_path}") @@ -101,6 +124,10 @@ def run(args: argparse.Namespace): # Combine all image tags and text into one user message content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text messages.append({"role": "user", "content": content}) + elif model.type in ["qwen2_5_vl", "fara"]: + messages.append({"role": "system", "content": FARA_SYSTEM_PROMPT}) + content = "".join(["<|vision_start|><|image_pad|><|vision_end|>" for _ in image_paths]) + text + messages.append({"role": "user", "content": content}) else: # Gemma3-style multimodal: structured content content_list = [{"type": "image"} for _ in image_paths] @@ -116,7 +143,8 @@ def run(args: argparse.Namespace): print("Generating response...") params = og.GeneratorParams(model) - params.set_search_options(max_length=7680) + max_length = args.max_length if args.max_length else 7680 + params.set_search_options(max_length=max_length) generator = og.Generator(model, params) generator.set_inputs(inputs) @@ -162,11 +190,24 @@ def run(args: argparse.Namespace): parser.add_argument( "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" ) + parser.add_argument( + "--max_length", + type=int, + required=False, + default=None, + help="Maximum generation length. Defaults to model's context_length from config.", + ) parser.add_argument( "--non-interactive", action=argparse.BooleanOptionalAction, required=False, help="Non-interactive mode, mainly for CI usage", ) + parser.add_argument( + "--use-winml", + action="store_true", + required=False, + help="Register WinML execution providers before loading the model", + ) args = parser.parse_args() run(args) diff --git a/src/config.cpp b/src/config.cpp index 7087819d86..cd5fdf5cf2 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -587,6 +587,11 @@ struct Decoder_Element : JSON::Element { v_.sliding_window = Config::Model::Decoder::SlidingWindow{}; return sliding_window_; } + // Support object-style pipeline: "pipeline": { "embeddings": { ... }, ... } + if (name == "pipeline") { + pipeline_object_ = std::make_unique(v_.pipeline); + return *pipeline_object_; + } throw JSON::unknown_value_error{}; } @@ -605,6 +610,7 @@ struct Decoder_Element : JSON::Element { DecoderOutputs_Element outputs_{v_.outputs}; Pipeline_Element pipeline_{v_.pipeline}; SlidingWindow_Element sliding_window_{v_.sliding_window}; + std::unique_ptr pipeline_object_; // object-style pipeline support }; struct VisionInputs_Element : JSON::Element { @@ -615,6 +621,8 @@ struct VisionInputs_Element : JSON::Element { v_.pixel_values = JSON::Get(value); } else if (name == "image_sizes") { v_.image_sizes = JSON::Get(value); + } else if (name == "image_grid_thw") { + v_.image_grid_thw = JSON::Get(value); } else if (name == "attention_mask") { v_.attention_mask = JSON::Get(value); } else { @@ -641,6 +649,77 @@ struct VisionOutputs_Element : JSON::Element { Config::Model::Vision::Outputs& v_; }; +// Vision pipeline support structures +struct VisionPipelineModel_Element : JSON::Element { + explicit VisionPipelineModel_Element(Config::Model::Vision::PipelineModel& v) : v_{v} {} + + void OnValue(std::string_view name, JSON::Value value) override { + if (name == "filename") { + v_.filename = JSON::Get(value); + } else if (name == "run_on_cpu") { + v_.run_on_cpu = JSON::Get(value); + } else { + throw JSON::unknown_value_error{}; + } + } + + Element& OnObject(std::string_view name) override { + if (name == "session_options") { + v_.session_options = Config::SessionOptions{}; + session_options_ = std::make_unique(*v_.session_options); + return *session_options_; + } + if (name == "run_options") { + v_.run_options = Config::RunOptions{}; + run_options_ = std::make_unique(*v_.run_options); + return *run_options_; + } + throw JSON::unknown_value_error{}; + } + + Element& OnArray(std::string_view name) override { + if (name == "inputs") { + return inputs_; + } + if (name == "outputs") { + return outputs_; + } + throw JSON::unknown_value_error{}; + } + + private: + Config::Model::Vision::PipelineModel& v_; + std::unique_ptr session_options_; + std::unique_ptr run_options_; + StringArray_Element inputs_{v_.inputs}; + StringArray_Element outputs_{v_.outputs}; +}; + +struct VisionPipelineModelObject_Element : JSON::Element { + explicit VisionPipelineModelObject_Element(std::vector& v) : v_{v} {} + + Element& OnObject(std::string_view name) override { + auto& model = v_.emplace_back(); + model.model_id = name; + elements_.emplace_back(model); + return elements_.back(); + } + + private: + std::vector& v_; + std::vector elements_; +}; + +struct VisionPipeline_Element : JSON::Element { + explicit VisionPipeline_Element(std::vector& v) : v_{v} {} + + Element& OnObject(std::string_view name) override { return object_; } + + private: + std::vector& v_; + VisionPipelineModelObject_Element object_{v_}; +}; + struct Vision_Element : JSON::Element { explicit Vision_Element(Config::Model::Vision& v) : v_{v} {} @@ -673,6 +752,18 @@ struct Vision_Element : JSON::Element { if (name == "outputs") { return outputs_; } + // Support object-style pipeline for vision: "pipeline": { "patch_embed": { ... }, ... } + if (name == "pipeline") { + vision_pipeline_object_ = std::make_unique(v_.pipeline); + return *vision_pipeline_object_; + } + throw JSON::unknown_value_error{}; + } + + Element& OnArray(std::string_view name) override { + if (name == "pipeline") { + return pipeline_element_; + } throw JSON::unknown_value_error{}; } @@ -682,6 +773,8 @@ struct Vision_Element : JSON::Element { std::unique_ptr run_options_; VisionInputs_Element inputs_{v_.inputs}; VisionOutputs_Element outputs_{v_.outputs}; + VisionPipeline_Element pipeline_element_{v_.pipeline}; + std::unique_ptr vision_pipeline_object_; // object-style pipeline support }; struct SpeechInputs_Element : JSON::Element { @@ -1212,19 +1305,14 @@ void ClearDecoderProviderOptionsHardwareVendorId(Config& config, std::string_vie struct Root_Element : JSON::Element { explicit Root_Element(Config& config) : config_{config} {} - void OnValue(std::string_view name, JSON::Value value) override { + void OnValue(std::string_view /*name*/, JSON::Value /*value*/) override { + // No top-level scalar values currently supported } Element& OnObject(std::string_view name) override { - if (name == "model") { - return model_element_; - } - if (name == "search") { - return search_element_; - } - if (name == "engine") { - return engine_element_; - } + if (name == "model") return model_element_; + if (name == "search") return search_element_; + if (name == "engine") return engine_element_; throw JSON::unknown_value_error{}; } diff --git a/src/config.h b/src/config.h index 507d7c80c1..ccac038c12 100644 --- a/src/config.h +++ b/src/config.h @@ -159,9 +159,22 @@ struct Config { std::string config_filename{"processor_config.json"}; std::optional adapter_filename{}; + // Vision pipeline support (patch embed -> vision attn -> patch merger) + struct PipelineModel { + std::string filename; + std::optional session_options; + std::optional run_options; + std::string model_id; // Identifier used to link outputs to subsequent stages + std::vector inputs; // Graph input names + std::vector outputs; // Graph output names + bool run_on_cpu{false}; // If true force CPU EP when multiple EPs are configured + }; + std::vector pipeline; // Ordered pipeline models + struct Inputs { std::string pixel_values{Defaults::PixelValuesName}; std::string image_sizes{Defaults::ImageSizesName}; + std::string image_grid_thw{Defaults::ImageSizesName}; // Qwen2.5-VL uses image_grid_thw, defaults to image_sizes std::string attention_mask{Defaults::ImageAttentionMaskName}; // image attention mask } inputs; diff --git a/src/generators.cpp b/src/generators.cpp index 85cd5cd26a..d19751217c 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -318,14 +318,24 @@ DeviceSpan Generator::AllocateInputIdsOnDevice(cpu_span auto input_ids_device = state_->params_->p_device->Allocate(padded_input_ids_size); auto cpu_span = input_ids_device.CpuSpan(); - auto padding_begin = cpu_span.begin(); - auto data_end = cpu_span.end(); - if (model_->config_->model.decoder.sliding_window.has_value() && model_->config_->model.decoder.sliding_window->alignment == "left") { - padding_begin = cpu_span.begin() + input_ids.size(); - data_end = padding_begin; + + // Handle padding based on alignment setting for sliding window models + if (padded_input_ids_size > input_ids.size()) { + const bool left_align = model_->config_->model.decoder.sliding_window.has_value() && + model_->config_->model.decoder.sliding_window->alignment == "left"; + + if (left_align) { + // Left alignment: padding first, then data + std::fill_n(cpu_span.begin(), padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id); + std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin() + (padded_input_ids_size - input_ids.size())); + } else { + // Right alignment (default): data first, then padding + std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin()); + std::fill(cpu_span.begin() + input_ids.size(), cpu_span.end(), model_->config_->model.pad_token_id); + } + } else { + std::copy(input_ids.begin(), input_ids.end(), cpu_span.begin()); } - std::fill_n(padding_begin, padded_input_ids_size - input_ids.size(), model_->config_->model.pad_token_id); - std::copy_backward(input_ids.begin(), input_ids.end(), data_end); input_ids_device.CopyCpuToDevice(); return input_ids_device; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index ac12572a0c..b7a571d586 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -16,9 +16,9 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan< : State{params, model}, model_{model}, kv_cache_(CreateKeyValueCache(*this)), - position_inputs_{model, *this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask} { + position_inputs_{CreatePositionInputs(*this, sequence_lengths_unk, model_.config_->model.decoder.inputs.attention_mask)} { input_ids_.Add(); - position_inputs_.Add(); + position_inputs_->Add(); logits_.Add(); kv_cache_->Add(); } @@ -79,15 +79,35 @@ DeviceSpan DecoderOnly_State::RunWithChunking(int total_length, DeviceSpa } void DecoderOnly_State::RewindTo(size_t index) { - position_inputs_.RewindTo(index); + position_inputs_->RewindTo(index); kv_cache_->RewindTo(index); } void DecoderOnly_State::UpdateInputsOutputs(DeviceSpan& next_tokens, DeviceSpan beam_indices, int total_length) { input_ids_.Update(next_tokens); size_t new_length = static_cast(input_ids_.GetShape()[1]); - position_inputs_.Update(next_tokens, total_length, static_cast(new_length)); - kv_cache_->Update(beam_indices, total_length); + + // Determine effective lengths for position_ids and KV cache based on sliding window config + int position_length = total_length; + int kv_cache_length = total_length; + + if (model_.config_->model.decoder.sliding_window.has_value() && + model_.config_->model.decoder.sliding_window->window_size > 0) { + const int window_size = model_.config_->model.decoder.sliding_window->window_size; + + // Position IDs are clamped when slide_inputs is true + if (model_.config_->model.decoder.sliding_window->slide_inputs) { + position_length = std::min(total_length, window_size); + } + + // KV cache is clamped when slide_key_value_cache is true + if (model_.config_->model.decoder.sliding_window->slide_key_value_cache) { + kv_cache_length = std::min(total_length, window_size); + } + } + + position_inputs_->Update(next_tokens, position_length, static_cast(new_length)); + kv_cache_->Update(beam_indices, kv_cache_length); logits_.Update(next_tokens, new_length); } diff --git a/src/models/decoder_only.h b/src/models/decoder_only.h index a61fb2b8be..0869756ae2 100644 --- a/src/models/decoder_only.h +++ b/src/models/decoder_only.h @@ -36,7 +36,7 @@ struct DecoderOnly_State : State { DefaultInputIDs input_ids_{*this}; Logits logits_{*this}; std::unique_ptr kv_cache_; - DefaultPositionInputs position_inputs_; + std::unique_ptr position_inputs_; ExtraInputs extra_inputs_{*this}; }; diff --git a/src/models/decoder_only_pipeline.cpp b/src/models/decoder_only_pipeline.cpp index 6e996192b9..497bd1b295 100644 --- a/src/models/decoder_only_pipeline.cpp +++ b/src/models/decoder_only_pipeline.cpp @@ -112,8 +112,8 @@ DecoderOnlyPipelineState::DecoderOnlyPipelineState(const DecoderOnlyPipelineMode DeviceSpan sequence_lengths, const GeneratorParams& params) : State{params, model}, - model_{model}, input_ids_{CreateInputIDs(*this)}, + model_{model}, key_value_cache_{CreateKeyValueCache(*this)}, do_key_value_cache_partial_update_{key_value_cache_ && key_value_cache_->IsPartialUpdateSupported()}, position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} { diff --git a/src/models/decoder_only_pipeline.h b/src/models/decoder_only_pipeline.h index 70160be432..17c173e711 100644 --- a/src/models/decoder_only_pipeline.h +++ b/src/models/decoder_only_pipeline.h @@ -69,6 +69,17 @@ struct DecoderOnlyPipelineState : State { void RunPipeline(int total_length, DeviceSpan& next_tokens, DeviceSpan next_indices, bool is_last_chunk); + protected: + // Virtual hook called after each pipeline stage completes, before next stage starts. + // Allows derived classes to modify stage outputs (e.g., inject vision embeddings). + // stage_id: ID of the stage that just completed + // next_tokens: current input tokens for pipeline + virtual void OnStageComplete(size_t stage_id, DeviceSpan& next_tokens) {} + + // Stores all the outputs from the previous pipeline state(s) + std::unordered_map> ortvalue_store_; + std::unique_ptr input_ids_; // Made protected for derived class access + private: void UpdateKeyValueCache(DeviceSpan beam_indices, int total_length); @@ -86,10 +97,6 @@ struct DecoderOnlyPipelineState : State { std::map pipeline_state_id_to_partial_kv_cache_update_record_idx_; std::vector partial_kv_cache_update_records_; - // Stores all the outputs from the previous pipeline state(s) - std::unordered_map> ortvalue_store_; - - std::unique_ptr input_ids_; Logits logits_{*this}; std::unique_ptr key_value_cache_; diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index dbdbd828d6..2d3a1f13be 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -270,13 +270,11 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le } if (!layer_shapes_.empty()) { - // Update per-layer shapes based on total_length, but respect max allocations + // Per-layer allocation with per-layer capacity constraints for (int layer_idx = 0; layer_idx < layer_count_; ++layer_idx) { - const int max_cache_length = static_cast(layer_shapes_[layer_idx][2]); - const int actual_length = std::min(total_length, max_cache_length); - std::array current_shape = layer_shapes_[layer_idx]; - current_shape[2] = actual_length; + const int max_cache_length = static_cast(layer_shapes_[layer_idx][2]); + current_shape[2] = std::min(total_length, max_cache_length); // Key tensor presents_[layer_idx * 2] = OrtValue::CreateTensor(Allocator(), current_shape, type_); @@ -287,7 +285,7 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le state_.outputs_[output_index_ + layer_idx * 2 + 1] = presents_[layer_idx * 2 + 1].get(); } } else { - // Uniform shape update (existing behavior) + // Uniform allocation shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); diff --git a/src/models/model.cpp b/src/models/model.cpp index 77e6c82657..adc355f970 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -19,6 +19,8 @@ #include "multi_modal.h" #include "marian.h" #include "decoder_only_pipeline.h" +#include "qwen_vl_model.h" +#include "qwen2_5_vl_image_processor.h" #include "../dml/interface.h" #if defined(_WIN32) @@ -1193,6 +1195,8 @@ std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path, con } std::shared_ptr CreateModel(OrtEnv& ort_env, std::unique_ptr config) { + if (config->model.type == "fara" || config->model.type == "qwen2_5_vl") + return std::make_shared(std::move(config), ort_env); if (config->model.type == "gpt2") return std::make_shared(std::move(config), ort_env); if (ModelType::IsLLM(config->model.type)) @@ -1288,7 +1292,9 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess {"phi3v", Processor::Create}, {"whisper", Processor::Create}, {"phi4mm", Processor::Create}, - {"gemma3", Processor::Create}} { + {"gemma3", Processor::Create}, + {"fara", Processor::Create}, + {"qwen2_5_vl", Processor::Create}} { auto processor = processor_factory_.find(config.model.type); if (processor != processor_factory_.end()) { processor_ = processor->second(config, session_info); diff --git a/src/models/model_type.h b/src/models/model_type.h index c7c4d2f691..a21c4156cf 100644 --- a/src/models/model_type.h +++ b/src/models/model_type.h @@ -18,7 +18,7 @@ struct ModelType { inline static bool IsVLM(const std::string& model_type) { // Vision-language model (VLM) - static constexpr std::array VLM = {"gemma3", "phi3v"}; + static constexpr std::array VLM = {"fara", "gemma3", "phi3v", "qwen2_5_vl"}; return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end(); } diff --git a/src/models/qwen2_5_vl_image_processor.cpp b/src/models/qwen2_5_vl_image_processor.cpp new file mode 100644 index 0000000000..599ff946da --- /dev/null +++ b/src/models/qwen2_5_vl_image_processor.cpp @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "../generators.h" +#include "model.h" +#include "qwen2_5_vl_image_processor.h" +#include + +namespace Generators { + +Qwen2_5VLImageProcessor::Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info) { + const auto processor_config = (config.config_path / fs::path("processor_config.json")).string(); + if (!fs::exists(config.config_path / fs::path("processor_config.json"))) { + throw std::runtime_error("processor_config.json not found at: " + processor_config); + } + + CheckResult(OrtxCreateProcessor(processor_.ToBeAssigned(), processor_config.c_str())); + + auto input_names = session_info.GetInputNames(); + for (const auto& input_name : input_names) { + if (input_name.find("pixel_values") != std::string::npos) { + pixel_values_name_ = input_name; + } else if (input_name.find("image_grid_thw") != std::string::npos) { + image_grid_thw_name_ = input_name; + } + } +} + +std::unique_ptr Qwen2_5VLImageProcessor::Process(const Tokenizer& tokenizer, const Payload& payload) const { + if (!payload.images) { + throw std::runtime_error("No images provided to Qwen2.5VLImageProcessor"); + } + + std::string prompt = std::string(payload.prompt); + Ort::Allocator& allocator{Ort::Allocator::GetWithDefaultOptions()}; + auto named_tensors = std::make_unique(); + + const std::vector input_ids = tokenizer.Encode(prompt.c_str()); + std::unique_ptr input_ids_value = OrtValue::CreateTensor( + allocator, std::vector{1, static_cast(input_ids.size())}); + std::copy(input_ids.begin(), input_ids.end(), input_ids_value->GetTensorMutableData()); + named_tensors->emplace(Config::Defaults::InputIdsName, std::make_shared(std::move(input_ids_value))); + + // Run image preprocessing using onnxruntime-extensions + // This will execute the full pipeline from processor_config.json: + // DecodeImage -> ConvertRGB -> Resize (smart_resize) -> Rescale -> Normalize -> PatchImage + ort_extensions::OrtxObjectPtr result; + CheckResult(OrtxImagePreProcess(processor_.get(), payload.images->images_.get(), result.ToBeAssigned())); + + OrtxTensor* pixel_values = nullptr; + CheckResult(OrtxTensorResultGetAt(result.get(), 0, &pixel_values)); + + auto pixel_values_ortvalue = ProcessTensor(pixel_values, allocator); + named_tensors->emplace(pixel_values_name_, std::make_shared(std::move(pixel_values_ortvalue))); + + OrtxTensor* grid_thw_tensor = nullptr; + CheckResult(OrtxTensorResultGetAt(result.get(), 1, &grid_thw_tensor)); + + if (grid_thw_tensor == nullptr) { + throw std::runtime_error("grid_thw output not provided"); + } + + named_tensors->emplace(image_grid_thw_name_, std::make_shared(ProcessTensor(grid_thw_tensor, allocator))); + + return named_tensors; +} + +} // namespace Generators diff --git a/src/models/qwen2_5_vl_image_processor.h b/src/models/qwen2_5_vl_image_processor.h new file mode 100644 index 0000000000..85430dbb5d --- /dev/null +++ b/src/models/qwen2_5_vl_image_processor.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "model.h" +#include "processor.h" +#include "ortx_processor.h" + +namespace Generators { + +struct Qwen2_5VLImageProcessor : Processor { + Qwen2_5VLImageProcessor(Config& config, const SessionInfo& session_info); + + std::unique_ptr Process(const Tokenizer& tokenizer, const Payload& payload) const override; + + private: + ort_extensions::OrtxObjectPtr processor_; + std::string pixel_values_name_{"pixel_values"}; + std::string image_grid_thw_name_{"image_grid_thw"}; +}; + +} // namespace Generators diff --git a/src/models/qwen_vl_model.cpp b/src/models/qwen_vl_model.cpp new file mode 100644 index 0000000000..c1c8db2750 --- /dev/null +++ b/src/models/qwen_vl_model.cpp @@ -0,0 +1,206 @@ +#include "qwen_vl_model.h" +#include "model.h" +#include "onnxruntime_api.h" +#include "../logging.h" +#include +#include +#include + +namespace Generators { + +Qwen2_5_VL_PipelineModel::Qwen2_5_VL_PipelineModel(std::unique_ptr config, OrtEnv& ort_env) + : DecoderOnlyPipelineModel(std::move(config), ort_env) { + if (config_->model.vision.pipeline.empty()) return; + + // Find vision pipeline stage paths + auto find_stage = [&](const std::string& id) -> std::string { + for (const auto& stage : config_->model.vision.pipeline) { + if (stage.model_id == id) return (config_->config_path / fs::path(stage.filename)).string(); + } + return ""; + }; + + auto patch_embed_path = find_stage("patch_embed"); + auto vision_attn_path = find_stage("vision_attn"); + auto patch_merger_path = find_stage("patch_merger"); + + if (patch_embed_path.empty() || vision_attn_path.empty() || patch_merger_path.empty()) return; + + // Check if QNN should be used for vision attention + bool use_qnn_attn = std::any_of(config_->model.vision.pipeline.begin(), + config_->model.vision.pipeline.end(), + [](const auto& stage) { + return stage.model_id == "vision_attn" && !stage.run_on_cpu; + }); + + // Default spatial merge size + constexpr int spatial_merge = 2; + + vision_pipeline_ = std::make_unique( + ort_env, patch_embed_path, vision_attn_path, patch_merger_path, + spatial_merge, use_qnn_attn); +} + +std::unique_ptr Qwen2_5_VL_PipelineModel::CreateState(DeviceSpan sequence_lengths, + const GeneratorParams& params) const { + return std::make_unique(*this, sequence_lengths, params); +} + +Qwen2_5_VL_PipelineState::Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model, + DeviceSpan sequence_lengths, + const GeneratorParams& params) + : DecoderOnlyPipelineState(model, sequence_lengths, params), vl_model_{model} { +} + +void Qwen2_5_VL_PipelineState::SetExtraInputs(const std::vector& extra_inputs) { + DecoderOnlyPipelineState::SetExtraInputs(extra_inputs); + + if (vision_ran_ || !vl_model_.vision_pipeline_) return; + + OrtValue* pixel_values_val = nullptr; + OrtValue* image_grid_thw_val = nullptr; + const auto& pixel_name = vl_model_.config_->model.vision.inputs.pixel_values; + const auto& grid_thw_name = vl_model_.config_->model.vision.inputs.image_grid_thw; + + for (const auto& input : extra_inputs) { + if (input.name == pixel_name) { + pixel_values_val = input.tensor->GetOrtTensor(); + } else if (input.name == grid_thw_name) { + image_grid_thw_val = input.tensor->GetOrtTensor(); + } + } + if (!pixel_values_val) { + throw std::runtime_error("Vision pipeline: pixel_values input not found in extra_inputs"); + } + + auto pixel_type_info = pixel_values_val->GetTensorTypeAndShapeInfo(); + auto pixel_shape = pixel_type_info->GetShape(); + auto pixel_type = pixel_type_info->GetElementType(); + + std::vector pixel_shape_vec(pixel_shape.begin(), pixel_shape.end()); + const float* pixel_data = nullptr; + // Convert pixel values to float32 if needed (handles float16, bfloat16, float32) + std::unique_ptr pixel_values_fp32; + + if (pixel_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + pixel_data = pixel_values_val->GetTensorData(); + } else { + // Use existing Cast() function to convert to float32 + Cast(*pixel_values_val, pixel_values_fp32, *vl_model_.p_device_inputs_, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); + pixel_data = pixel_values_fp32->GetTensorData(); + } + + if (!pixel_data) { + throw std::runtime_error("Vision pipeline: failed to access pixel_values tensor data"); + } + + // Extract grid_thw if provided + std::vector grid_thw; + if (image_grid_thw_val) { + auto grid_shape = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetShape(); + auto element_type = image_grid_thw_val->GetTensorTypeAndShapeInfo()->GetElementType(); + + if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + const int64_t* grid_data = image_grid_thw_val->GetTensorData(); + size_t grid_count = 1; + for (auto dim : grid_shape) grid_count *= dim; + + // Expect [batch, 3] or [3] shape - take last 3 values as [t, h, w] + if (grid_count >= 3) { + grid_thw = {grid_data[grid_count - 3], grid_data[grid_count - 2], grid_data[grid_count - 1]}; + } + } else if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + const int32_t* grid_data = image_grid_thw_val->GetTensorData(); + size_t grid_count = 1; + for (auto dim : grid_shape) grid_count *= dim; + + if (grid_count >= 3) { + grid_thw = {static_cast(grid_data[grid_count - 3]), + static_cast(grid_data[grid_count - 2]), + static_cast(grid_data[grid_count - 1])}; + } + } + } + + try { + image_features_buffer_ = vl_model_.vision_pipeline_->Run(pixel_data, pixel_shape_vec, grid_thw); + } catch (const std::exception& e) { + throw std::runtime_error(std::string("Vision pipeline failed: ") + e.what()); + } + + auto out_shape = vl_model_.vision_pipeline_->GetLastOutputShape(); + if (out_shape.size() != 2) { + throw std::runtime_error("Vision pipeline: expected output shape rank 2, got " + std::to_string(out_shape.size())); + } + + auto mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + std::span data_span(image_features_buffer_.data(), image_features_buffer_.size()); + std::span shape_span(out_shape.data(), out_shape.size()); + image_features_value_ = OrtValue::CreateTensor(*mem_info, data_span, shape_span); + + vision_ran_ = true; +} + +void Qwen2_5_VL_PipelineState::OnStageComplete(size_t stage_id, DeviceSpan& next_tokens) { + if (stage_id != 0 || !vision_ran_) return; + + const auto& embeddings_config = vl_model_.config_->model.decoder.pipeline[0]; + if (!embeddings_config.outputs.empty()) { + InjectVisionEmbeddings(embeddings_config.outputs[0], next_tokens); + } +} + +void Qwen2_5_VL_PipelineState::InjectVisionEmbeddings(const std::string& embeddings_output_name, + DeviceSpan& input_token_ids) { + auto it = ortvalue_store_.find(embeddings_output_name); + if (it == ortvalue_store_.end() || !it->second) { + throw std::runtime_error("Vision embedding injection: embeddings output '" + embeddings_output_name + "' not found in ortvalue_store"); + } + + OrtValue* embeddings_ortvalue = it->second.get(); + auto shape = embeddings_ortvalue->GetTensorTypeAndShapeInfo()->GetShape(); + float* embeddings_data = embeddings_ortvalue->GetTensorMutableData(); + + auto vision_shape = image_features_value_->GetTensorTypeAndShapeInfo()->GetShape(); + const float* vision_data = image_features_value_->GetTensorData(); + + const int64_t embedding_dim = shape[2]; + const int64_t num_vision_tokens = vision_shape[0]; + const int64_t vision_dim = vision_shape[1]; + if (vision_dim != embedding_dim) { + throw std::runtime_error("Vision embedding injection: dimension mismatch - vision_dim=" + std::to_string(vision_dim) + + ", embedding_dim=" + std::to_string(embedding_dim)); + } + + constexpr int32_t image_token_id = 151655; + + if (!input_ids_ || !input_ids_->Get()) { + throw std::runtime_error("Vision embedding injection: input_ids not available"); + } + + OrtValue* input_ids_ortvalue = input_ids_->Get(); + auto input_ids_shape = input_ids_ortvalue->GetTensorTypeAndShapeInfo()->GetShape(); + const int32_t* token_ids_cpu = input_ids_ortvalue->GetTensorData(); + + int64_t total_tokens = 1; + for (auto dim : input_ids_shape) total_tokens *= dim; + + for (int64_t i = 0; i < total_tokens; ++i) { + if (token_ids_cpu[i] == image_token_id && image_embed_consumed_ < static_cast(num_vision_tokens)) { + std::memcpy(embeddings_data + (i * embedding_dim), + vision_data + (image_embed_consumed_ * vision_dim), + vision_dim * sizeof(float)); + image_embed_consumed_++; + } + } + + // Warn if there's a mismatch between image tokens and vision features + if (image_embed_consumed_ != static_cast(num_vision_tokens)) { + Log("warning", "Vision embedding mismatch: consumed " + std::to_string(image_embed_consumed_) + + " of " + std::to_string(num_vision_tokens) + " available vision tokens. " + + "This may indicate a mismatch between the number of image placeholders in the prompt " + + "and the number of images provided."); + } +} + +} // namespace Generators diff --git a/src/models/qwen_vl_model.h b/src/models/qwen_vl_model.h new file mode 100644 index 0000000000..cc8dea7bb7 --- /dev/null +++ b/src/models/qwen_vl_model.h @@ -0,0 +1,43 @@ +#pragma once + +#include "decoder_only_pipeline.h" +#include "qwen_vl_vision.h" + +namespace Generators { + +// Qwen2.5-VL pipeline model integrating vision pipeline + decoder pipeline. +// Loads decoder pipeline sessions (handled by base) and constructs vision pipeline sessions. +// State runs vision once (on first SetExtraInputs when pixel_values arrives) to produce image_features +// which are injected into embeddings output via existing injection logic in DecoderOnlyPipelineState. +struct Qwen2_5_VL_PipelineModel : public DecoderOnlyPipelineModel { + Qwen2_5_VL_PipelineModel(std::unique_ptr config, OrtEnv& ort_env); + + std::unique_ptr CreateState(DeviceSpan sequence_lengths, + const GeneratorParams& params) const override; + + // Vision pipeline shared across states (sessions reused). + std::unique_ptr vision_pipeline_; +}; + +struct Qwen2_5_VL_PipelineState : public DecoderOnlyPipelineState { + Qwen2_5_VL_PipelineState(const Qwen2_5_VL_PipelineModel& model, + DeviceSpan sequence_lengths, + const GeneratorParams& params); + + void SetExtraInputs(const std::vector& extra_inputs) override; + + protected: + void OnStageComplete(size_t stage_id, DeviceSpan& next_tokens) override; + + private: + void InjectVisionEmbeddings(const std::string& embeddings_output_name, + DeviceSpan& input_token_ids); + + const Qwen2_5_VL_PipelineModel& vl_model_; + bool vision_ran_{false}; + std::unique_ptr image_features_value_; + std::vector image_features_buffer_; // backing storage for OrtValue + size_t image_embed_consumed_{0}; // Track how many vision embeddings we've injected +}; + +} // namespace Generators diff --git a/src/models/qwen_vl_vision.cpp b/src/models/qwen_vl_vision.cpp new file mode 100644 index 0000000000..ca267f00e5 --- /dev/null +++ b/src/models/qwen_vl_vision.cpp @@ -0,0 +1,295 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +// Qwen VL Vision pipeline implementation with optional QNN EP for vision attention stage. + +#include "qwen_vl_vision.h" +#include "../generators.h" + +#include +#include +#include +#include +#include + +namespace Generators { + +QwenVisionPipeline::QwenVisionPipeline(OrtEnv& env, + const std::string& patch_embed_model, + const std::string& vision_attn_model, + const std::string& patch_merger_model, + int64_t spatial_merge_size, + bool use_qnn_attn, + const std::string& qnn_backend_path, + int64_t patch_size, + int64_t window_size) + // Match declaration order to avoid MSVC C5038 warning-as-error + : use_qnn_attn_(use_qnn_attn), + qnn_backend_path_(qnn_backend_path), + spatial_merge_size_(spatial_merge_size), + patch_size_(patch_size), + window_size_(window_size), + env_(env) { + // Convert std::string model paths to ORTCHAR_T for cross-platform (char or wchar_t) + auto toOrtPath = [](const std::string& s) -> std::basic_string { + return std::basic_string(s.begin(), s.end()); + }; + auto pe_path = toOrtPath(patch_embed_model); + auto attn_path = toOrtPath(vision_attn_model); + auto merger_path = toOrtPath(patch_merger_model); + + // Patch embed and patch merger sessions (CPU for now) + patch_embed_session_ = OrtSession::Create(env_, pe_path.c_str(), nullptr); + patch_merger_session_ = OrtSession::Create(env_, merger_path.c_str(), nullptr); + + if (use_qnn_attn_) { + // Ensure QNN provider is available + auto so = OrtSessionOptions::Create(); + + so->SetIntraOpNumThreads(2).SetInterOpNumThreads(1); + + // QNN provider options + std::unordered_map qnn_options = { + {"backend_path", qnn_backend_path_}, + {"htp_performance_mode", "burst"}, + {"htp_graph_finalization_optimization_mode", "3"}, + {"soc_model", "60"}}; + + auto providers = Ort::GetAvailableProviders(); + bool has_qnn = std::find(providers.begin(), providers.end(), std::string("QNNExecutionProvider")) != providers.end(); + if (has_qnn) { + const char* keys[] = {"backend_path", "htp_performance_mode", "htp_graph_finalization_optimization_mode", "soc_model"}; + const char* values[] = {qnn_backend_path_.c_str(), "burst", "3", "60"}; + so->AppendExecutionProvider("QNNExecutionProvider", keys, values, 4); + } else { + // Use registered QNN EP - use GenAI wrapper APIs + auto ep_devices = GetOrtEnv().GetEpDevices(); + std::vector qnn_devices; + qnn_devices.reserve(ep_devices.size()); + + for (const auto* device : ep_devices) { + if (device->Name() == "QNNExecutionProvider") { + qnn_devices.push_back(device); + } + } + + if (qnn_devices.empty()) { + throw std::runtime_error("QNNExecutionProvider requested for vision attention but not registered."); + } + so->AppendExecutionProvider_V2(GetOrtEnv(), qnn_devices, qnn_options); + } + + vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), so.get()); + } else { + vision_attn_session_ = OrtSession::Create(env_, attn_path.c_str(), nullptr); + } +} + +std::unique_ptr QwenVisionPipeline::CreateTensor(const float* data, size_t count, const std::vector& shape) const { + auto memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + std::span data_span(const_cast(data), count); + std::span shape_span(shape.data(), shape.size()); + return OrtValue::CreateTensor(*memory_info, data_span, shape_span); +} + +// Removed CreateEmptyTensor (previous implementation returned tensor with dangling backing store). + +std::vector QwenVisionPipeline::Run(const float* pixel_data, const std::vector& pixel_shape, + const std::vector& grid_thw) { + if (!patch_embed_session_ || !vision_attn_session_ || !patch_merger_session_) { + throw std::runtime_error("Vision pipeline sessions not initialized"); + } + + // Calculate window indices dynamically if grid_thw provided + if (!grid_thw.empty() && grid_thw.size() == 3) { + wnd_idx_ = CalculateWindowIndex(grid_thw[0], grid_thw[1], grid_thw[2]); + + // Build reverse index (argsort) + rev_idx_.resize(wnd_idx_.size()); + std::vector> pairs; + pairs.reserve(wnd_idx_.size()); + for (size_t i = 0; i < wnd_idx_.size(); ++i) pairs.emplace_back(wnd_idx_[i], i); + std::sort(pairs.begin(), pairs.end(), [](auto& a, auto& b) { return a.first < b.first; }); + for (size_t i = 0; i < pairs.size(); ++i) rev_idx_[i] = static_cast(pairs[i].second); + } + + size_t pixel_count = 1; + for (auto d : pixel_shape) pixel_count *= static_cast(d); + auto pixel_tensor = CreateTensor(pixel_data, pixel_count, pixel_shape); + + auto pe_in_name = patch_embed_session_->GetInputName(0); + const char* pe_input_names[] = {pe_in_name.c_str()}; + OrtValue* pe_inputs[] = {pixel_tensor.get()}; + + const int64_t num_patches = pixel_shape[1]; + const int64_t hidden_dim = 1280; + std::vector pe_out_shape{num_patches, hidden_dim}; + pe_out_buf_.resize(num_patches * hidden_dim); + auto pe_out_tensor = CreateTensor(pe_out_buf_.data(), pe_out_buf_.size(), pe_out_shape); + + auto pe_out_name = patch_embed_session_->GetOutputName(0); + const char* pe_output_names[] = {pe_out_name.c_str()}; + OrtValue* pe_outputs[] = {pe_out_tensor.get()}; + + patch_embed_session_->Run(nullptr, pe_input_names, pe_inputs, 1, pe_output_names, pe_outputs, 1); + + const int64_t seq_len = num_patches; + const int64_t window_area = spatial_merge_size_ * spatial_merge_size_; + const int64_t num_windows = seq_len / window_area; + + // Apply window reordering if indices available + reordered_buf_.resize(seq_len * hidden_dim); + + if (!wnd_idx_.empty()) { + // Validate window configuration + if (seq_len % window_area != 0 || static_cast(wnd_idx_.size()) != num_windows) { + throw std::runtime_error("Invalid window configuration for vision pipeline"); + } + + // Apply window reordering + for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) { + int64_t src_w = wnd_idx_[dst_w]; + if (src_w < 0 || src_w >= num_windows) throw std::runtime_error("wnd_idx value out of range"); + size_t offset_size = window_area * hidden_dim; + std::memcpy(reordered_buf_.data() + dst_w * offset_size, + pe_out_buf_.data() + src_w * offset_size, + offset_size * sizeof(float)); + } + } else { + // No window reordering - use sequential order + std::memcpy(reordered_buf_.data(), pe_out_buf_.data(), seq_len * hidden_dim * sizeof(float)); + } + + // Check if vision_attn session expects a different sequence length (fixed shape model) + auto attn_input_info = vision_attn_session_->GetInputTypeInfo(0); + auto& attn_input_tensor_info = attn_input_info->GetTensorTypeAndShapeInfo(); + auto attn_expected_shape = attn_input_tensor_info.GetShape(); + + int64_t expected_seq_len = (attn_expected_shape.size() >= 2 && attn_expected_shape[0] > 0) ? attn_expected_shape[0] : seq_len; + int64_t actual_seq_len = seq_len; // Mutable copy for padding adjustments + + if (expected_seq_len != seq_len) { + // Model expects fixed sequence length - need to pad or error + if (expected_seq_len > seq_len) { + // Pad the reordered buffer with zeros to match model's expected size + reordered_buf_.resize(expected_seq_len * hidden_dim, 0.0f); + actual_seq_len = expected_seq_len; // Update actual_seq_len for subsequent operations + } else { + // Model expects smaller input - this is an error (image too large for fixed-shape model) + throw std::runtime_error("Vision attention model input size mismatch"); + } + } + + std::vector attn_shape{actual_seq_len, hidden_dim}; + auto attn_in_tensor = CreateTensor(reordered_buf_.data(), reordered_buf_.size(), attn_shape); + auto attn_in_name = vision_attn_session_->GetInputName(0); + const char* attn_input_names[] = {attn_in_name.c_str()}; + OrtValue* attn_inputs[] = {attn_in_tensor.get()}; + + attn_out_buf_.resize(actual_seq_len * hidden_dim); + auto attn_out_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape); + auto attn_out_name = vision_attn_session_->GetOutputName(0); + const char* attn_output_names[] = {attn_out_name.c_str()}; + OrtValue* attn_outputs[] = {attn_out_tensor.get()}; + + vision_attn_session_->Run(nullptr, attn_input_names, attn_inputs, 1, attn_output_names, attn_outputs, 1); + + auto merger_in_tensor = CreateTensor(attn_out_buf_.data(), attn_out_buf_.size(), attn_shape); + auto merger_in_name = patch_merger_session_->GetInputName(0); + const char* merger_input_names[] = {merger_in_name.c_str()}; + OrtValue* merger_inputs[] = {merger_in_tensor.get()}; + + const int64_t merged_seq_len = actual_seq_len / window_area; // One token per window after merging + const int64_t merged_hidden = 3584; + std::vector merger_shape{merged_seq_len, merged_hidden}; + merger_out_buf_.resize(merged_seq_len * merged_hidden); + auto merger_out_tensor = CreateTensor(merger_out_buf_.data(), merger_out_buf_.size(), merger_shape); + auto merger_out_name = patch_merger_session_->GetOutputName(0); + const char* merger_output_names[] = {merger_out_name.c_str()}; + OrtValue* merger_outputs[] = {merger_out_tensor.get()}; + + patch_merger_session_->Run(nullptr, merger_input_names, merger_inputs, 1, merger_output_names, merger_outputs, 1); + + final_embeddings_buf_.resize(merger_out_buf_.size()); + + if (!rev_idx_.empty()) { + // Apply reverse reordering + if (static_cast(rev_idx_.size()) != num_windows) { + throw std::runtime_error("Vision pipeline reverse index size mismatch"); + } + for (int64_t dst_w = 0; dst_w < num_windows; ++dst_w) { + std::memcpy(final_embeddings_buf_.data() + dst_w * merged_hidden, + merger_out_buf_.data() + rev_idx_[dst_w] * merged_hidden, + merged_hidden * sizeof(float)); + } + } else { + // No reverse reordering - use sequential order + std::memcpy(final_embeddings_buf_.data(), merger_out_buf_.data(), + merger_out_buf_.size() * sizeof(float)); + } + + last_seq_len_ = merged_seq_len; + last_hidden_size_ = merged_hidden; + return final_embeddings_buf_; +} + +// Calculate window indices dynamically based on grid dimensions +// Matches HuggingFace transformers implementation: +// https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L367 +std::vector QwenVisionPipeline::CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w) { + // Calculate LLM grid dimensions after spatial merging + int64_t llm_grid_h = grid_h / spatial_merge_size_; + int64_t llm_grid_w = grid_w / spatial_merge_size_; + + // Calculate window size at the merged resolution + int64_t vit_merger_window_size = window_size_ / spatial_merge_size_ / patch_size_; + + // Calculate padding needed to fit into windows + int64_t pad_h = (vit_merger_window_size - (llm_grid_h % vit_merger_window_size)) % vit_merger_window_size; + int64_t pad_w = (vit_merger_window_size - (llm_grid_w % vit_merger_window_size)) % vit_merger_window_size; + + int64_t num_windows_h = (llm_grid_h + pad_h) / vit_merger_window_size; + int64_t num_windows_w = (llm_grid_w + pad_w) / vit_merger_window_size; + + std::vector window_index; + window_index.reserve(grid_t * llm_grid_h * llm_grid_w); + + // Create initial index grid + std::vector index(grid_t * (llm_grid_h + pad_h) * (llm_grid_w + pad_w), -100); + + // Fill non-padded positions with sequential indices + for (int64_t t = 0; t < grid_t; ++t) { + for (int64_t h = 0; h < llm_grid_h; ++h) { + for (int64_t w = 0; w < llm_grid_w; ++w) { + int64_t idx = t * llm_grid_h * llm_grid_w + h * llm_grid_w + w; + int64_t padded_idx = t * (llm_grid_h + pad_h) * (llm_grid_w + pad_w) + h * (llm_grid_w + pad_w) + w; + index[padded_idx] = idx; + } + } + } + + // Reshape into windows: (grid_t, num_windows_h, window_size, num_windows_w, window_size) + // Then permute to (grid_t, num_windows_h, num_windows_w, window_size, window_size) + // This groups patches by window instead of by spatial position + for (int64_t t = 0; t < grid_t; ++t) { + for (int64_t wh = 0; wh < num_windows_h; ++wh) { + for (int64_t ww = 0; ww < num_windows_w; ++ww) { + for (int64_t ph = 0; ph < vit_merger_window_size; ++ph) { + for (int64_t pw = 0; pw < vit_merger_window_size; ++pw) { + int64_t h = wh * vit_merger_window_size + ph; + int64_t w = ww * vit_merger_window_size + pw; + int64_t padded_idx = t * (llm_grid_h + pad_h) * (llm_grid_w + pad_w) + h * (llm_grid_w + pad_w) + w; + + // Only add non-padded indices + if (index[padded_idx] != -100) { + window_index.push_back(index[padded_idx]); + } + } + } + } + } + } + + return window_index; +} + +} // namespace Generators diff --git a/src/models/qwen_vl_vision.h b/src/models/qwen_vl_vision.h new file mode 100644 index 0000000000..5db7c17e95 --- /dev/null +++ b/src/models/qwen_vl_vision.h @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include + +#include "onnxruntime_api.h" + +namespace Generators { + +// Internal vision pipeline (no external DLL interface required after Python binding removal). +struct QwenVisionPipeline { + QwenVisionPipeline(OrtEnv& env, + const std::string& patch_embed_model, + const std::string& vision_attn_model, + const std::string& patch_merger_model, + int64_t spatial_merge_size, + bool use_qnn_attn = false, + const std::string& qnn_backend_path = "QnnHtp.dll", + int64_t patch_size = 14, + int64_t window_size = 56); + bool use_qnn_attn_{}; + std::string qnn_backend_path_{}; + + QwenVisionPipeline(const QwenVisionPipeline&) = delete; + QwenVisionPipeline& operator=(const QwenVisionPipeline&) = delete; + + // Run vision pipeline. + // pixel_values: float32 tensor with shape [S, C] or [B, C, H, W] depending on export (caller provides shape). + // grid_thw: optional grid dimensions [temporal, height, width] for dynamic window indexing + // The ONNX model is assumed to accept the provided shape directly as 'pixel_values'. + // Returns final merged embeddings (shape: [num_image_tokens, hidden_size]). + std::vector Run(const float* pixel_data, const std::vector& pixel_shape, + const std::vector& grid_thw = {}); + + // Shape info from last Run (seq_len, hidden_size). Returns empty vector if Run not called yet. + std::vector GetLastOutputShape() const { + if (last_seq_len_ <= 0 || last_hidden_size_ <= 0) return {}; + return {last_seq_len_, last_hidden_size_}; + } + + private: + // Internal helpers + std::unique_ptr CreateTensor(const float* data, size_t count, const std::vector& shape) const; + + // Calculate window indices dynamically based on grid dimensions + // Returns window_index (reordering indices for windowing) + std::vector CalculateWindowIndex(int64_t grid_t, int64_t grid_h, int64_t grid_w); + + std::unique_ptr patch_embed_session_; + std::unique_ptr vision_attn_session_; + std::unique_ptr patch_merger_session_; + + std::vector wnd_idx_; // window reordering indices (computed dynamically) + std::vector rev_idx_; // reverse ordering indices (argsort of wnd_idx) + int64_t spatial_merge_size_{}; + int64_t patch_size_{14}; // Vision patch size (typically 14) + int64_t window_size_{56}; // Window size for attention (typically 56) + OrtEnv& env_; + int64_t last_seq_len_{0}; + int64_t last_hidden_size_{0}; + + // Reusable buffers to avoid repeated allocation/deallocation + mutable std::vector pe_out_buf_; + mutable std::vector reordered_buf_; + mutable std::vector attn_out_buf_; + mutable std::vector merger_out_buf_; + mutable std::vector final_embeddings_buf_; +}; + +} // namespace Generators