Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;245f6667babf9668b862ac4513c69ea95117c295
onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;301b442d8f903daba129e825cd446755b840abb0

# These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
Expand Down
12 changes: 12 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,8 @@ struct VisionInputs_Element : JSON::Element {
v_.pixel_values = JSON::Get<std::string_view>(value);
} else if (name == "image_sizes") {
v_.image_sizes = JSON::Get<std::string_view>(value);
} else if (name == "image_grid_thw") {
v_.image_grid_thw = JSON::Get<std::string_view>(value);
} else if (name == "attention_mask") {
v_.attention_mask = JSON::Get<std::string_view>(value);
} else {
Expand Down Expand Up @@ -651,6 +653,10 @@ struct Vision_Element : JSON::Element {
v_.config_filename = JSON::Get<std::string_view>(value);
} else if (name == "adapter_filename") {
v_.adapter_filename = JSON::Get<std::string_view>(value);
} else if (name == "spatial_merge_size") {
v_.spatial_merge_size = static_cast<int>(JSON::Get<double>(value));
} else if (name == "tokens_per_second") {
v_.tokens_per_second = static_cast<float>(JSON::Get<double>(value));
} else {
throw JSON::unknown_value_error{};
}
Expand Down Expand Up @@ -856,6 +862,12 @@ struct Model_Element : JSON::Element {
v_.decoder_start_token_id = static_cast<int>(JSON::Get<double>(value));
} else if (name == "sep_token_id") {
v_.sep_token_id = static_cast<int>(JSON::Get<double>(value));
} else if (name == "image_token_id") {
v_.image_token_id = static_cast<int>(JSON::Get<double>(value));
} else if (name == "video_token_id") {
v_.video_token_id = static_cast<int>(JSON::Get<double>(value));
} else if (name == "vision_start_token_id") {
v_.vision_start_token_id = static_cast<int>(JSON::Get<double>(value));
} else {
throw JSON::unknown_value_error{};
}
Expand Down
12 changes: 12 additions & 0 deletions src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct Config {
// Vision encoder names
static constexpr std::string_view PixelValuesName = "pixel_values";
static constexpr std::string_view ImageSizesName = "image_sizes";
static constexpr std::string_view ImageGridThwName = "image_grid_thw";
static constexpr std::string_view ImageAttentionMaskName = "image_attention_mask";
static constexpr std::string_view ImageFeaturesName = "image_features";
static constexpr std::string_view NumImageTokens = "num_image_tokens";
Expand Down Expand Up @@ -106,6 +107,12 @@ struct Config {
int bos_token_id{}; // The id of the beginning-of-stream token.
int sep_token_id{}; // The id of the separation token.
int decoder_start_token_id{}; // If an encoder-decoder model starts decoding with a different token than bos, the id of that token.

// Qwen2-VL specific token IDs
int image_token_id{};
int video_token_id{};
int vision_start_token_id{};

int vocab_size{};
int context_length{};

Expand Down Expand Up @@ -159,9 +166,14 @@ struct Config {
std::string config_filename{"processor_config.json"};
std::optional<std::string> adapter_filename{};

// Qwen2-VL specific vision config values
int spatial_merge_size{2};
float tokens_per_second{2.0f};

struct Inputs {
std::string pixel_values{Defaults::PixelValuesName};
std::string image_sizes{Defaults::ImageSizesName};
std::string image_grid_thw{Defaults::ImageGridThwName};
std::string attention_mask{Defaults::ImageAttentionMaskName}; // image attention mask
} inputs;

Expand Down
4 changes: 3 additions & 1 deletion src/models/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1288,7 +1288,9 @@ MultiModalProcessor::MultiModalProcessor(Config& config, const SessionInfo& sess
{"phi3v", Processor::Create<PhiImageProcessor>},
{"whisper", Processor::Create<WhisperProcessor>},
{"phi4mm", Processor::Create<PhiMultiModalProcessor>},
{"gemma3", Processor::Create<GemmaImageProcessor>}} {
{"gemma3", Processor::Create<GemmaImageProcessor>},
{"qwen2vl", Processor::Create<QwenImageProcessor>},
{"qwen2_5_vl", Processor::Create<QwenImageProcessor>}} {
auto processor = processor_factory_.find(config.model.type);
if (processor != processor_factory_.end()) {
processor_ = processor->second(config, session_info);
Expand Down
3 changes: 2 additions & 1 deletion src/models/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "whisper_processor.h"
#include "phi_multimodal_processor.h"
#include "gemma_image_processor.h"
#include "qwen_image_processor.h"
#include "adapters.h"
#include "extra_outputs.h"

Expand Down Expand Up @@ -176,4 +177,4 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model>, External
std::map<std::string, std::unique_ptr<OrtSessionOptions>> pipeline_session_options_;
};

} // namespace Generators
} // namespace Generators
7 changes: 6 additions & 1 deletion src/models/model_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,15 @@ struct ModelType {

inline static bool IsVLM(const std::string& model_type) {
// Vision-language model (VLM)
static constexpr std::array<std::string_view, 2> VLM = {"gemma3", "phi3v"};
static constexpr std::array<std::string_view, 4> VLM = {"gemma3", "phi3v", "qwen2vl", "qwen2_5_vl"};
return std::find(VLM.begin(), VLM.end(), model_type) != VLM.end();
}

inline static bool IsQwen2VL(const std::string& model_type) {
  // Qwen2-VL family check (used for the 3D position IDs path).
  // Matches the exact, case-sensitive model type strings listed below.
  static constexpr std::array<std::string_view, 2> QWEN2_VL = {"qwen2vl", "qwen2_5_vl"};
  return std::find(QWEN2_VL.begin(), QWEN2_VL.end(), model_type) != QWEN2_VL.end();
}

inline static bool IsALM(const std::string& model_type) {
// Audio-language model (ALM)
static constexpr std::array<std::string_view, 1> ALM = {"whisper"};
Expand Down
38 changes: 33 additions & 5 deletions src/models/multi_modal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "../generators.h"
#include "multi_modal.h"
#include <numeric>

namespace Generators {

Expand Down Expand Up @@ -178,12 +179,13 @@ DeviceSpan<float> EmbeddingState::Run(int current_length, DeviceSpan<int32_t>& n
return {};
}

DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params)
DecoderState::DecoderState(const MultiModalLanguageModel& model, DeviceSpan<int32_t> sequence_lengths,
const GeneratorParams& params)
: State{params, model},
model_{model},
position_inputs_{model, *this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask} {
position_inputs_{CreatePositionInputs(*this, sequence_lengths, model_.config_->model.decoder.inputs.attention_mask)} {
inputs_embeds_.Add();
position_inputs_.Add();
position_inputs_->Add();
logits_.Add();
kv_cache_.Add();
}
Expand All @@ -201,7 +203,14 @@ DeviceSpan<float> DecoderState::Run(int current_length, DeviceSpan<int32_t>& nex
void DecoderState::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int total_length, DeviceSpan<int32_t> beam_indices) {
int batch_size = static_cast<int>(inputs_embeds_.GetShape()[0]);
size_t new_length = next_tokens.size() / batch_size;
position_inputs_.Update(next_tokens, total_length, static_cast<int>(new_length));
position_inputs_->Update(next_tokens, total_length, static_cast<int>(new_length));
kv_cache_.Update(beam_indices, total_length);
logits_.Update(next_tokens, new_length);
inputs_embeds_.UpdateSequenceLength(new_length);
}

// Overload for pipeline to call
void DecoderState::UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int total_length, DeviceSpan<int32_t> beam_indices, size_t new_length) {
kv_cache_.Update(beam_indices, total_length);
logits_.Update(next_tokens, new_length);
inputs_embeds_.UpdateSequenceLength(new_length);
Expand Down Expand Up @@ -243,6 +252,25 @@ void MultiModalPipelineState::SetExtraInputs(const std::vector<ExtraInput>& extr
speech_state_->SetExtraInputs(extra_inputs, num_audio_tokens_);
}
embedding_state_->SetExtraInputs(num_images_, num_image_tokens_, num_audio_tokens_);

// Set the grid tensors for Qwen2-VL if present
if (auto* qwen_pos_inputs = dynamic_cast<Qwen2VLPositionInputs*>(decoder_state_->position_inputs_.get())) {
std::shared_ptr<Tensor> img_grid, vid_grid, sec_grid;

for (const auto& input : extra_inputs) {
if (input.name == Config::Defaults::ImageGridThwName) {
img_grid = input.tensor;
} else if (input.name == "video_grid_thw") {
vid_grid = input.tensor;
} else if (input.name == "second_per_grid_ts") {
sec_grid = input.tensor;
}
}

if (img_grid || vid_grid) {
qwen_pos_inputs->SetGridTensors(img_grid, vid_grid, sec_grid);
}
}
}

DeviceSpan<float> MultiModalPipelineState::Run(int current_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
Expand Down Expand Up @@ -357,4 +385,4 @@ OrtValue* MultiModalPipelineState::GetOutput(const char* name) {
return State::GetOutput(name);
};

} // namespace Generators
} // namespace Generators
13 changes: 7 additions & 6 deletions src/models/multi_modal.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ struct MultiModalLanguageModel : Model {
MultiModalLanguageModel(const MultiModalLanguageModel&) = delete;
MultiModalLanguageModel& operator=(const MultiModalLanguageModel&) = delete;

std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params) const;
std::unique_ptr<State> CreateState(DeviceSpan<int32_t> sequence_lengths, const GeneratorParams& params) const override;

std::unique_ptr<OrtSession> vision_session_; // pixel_values, [image_attention_mask], image_sizes -> image_features
std::unique_ptr<OrtSession> speech_session_; // audio_embeds, audio_sizes, audio_projection_mode -> audio_features
Expand Down Expand Up @@ -96,18 +96,19 @@ struct DecoderState : State {
DecoderState& operator=(const DecoderState&) = delete;

DeviceSpan<float> Run(int current_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) override;
void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices);

private:
friend struct MultiModalPipelineState;

void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices);
void UpdateInputsOutputs(DeviceSpan<int32_t>& next_tokens, int current_length, DeviceSpan<int32_t> beam_indices, size_t new_length);

const MultiModalLanguageModel& model_;
Embeddings inputs_embeds_{*this, Embeddings::Mode::Input, // Model input
model_.config_->model.decoder.inputs.embeddings};
DefaultPositionInputs position_inputs_; // Model input
DefaultKeyValueCache kv_cache_{*this}; // Model input
Logits logits_{*this}; // Model output
std::unique_ptr<PositionInputs> position_inputs_; // Model input
DefaultKeyValueCache kv_cache_{*this}; // Model input
Logits logits_{*this}; // Model output
};

struct MultiModalPipelineState : State {
Expand Down Expand Up @@ -144,4 +145,4 @@ struct MultiModalPipelineState : State {
const std::string speech_adapter_name_{"speech"};
};

} // namespace Generators
} // namespace Generators
Loading
Loading