16 changes: 12 additions & 4 deletions extension/llm/runner/multimodal_decoder_runner.h
@@ -28,8 +28,12 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
executorch::extension::TensorPtr& tokens,
int64_t start_pos) override {
// run token embedding
-    auto token_embedding_outputs =
-        ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens));
+    auto token_embedding_result =
+        module_->execute(kTokenEmbeddingMethod, tokens);
+    if (!token_embedding_result.ok()) {
+      return token_embedding_result.error();
+    }
+    auto token_embedding_outputs = std::move(*token_embedding_result);

// Return the logits tensor
return decode(token_embedding_outputs[0], start_pos);
@@ -47,8 +51,12 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
auto start_pos_tensor = ::executorch::extension::from_blob(
&start_pos, {1}, executorch::aten::ScalarType::Long);
// run text model
-    auto outputs_res = ET_UNWRAP(
-        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}));
+    auto outputs_result =
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor});
+    if (!outputs_result.ok()) {
+      return outputs_result.error();
+    }
+    auto outputs_res = std::move(*outputs_result);

ET_CHECK_MSG(
outputs_res.size() == 1,
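The two hunks above establish the pattern repeated throughout this PR: instead of ET_UNWRAP, which hides an early return inside a macro, each call site binds the Result<T>, checks ok(), propagates error(), and only then unwraps the value. Below is a minimal self-contained sketch of both variants used in the diff, assuming the executorch Result/Error headers named in the includes; compute_value() is a hypothetical stand-in for calls like module_->execute(...).

#include <utility>

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>

using executorch::runtime::Error;
using executorch::runtime::Result;

// Hypothetical callee, standing in for module_->execute(...) above.
Result<int> compute_value() {
  return 42;
}

Error manual_check() {
  auto value_result = compute_value();
  // Variant 1 (used in headers such as multimodal_decoder_runner.h):
  // explicit check, then move the value out of the Result.
  if (!value_result.ok()) {
    return value_result.error();
  }
  auto value = std::move(*value_result);
  (void)value; // ... use value ...
  return Error::Ok;
}

Error macro_check() {
  auto value_result = compute_value();
  // Variant 2 (used in the .cpp files below): the macro returns early when
  // the wrapped error code is not Error::Ok, with an optional log message.
  ET_CHECK_OK_OR_RETURN_ERROR(value_result.error(), "compute_value failed");
  auto value = value_result.get();
  (void)value;
  return Error::Ok;
}

Either variant behaves like the old ET_UNWRAP call; the explicit form just keeps the control flow visible at the call site.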
80 changes: 54 additions & 26 deletions extension/llm/runner/multimodal_prefiller.cpp
@@ -42,18 +42,21 @@ Result<uint64_t> MultimodalPrefiller::prefill(
if (input.is_image()) {
const Image& image = input.get_image();

-    auto method_meta = ET_UNWRAP(
-        module_->method_meta(kVisionEncoderMethod),
+    auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        method_meta_result.error(),
         "Failed to get method_meta for %s",
         kVisionEncoderMethod);
+    auto method_meta = method_meta_result.get();

ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Image encoder should have at least 1 input");
-    auto input_meta = ET_UNWRAP(
-        method_meta.input_tensor_meta(0),
-        "Cannot get input tensor meta at index 0");
+    auto input_meta_result = method_meta.input_tensor_meta(0);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        input_meta_result.error(), "Cannot get input tensor meta at index 0");
+    auto input_meta = input_meta_result.get();
auto expected_dtype = input_meta.scalar_type();

if (expected_dtype == ::executorch::aten::ScalarType::Float) {
@@ -77,47 +80,58 @@ Result<uint64_t> MultimodalPrefiller::prefill(
// The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
// tensor (CHW). Add a batch dimension of 1 if needed.
auto expected_dims = input_meta.sizes();
-    auto image_tensor = ET_UNWRAP(
-        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
-        "Failed to convert image to tensor");
+    auto image_tensor_result =
+        image.toTensor(/*with_batch*/ expected_dims.size() == 4);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        image_tensor_result.error(), "Failed to convert image to tensor");
+    auto image_tensor = image_tensor_result.get();
ET_LOG(
Info,
"Image tensor dim: %zu, dtype: %s",
image_tensor->dim(),
::executorch::runtime::toString(image_tensor->scalar_type()));
// Run image encoder
-    auto image_encoder_outputs =
-        ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
+    auto image_encoder_result =
+        module_->execute(kVisionEncoderMethod, image_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(image_encoder_result.error());
+    auto image_encoder_outputs = image_encoder_result.get();

encoder_output = image_encoder_outputs[0];
} else if (input.is_audio()) {
const Audio& audio = input.get_audio();

-    auto method_meta = ET_UNWRAP(
-        module_->method_meta(kAudioEncoderMethod),
+    auto method_meta_result = module_->method_meta(kAudioEncoderMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        method_meta_result.error(),
         "Failed to get method_meta for %s",
         kAudioEncoderMethod);
+    auto method_meta = method_meta_result.get();

ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Audio encoder should have at least 1 input");
-    auto input_meta = ET_UNWRAP(
-        method_meta.input_tensor_meta(0),
-        "Cannot get input tensor meta at index 0");
+    auto input_meta_result = method_meta.input_tensor_meta(0);
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        input_meta_result.error(), "Cannot get input tensor meta at index 0");
+    auto input_meta = input_meta_result.get();
auto expected_dtype = input_meta.scalar_type();

// Create tensor with original dtype
-    auto audio_tensor =
-        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    auto audio_tensor_result = audio.toTensor();
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        audio_tensor_result.error(), "Failed to convert audio to tensor");
+    auto audio_tensor = audio_tensor_result.get();

// Convert to expected dtype if needed
if (audio_tensor->scalar_type() != expected_dtype) {
if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
// Convert to bfloat16
-        audio_tensor = ET_UNWRAP(
-            convert_to_bfloat16(audio_tensor),
+        auto convert_result = convert_to_bfloat16(audio_tensor);
+        ET_CHECK_OK_OR_RETURN_ERROR(
+            convert_result.error(),
             "Failed to convert audio tensor to bfloat16");
+        audio_tensor = convert_result.get();
} else {
ET_CHECK_OR_RETURN_ERROR(
false,
Expand Down Expand Up @@ -147,7 +161,15 @@ Result<uint64_t> MultimodalPrefiller::prefill(
std::vector<uint64_t> tokens;
if (input.is_text()) {
auto& text = input.get_text();
-      tokens = ET_UNWRAP_TOKENIZER(tokenizer_->encode(text));
+      auto encode_result = tokenizer_->encode(text);
+      if (!encode_result.ok()) {
+        ET_LOG(
+            Error,
+            "Tokenizers error code %d",
+            static_cast<uint32_t>(encode_result.error()));
+        return ::executorch::runtime::Error::InvalidArgument;
+      }
+      tokens = std::move(*encode_result);
} else {
tokens = input.get_tokens();
}
@@ -158,8 +180,10 @@ Result<uint64_t> MultimodalPrefiller::prefill(
::executorch::aten::ScalarType::Long);

// Run text encoder (token embeddings)
-    auto token_embedding_outputs =
-        ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor));
+    auto token_embedding_result =
+        module_->execute(kTokenEmbeddingMethod, text_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(token_embedding_result.error());
+    auto token_embedding_outputs = token_embedding_result.get();

encoder_output = token_embedding_outputs[0];
} else {
@@ -180,8 +204,10 @@ Result<uint64_t> MultimodalPrefiller::prefill(
}
std::vector<int64_t> cache_positions;

-  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod));
+  auto cache_position_result = populate_start_pos_or_cache_position(
+      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
+  ET_CHECK_OK_OR_RETURN_ERROR(cache_position_result.error());
+  auto cache_position_tensor = cache_position_result.get();

auto prefill_result = module_->execute(
kTextModelMethod, {encoder_output, cache_position_tensor});
@@ -217,8 +243,10 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));

-  std::unordered_set<std::string> methods =
-      ET_UNWRAP(module_->method_names(), "Failed to get method names");
+  auto method_names_result = module_->method_names();
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      method_names_result.error(), "Failed to get method names");
+  std::unordered_set<std::string> methods = method_names_result.get();

// Load image_encoder method if exists.
if (methods.find(kVisionEncoderMethod) != methods.end()) {
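The tokenizer hunks above are the one place where two error domains meet: tokenizer_->encode() returns a tokenizers-side Result whose error enum is unrelated to ::executorch::runtime::Error, so the new code logs the tokenizer's raw error code and returns Error::InvalidArgument instead of propagating it directly. Here is that mapping factored into a helper as a sketch; decode_to_runtime_result is a hypothetical name, not part of this PR, and the tokenizers include path is an assumption.

#include <cstdint>
#include <string>
#include <utility>

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/log.h>
#include <pytorch/tokenizers/tokenizer.h>

using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

// Hypothetical helper: collapses any tokenizer failure into
// Error::InvalidArgument after logging the library's own error code,
// mirroring the inline handling in the hunks above.
Result<std::string> decode_to_runtime_result(
    ::tokenizers::Tokenizer& tokenizer,
    uint64_t prev_token,
    uint64_t cur_token) {
  auto decode_result = tokenizer.decode(prev_token, cur_token);
  if (!decode_result.ok()) {
    ET_LOG(
        Error,
        "Tokenizers error code %d",
        static_cast<uint32_t>(decode_result.error()));
    return Error::InvalidArgument;
  }
  // The tokenizers Result and the runtime Result are distinct types;
  // returning the unwrapped std::string constructs the runtime-side Result.
  return std::move(*decode_result);
}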
31 changes: 25 additions & 6 deletions extension/llm/runner/multimodal_runner.cpp
@@ -67,7 +67,10 @@ Error MultimodalRunner::prefill(const std::vector<MultimodalInput>& inputs) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
}
for (auto& input : inputs) {
-    ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
+    auto prefill_result = multimodal_prefiller_->prefill(input, pos_);
+    if (!prefill_result.ok()) {
+      return prefill_result.error();
+    }
}
return Error::Ok;
}
@@ -125,15 +128,27 @@ Error MultimodalRunner::generate(
if (config.echo && i == inputs.size() - 1 && input.is_text()) {
wrapped_callback(input.get_text());
}
-    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
+    auto prefill_result = multimodal_prefiller_->prefill(input, pos_);
+    if (!prefill_result.ok()) {
+      return prefill_result.error();
+    }
+    prefill_next_token = prefill_result.get();
}

stats_->first_token_ms = time_in_ms();
stats_->prompt_eval_end_ms = time_in_ms();
stats_->num_prompt_tokens = pos_;

-  wrapped_callback(ET_UNWRAP_TOKENIZER(
-      tokenizer_->decode(prefill_next_token, prefill_next_token)));
+  auto decode_result =
+      tokenizer_->decode(prefill_next_token, prefill_next_token);
+  if (!decode_result.ok()) {
+    ET_LOG(
+        Error,
+        "Tokenizers error code %d",
+        static_cast<uint32_t>(decode_result.error()));
+    return Error::InvalidArgument;
+  }
+  wrapped_callback(std::move(*decode_result));

RUNNER_ET_LOG(
config.warming,
@@ -160,13 +175,17 @@ Error MultimodalRunner::generate(

// Generate tokens using the text token generator
std::vector<uint64_t> prompt_tokens = {prefill_next_token};
-  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
+  auto generate_result = text_token_generator_->generate(
/*tokens=*/prompt_tokens,
/*start_pos=*/pos_,
/*max_new_tokens=*/max_new_tokens -
1, // Subtract 1 because prefill already generated 1 token
/*temperature=*/config.temperature,
-      /*token_callback=*/wrapped_callback));
+      /*token_callback=*/wrapped_callback);
+  if (!generate_result.ok()) {
+    return generate_result.error();
+  }
+  int64_t num_generated_tokens = generate_result.get();

pos_ += num_generated_tokens;
// Update stats
14 changes: 11 additions & 3 deletions extension/llm/runner/text_decoder_runner.cpp
@@ -32,15 +32,23 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
TensorPtr& tokens,
int64_t start_pos) {
// ET_LOG(Info, "Input token %" PRIu64, input_token);
-  auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
+  auto method_meta_result = module_->method_meta("forward");
+  if (!method_meta_result.ok()) {
+    return method_meta_result.error();
+  }
+  auto method_meta = std::move(*method_meta_result);
// If only 1 input, we are not using kv cache
bool use_kv_cache = method_meta.num_inputs() > 1;

std::vector<int64_t> cache_positions;

if (use_kv_cache) {
-    auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
-        module_, start_pos, cache_positions, tokens->numel(), "forward"));
+    auto start_pos_tensor_result = populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, tokens->numel(), "forward");
+    if (!start_pos_tensor_result.ok()) {
+      return start_pos_tensor_result.error();
+    }
+    auto start_pos_tensor = std::move(*start_pos_tensor_result);

std::vector<runtime::EValue> inputs;
auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
19 changes: 15 additions & 4 deletions extension/llm/runner/text_llm_runner.cpp
@@ -170,8 +170,15 @@ Error TextLLMRunner::generate(
stats_->prompt_eval_end_ms = time_in_ms();

// print the first token from prefill. No prev_token so use cur_token for it.
-  wrapped_callback(
-      ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
+  auto decode_result = tokenizer_->decode(cur_token, cur_token);
+  if (!decode_result.ok()) {
+    ET_LOG(
+        Error,
+        "Tokenizers error code %d",
+        static_cast<uint32_t>(decode_result.error()));
+    return ::executorch::runtime::Error::InvalidArgument;
+  }
+  wrapped_callback(std::move(*decode_result));
RUNNER_ET_LOG(
config.warming,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
@@ -181,12 +188,16 @@ Error TextLLMRunner::generate(
prompt_tokens.push_back(cur_token);

// Generate max_new_tokens - 1 because prefill already generated 1 token.
-  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
+  auto generate_result = text_token_generator_->generate(
prompt_tokens,
num_prompt_tokens,
max_new_tokens - 1,
temperature_ == -1.0f ? config.temperature : temperature_,
-      wrapped_callback));
+      wrapped_callback);
+  if (!generate_result.ok()) {
+    return generate_result.error();
+  }
+  int64_t num_generated_tokens = generate_result.get();

stats_->inference_end_ms = time_in_ms();
if (!config.warming) {
13 changes: 10 additions & 3 deletions extension/llm/runner/text_prefiller.cpp
@@ -105,8 +105,11 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(

// run the first token and get back logits tensor. Assuming the first token
// is bos so don't callback.
-  auto logits_tensor =
-      ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
+  auto logits_result = text_decoder_runner_->step(tokens, start_pos);
+  if (!logits_result.ok()) {
+    return logits_result.error();
+  }
+  auto logits_tensor = std::move(*logits_result);

pos += 1; // start the loop from index 1
start_pos += 1;
@@ -116,7 +119,11 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
// NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
cur_token = prompt_tokens[pos];

-    logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
+    auto step_result = text_decoder_runner_->step(tokens, start_pos);
+    if (!step_result.ok()) {
+      return step_result.error();
+    }
+    logits_tensor = std::move(*step_result);

pos++;
start_pos++;
11 changes: 9 additions & 2 deletions extension/llm/runner/text_token_generator.h
@@ -110,8 +110,15 @@ class ET_EXPERIMENTAL TextTokenGenerator {
}

// print the token as string, decode it with the Tokenizer object
-      token_callback(
-          ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
+      auto decode_result = tokenizer_->decode(prev_token, cur_token);
+      if (!decode_result.ok()) {
+        ET_LOG(
+            Error,
+            "Tokenizers error code %d",
+            static_cast<uint32_t>(decode_result.error()));
+        return ::executorch::runtime::Error::InvalidArgument;
+      }
+      token_callback(std::move(*decode_result));

if (should_stop_) {
break;
12 changes: 10 additions & 2 deletions extension/llm/runner/util.h
@@ -116,8 +116,16 @@ inline runtime::Result<TensorPtr> populate_start_pos_or_cache_position(
const char* method_name = "forward") {
// Get expected shape of cache position tensor, which should be the second
// argument
-  auto method_meta = ET_UNWRAP(module->method_meta(method_name));
-  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+  auto method_meta_result = module->method_meta(method_name);
+  if (!method_meta_result.ok()) {
+    return method_meta_result.error();
+  }
+  auto method_meta = std::move(*method_meta_result);
+  auto second_input_info_result = method_meta.input_tensor_meta(1);
+  if (!second_input_info_result.ok()) {
+    return second_input_info_result.error();
+  }
+  auto second_input_info = std::move(*second_input_info_result);
auto second_input_sizes = second_input_info.sizes();
auto numel = second_input_sizes[0];

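One detail makes the hunk above compile: populate_start_pos_or_cache_position returns runtime::Result<TensorPtr>, yet it returns method_meta_result.error(), which came from a Result holding a MethodMeta. That works because a bare Error implicitly constructs a Result<U> in its error state for any U, which is what every "return foo_result.error();" in this PR relies on. A minimal sketch of that conversion, with hypothetical callees:

#include <executorch/runtime/core/result.h>

using executorch::runtime::Error;
using executorch::runtime::Result;

// Hypothetical callees returning Results of different value types.
Result<int> get_count() {
  return 3;
}

Result<double> get_scale() {
  return 2.5;
}

Result<double> scaled_count() {
  auto count_result = get_count();
  if (!count_result.ok()) {
    // An Error taken from a Result<int> propagates out of a function
    // returning Result<double>: Result<U> converts implicitly from Error.
    return count_result.error();
  }
  auto scale_result = get_scale();
  if (!scale_result.ok()) {
    return scale_result.error();
  }
  return static_cast<double>(*count_result) * *scale_result;
}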