diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index 64df7b7dcb2..96fd2c2404d 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -16,6 +16,7 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram


@@ -56,6 +57,18 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)

+        # Tag constant placeholders that have no users.
+        # tag_constant_data only tags constants whose users carry a delegation_tag,
+        # so a constant without users stays untagged; this partition needs every constant tagged.
+        for node in exported_program.graph.nodes:
+            if node.op == "placeholder" and (
+                is_param(exported_program, node)
+                or is_buffer(exported_program, node)
+                or is_lifted_tensor_constant(exported_program, node)
+            ):
+                if "delegation_tag" not in node.meta:
+                    node.meta["delegation_tag"] = tag
+
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
diff --git a/examples/models/whisper/CMakeLists.txt b/examples/models/whisper/CMakeLists.txt
new file mode 100644
index 00000000000..8c8a9cfcaa9
--- /dev/null
+++ b/examples/models/whisper/CMakeLists.txt
@@ -0,0 +1,97 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.24)
+project(whisper_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# gflags is built as part of the ExecuTorch build; point find_package() at its config
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# cpuinfo uses the Android-specific log library
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# Add the required ExecuTorch extensions for the LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Link CUDA backend
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
+
+if(EXECUTORCH_BUILD_METAL)
+  list(APPEND link_libraries metal_backend)
+  executorch_target_link_options_shared_lib(metal_backend)
+endif()
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(whisper_runner runner.cpp main.cpp)
+
+target_include_directories(whisper_runner PUBLIC ${_common_include_directories})
+
+target_link_libraries(whisper_runner PUBLIC ${link_libraries})
+target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/whisper/README.md b/examples/models/whisper/README.md
new file mode 100644
index 00000000000..bb8604b7818
--- /dev/null
+++ b/examples/models/whisper/README.md
@@ -0,0 +1,69 @@
+# Whisper Runner
+
+This directory hosts a lightweight C++ helper that drives Whisper models
+exported to ExecuTorch. The `WhisperRunner` owns the `Module` instance that
+wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
+`encoder` and `text_decoder` methods, and exposes a `transcribe()` loop that
+streams decoded text pieces through a callback.
+
+The runner assumes:
+- `model.pte` contains both Whisper encoder and decoder entry points named
+  `encoder` and `text_decoder`.
+- External parameters (for example KV cache blocks) are stored in a companion
+  `model.ptd`.
+- A tokenizer JSON compatible with the ExecuTorch tokenizers shim is available.
+
+Audio preprocessing is not part of the runner itself. To transform raw audio
+into the mel features expected by the encoder, reuse the pattern in
+`examples/models/voxtral/multimodal.cpp`, which loads a `preprocessor.pte`
+module to generate the spectrogram tensor.
+
+## Build
+
+```bash
+cmake -G Ninja \
+  -B cmake-out/examples/models/whisper \
+  -S examples/models/whisper
+cmake --build cmake-out/examples/models/whisper -j
+```
+
+The build produces a `whisper_runner` executable. To embed the runner in your
+own application, compile `runner.cpp` alongside your sources and link the
+standard ExecuTorch runtime libraries and the tokenizer target
+(`tokenizers::tokenizers`).
+
+## Usage
+
+```cpp
+#include <executorch/examples/models/whisper/runner.h>
+#include <iostream>
+
+using example::WhisperRunner;
+using example::WhisperTranscribeConfig;
+
+WhisperRunner runner("model.pte", "model.ptd", "tokenizer.json");
+ET_CHECK_OK(runner.load());
+
+// `features` is the mel spectrogram tensor produced by the preprocessor.
+executorch::extension::TensorPtr features = load_features_somehow();
+
+WhisperTranscribeConfig config;
+config.max_new_tokens = 128; // stop after 128 generated tokens
+config.temperature = 0.7f;   // optional: enable stochastic sampling
+
+auto tokens_result = runner.transcribe(
+    features,
+    config,
+    [](const std::string& piece) {
+      std::cout << piece;
+    });
+
+if (!tokens_result.ok()) {
+  ET_LOG(Error, "Transcription failed: %d", static_cast<int>(tokens_result.error()));
+}
+```
+
+`transcribe()` returns the full token history (prompt + generated tokens) and
+invokes the callback every time a new token is emitted. For an end-to-end
+check from the shell, the `main.cpp` driver in this directory exposes the same
+flow behind gflags; see the example invocation below.
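+
+The invocation below is a sketch: the flag names mirror the gflags defined in
+`main.cpp`, while the binary location and file paths are illustrative
+assumptions.
+
+```bash
+cmake-out/examples/models/whisper/whisper_runner \
+  --model_path model.pte \
+  --data_path model.ptd \
+  --tokenizer_path /path/to/tokenizer_dir \
+  --preprocessor_path preprocessor.pte \
+  --audio_path input.wav \
+  --max_new_tokens 128 \
+  --temperature 0.0
+```
+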
Provide a non-empty +`decoder_input_ids` vector if you want to seed the decoder with a custom prompt, +and override `WhisperTranscribeConfig::eos_token_ids` when the model exposes +custom termination ids. diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp new file mode 100644 index 00000000000..76d35a19c43 --- /dev/null +++ b/examples/models/whisper/main.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(__has_include) +#if __has_include() +#define ET_GEMMA3_HAS_CUDA_RUNTIME 1 +#include +#else +#define ET_GEMMA3_HAS_CUDA_RUNTIME 0 +#endif +#else +#define ET_GEMMA3_HAS_CUDA_RUNTIME 0 +#endif + +#include +#include +#include +#include +#include +#include +#include + +DEFINE_string(model_path, "model.pte", "Path to Whisper model (.pte)."); +DEFINE_string(data_path, "", "Optional path to Whisper weights (.ptd)."); +DEFINE_string( + tokenizer_path, + ".", + "Path to tokenizer directory containing tokenizer.json, tokenizer_config.json, and special_tokens_map.json."); +DEFINE_string( + preprocessor_path, + "", + "Path to preprocessor .pte for converting raw audio."); +DEFINE_string( + audio_path, + "", + "Path to input audio file. Accepts .wav or raw float .bin."); +DEFINE_double( + temperature, + 0.0, + "Sampling temperature. 0.0 performs greedy decoding."); +DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate."); + +namespace { + +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; + +#if ET_GEMMA3_HAS_CUDA_RUNTIME +class CudaMemoryTracker { + public: + CudaMemoryTracker() { + if (!query(&last_free_bytes_, &total_bytes_)) { + return; + } + available_ = true; + min_free_bytes_ = last_free_bytes_; + log_state("startup", last_free_bytes_, total_bytes_); + } + + void log_sample(const char* tag) { + if (!available_) { + return; + } + size_t free_bytes = 0; + size_t total_bytes = 0; + if (!query(&free_bytes, &total_bytes)) { + return; + } + min_free_bytes_ = std::min(min_free_bytes_, free_bytes); + total_bytes_ = total_bytes; + last_free_bytes_ = free_bytes; + log_state(tag, free_bytes, total_bytes); + } + + ~CudaMemoryTracker() { + if (!available_) { + return; + } + size_t free_bytes = 0; + size_t total_bytes = 0; + if (!query(&free_bytes, &total_bytes)) { + return; + } + min_free_bytes_ = std::min(min_free_bytes_, free_bytes); + total_bytes_ = total_bytes; + last_free_bytes_ = free_bytes; + const double peak_mb = + static_cast(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0); + const double total_mb = + static_cast(total_bytes_) / (1024.0 * 1024.0); + std::cout << "CUDA memory peak usage: " << peak_mb + << " MB, total: " << total_mb << " MB" << std::endl; + } + + private: + bool query(size_t* free_bytes, size_t* total_bytes) { + cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes); + if (err != cudaSuccess) { + if (!error_logged_) { + error_logged_ = true; + std::cerr << "Warning: cudaMemGetInfo failed with error: " + << cudaGetErrorString(err) << std::endl; + } + available_ = false; + return false; + } + return true; + } + + void log_state(const char* tag, size_t free_bytes, size_t total_bytes) const { + const double used_mb = + static_cast(total_bytes - free_bytes) / (1024.0 * 1024.0); + const double free_mb = 
static_cast(free_bytes) / (1024.0 * 1024.0); + const double total_mb = + static_cast(total_bytes) / (1024.0 * 1024.0); + std::cout << "CUDA memory (" << tag << "): used " << used_mb << " MB, free " + << free_mb << " MB, total " << total_mb << " MB" << std::endl; + } + + bool available_{false}; + bool error_logged_{false}; + size_t last_free_bytes_{0}; + size_t total_bytes_{0}; + size_t min_free_bytes_{std::numeric_limits::max()}; +}; +#else +class CudaMemoryTracker { + public: + CudaMemoryTracker() = default; + void log_sample(const char* tag) { + (void)tag; + } +}; +#endif + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + CudaMemoryTracker cuda_memory_tracker; + ::executorch::extension::TensorPtr features; + std::vector audio_data; + std::unique_ptr processor; + + if (FLAGS_audio_path.empty()) { + ET_LOG(Error, "audio_path flag must be provided."); + return 1; + } + + audio_data = + executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path); + ET_LOG( + Info, + "First 2 values of audio data: %f, %f", + audio_data[0], + audio_data[1]); + + processor = + std::make_unique(FLAGS_preprocessor_path, Module::LoadMode::Mmap); + auto load_error = processor->load(); + if (load_error != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load preprocessor module."); + return 1; + } + + auto audio_tensor = from_blob( + audio_data.data(), + {static_cast<::executorch::aten::SizesType>(audio_data.size())}, + ::executorch::aten::ScalarType::Float); + + auto processed_result = processor->execute("forward", audio_tensor); + if (processed_result.error() != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Audio preprocessing failed."); + return 1; + } + auto outputs = std::move(processed_result.get()); + if (outputs.empty() || !outputs[0].isTensor()) { + ET_LOG(Error, "Preprocessor returned unexpected outputs."); + return 1; + } + auto tensor = outputs[0].toTensor(); + ET_LOG( + Info, + "Result scalar_type: %s, first value %f", + ::executorch::runtime::toString(tensor.scalar_type()), + tensor.mutable_data_ptr()[0]); + features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor)); + + example::WhisperRunner runner( + FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path); + auto load_err = runner.load(); + if (load_err != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load Whisper model."); + return 1; + } + cuda_memory_tracker.log_sample("post-runner-load"); + + example::WhisperTranscribeConfig config; + config.max_new_tokens = FLAGS_max_new_tokens; + config.temperature = static_cast(FLAGS_temperature); + + std::string transcript; + auto result = + runner.transcribe(features, config, [&](const std::string& piece) { + ::executorch::extension::llm::safe_printf(piece.c_str()); + fflush(stdout); + }); + cuda_memory_tracker.log_sample("post-transcribe"); + + if (!result.ok()) { + ET_LOG(Error, "Transcription failed."); + return 1; + } + + return 0; +} diff --git a/examples/models/whisper/runner.cpp b/examples/models/whisper/runner.cpp new file mode 100644 index 00000000000..c1caa0a6349 --- /dev/null +++ b/examples/models/whisper/runner.cpp @@ -0,0 +1,323 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace example { +namespace { + +constexpr const char* kEncoderMethodName = "encoder"; +constexpr const char* kDecoderMethodName = "text_decoder"; +constexpr int64_t kDecoderStartTokenId = 50257; + +} // namespace + +WhisperRunner::WhisperRunner( + std::string module_path, + std::string data_path, + std::string tokenizer_path) + : module_path_(std::move(module_path)), + data_path_(std::move(data_path)), + tokenizer_path_(std::move(tokenizer_path)) { + if (data_path_.empty()) { + module_ = std::make_unique(module_path_, Module::LoadMode::Mmap); + } else { + module_ = std::make_unique( + module_path_, data_path_, Module::LoadMode::Mmap); + } +} + +bool WhisperRunner::is_loaded() const { + return module_ && encoder_method_loaded_ && decoder_method_loaded_ && + tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty(); +} + +Error WhisperRunner::ensure_tokenizer() { + if (tokenizer_ && tokenizer_->is_loaded()) { + return Error::Ok; + } + + auto tokenizer = load_tokenizer(tokenizer_path_); + if (!tokenizer) { + ET_LOG( + Error, "Failed to create tokenizer from %s", tokenizer_path_.c_str()); + return Error::Internal; + } + + tokenizer_ = std::move(tokenizer); + if (!tokenizer_->is_loaded()) { + ET_LOG( + Error, + "Tokenizer reported unloaded state after load: %s", + tokenizer_path_.c_str()); + return Error::Internal; + } + + eos_token_ids_.clear(); + eos_token_ids_.insert(static_cast(tokenizer_->eos_tok())); + return Error::Ok; +} + +const std::unordered_set& WhisperRunner::eos_token_ids() const { + return eos_token_ids_; +} + +Error WhisperRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + stats_.model_load_start_ms = ::executorch::extension::llm::time_in_ms(); + + ET_CHECK_OR_RETURN_ERROR( + module_ != nullptr, + InvalidArgument, + "Module handle is null for path %s", + module_path_.c_str()); + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load()); + + auto method_names_result = module_->method_names(); + ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error()); + const auto& method_names = method_names_result.get(); + + ET_CHECK_OR_RETURN_ERROR( + method_names.count(kEncoderMethodName) && + method_names.count(kDecoderMethodName), + InvalidArgument, + "Required methods not found. 
encoder=%d, text_decoder=%d", + static_cast(method_names.count(kEncoderMethodName)), + static_cast(method_names.count(kDecoderMethodName))); + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kEncoderMethodName)); + encoder_method_loaded_ = true; + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName)); + decoder_method_loaded_ = true; + + ET_CHECK_OK_OR_RETURN_ERROR(ensure_tokenizer()); + auto eos_ids = get_eos_ids(tokenizer_.get(), module_.get()); + if (!eos_ids.empty()) { + eos_token_ids_.clear(); + for (uint64_t eos_id : eos_ids) { + eos_token_ids_.insert(static_cast(eos_id)); + } + } + + stats_.model_load_end_ms = ::executorch::extension::llm::time_in_ms(); + + return Error::Ok; +} + +Result> WhisperRunner::transcribe( + ::executorch::extension::TensorPtr preprocessed_features, + WhisperTranscribeConfig config, + std::function token_callback) { + ET_CHECK_OR_RETURN_ERROR( + config.max_new_tokens > 0, + InvalidArgument, + "max_new_tokens must be positive, got %" PRId64, + config.max_new_tokens); + + ET_LOG( + Info, + "Preprocessed features shape: [%zu, %zu, %zu]", + static_cast(preprocessed_features->size(0)), + static_cast(preprocessed_features->size(1)), + static_cast(preprocessed_features->size(2))); + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + ::executorch::extension::llm::get_rss_bytes() / 1024.0 / 1024.0); + + // Reset internal state and start inference + stats_.inference_start_ms = ::executorch::extension::llm::time_in_ms(); + + const std::unordered_set* eos_tokens = &eos_token_ids(); + if (!config.eos_token_ids.empty()) { + eos_tokens = &config.eos_token_ids; + } + ET_CHECK_OR_RETURN_ERROR( + !eos_tokens->empty(), + InvalidArgument, + "EOS token set must not be empty."); + ::executorch::extension::llm::Sampler sampler( + tokenizer_->vocab_size(), config.temperature); + + // Check expected dtype for encoder input + auto encoder_method_meta_result = module_->method_meta(kEncoderMethodName); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_method_meta_result.error()); + auto encoder_method_meta = encoder_method_meta_result.get(); + + ::executorch::aten::ScalarType expected_dtype = + ::executorch::aten::ScalarType::Float; + if (encoder_method_meta.num_inputs() > 0) { + auto input_meta_result = encoder_method_meta.input_tensor_meta(0); + if (input_meta_result.error() == ::executorch::runtime::Error::Ok) { + expected_dtype = input_meta_result.get().scalar_type(); + } + } + + // Convert preprocessed_features to expected dtype if needed + if (preprocessed_features->scalar_type() != expected_dtype) { + if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) { + ET_LOG( + Info, + "Converting audio features from %s to BFloat16. 
Before converting, first value = %f", + ::executorch::runtime::toString(preprocessed_features->scalar_type()), + preprocessed_features->mutable_data_ptr()[0]); + auto convert_result = ::executorch::extension::llm::convert_to_bfloat16( + preprocessed_features); + ET_CHECK_OK_OR_RETURN_ERROR(convert_result.error()); + preprocessed_features = convert_result.get(); + ET_LOG( + Info, + "Conversion complete, first value = %f", + static_cast( + preprocessed_features + ->mutable_data_ptr<::executorch::aten::BFloat16>()[0])); + } + } + + auto encoder_result = + module_->execute(kEncoderMethodName, preprocessed_features); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error()); + + stats_.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms(); + stats_.num_prompt_tokens = 0; + + auto encoder_outputs = std::move(*encoder_result); + ET_CHECK_OR_RETURN_ERROR( + encoder_outputs.size() == 1 && encoder_outputs[0].isTensor(), + Internal, + "Encoder returned %zu outputs; expected a single tensor.", + encoder_outputs.size()); + + ::executorch::aten::Tensor encoder_output_tensor = + std::move(encoder_outputs[0]).toTensor(); + + ET_LOG( + Info, + "Encoder output shape: [%zu, %zu, %zu]", + static_cast(encoder_output_tensor.size(0)), + static_cast(encoder_output_tensor.size(1)), + static_cast(encoder_output_tensor.size(2))); + ET_LOG( + Info, + "Encoder first value: %f", + static_cast( + encoder_output_tensor + .mutable_data_ptr<::executorch::aten::BFloat16>()[0])); + + auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>( + std::move(encoder_output_tensor)); + + std::vector tokens = {kDecoderStartTokenId}; + + int64_t input_id = kDecoderStartTokenId; + int64_t cache_position = 0; + int64_t generated_tokens = 0; + bool first_token_generated = false; + auto decoder_input_ptr = ::executorch::extension::from_blob( + &input_id, + {static_cast<::executorch::aten::SizesType>(1), + static_cast<::executorch::aten::SizesType>(1)}, + ::executorch::aten::ScalarType::Long); + + auto cache_position_ptr = ::executorch::extension::from_blob( + &cache_position, + {static_cast<::executorch::aten::SizesType>(1)}, + ::executorch::aten::ScalarType::Long); + + std::vector<::executorch::runtime::EValue> decoder_inputs; + decoder_inputs.reserve(3); + decoder_inputs.emplace_back(decoder_input_ptr); + decoder_inputs.emplace_back(encoder_output_ptr); + decoder_inputs.emplace_back(cache_position_ptr); + // Add some green coloring for the first generated token + token_callback("\033[1;32m"); + while (generated_tokens < config.max_new_tokens) { + input_id = tokens.back(); + auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs); + ET_CHECK_OK_OR_RETURN_ERROR(decoder_result.error()); + + auto decoder_outputs = std::move(*decoder_result); + ET_CHECK_OR_RETURN_ERROR( + decoder_outputs.size() == 1 && decoder_outputs[0].isTensor(), + Internal, + "Decoder returned %zu outputs; expected a single tensor.", + decoder_outputs.size()); + + ::executorch::aten::Tensor logits_tensor = + std::move(decoder_outputs[0]).toTensor(); + const int64_t vocab_size = logits_tensor.numel(); + ET_CHECK_OR_RETURN_ERROR( + vocab_size > 0, Internal, "Decoder logits tensor is empty."); + + const int64_t next_token = static_cast( + logits_to_token(logits_tensor, config.temperature)); + + if (!first_token_generated) { + stats_.first_token_ms = ::executorch::extension::llm::time_in_ms(); + first_token_generated = true; + } + + const int64_t prev_token = input_id; + tokens.push_back(next_token); + ++generated_tokens; + 
++cache_position; + input_id = next_token; + + if (token_callback) { + auto piece_result = tokenizer_->decode( + static_cast(prev_token), static_cast(next_token)); + if (piece_result.ok()) { + token_callback(piece_result.get()); + } else { + ET_LOG( + Error, + "Tokenizer failed to decode token pair (%" PRId64 ", %" PRId64 + ") with error %d", + prev_token, + next_token, + static_cast(piece_result.error())); + } + } + + if (eos_tokens->count(next_token) > 0) { + break; + } + } + // Reset coloring + token_callback("\033[0m"); + // Update stats and print report + stats_.num_generated_tokens = generated_tokens; + stats_.inference_end_ms = ::executorch::extension::llm::time_in_ms(); + printf("\n"); + print_report(stats_); + + return tokens; +} + +} // namespace example diff --git a/examples/models/whisper/runner.h b/examples/models/whisper/runner.h new file mode 100644 index 00000000000..c578e42601e --- /dev/null +++ b/examples/models/whisper/runner.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +using ::executorch::extension::Module; +using ::executorch::extension::llm::get_eos_ids; +using ::executorch::extension::llm::load_tokenizer; +using ::executorch::extension::llm::print_report; +using ::executorch::extension::llm::Sampler; +using ::executorch::extension::llm::Stats; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +/** + * Configuration for the Whisper transcription loop. + * + * max_new_tokens controls the number of tokens generated after the prompt. + * Temperature controls the randomness of the output. + */ +struct WhisperTranscribeConfig { + int64_t max_new_tokens = 128; + std::unordered_set eos_token_ids = {}; + float temperature = 0.0f; +}; + +/** + * Runner that owns a Whisper encoder + decoder pair exported as a single + * ExecuTorch module. + * + * The module is expected to expose two callable methods: + * - "encoder": processes precomputed audio features into encoder states. + * - "text_decoder": consumes the decoder input ids, encoder output and cache + * positions to autoregressively generate logits. + */ +class WhisperRunner { + public: + WhisperRunner( + std::string module_path, + std::string data_path, + std::string tokenizer_path); + + /** + * Returns true when the module and tokenizer are ready for inference. + */ + bool is_loaded() const; + + /** + * Loads the module, validates required methods and initialises tokenizer. + */ + ::executorch::runtime::Error load(); + + /** + * Executes an end-to-end transcription cycle. + * + * @param preprocessed_features Audio features already processed by a + * preprocessor module (see voxtral example). + * @param config Controls generation length and termination criteria. + * @param token_callback Optional functor invoked for each decoded piece of + * text emitted during generation. + * + * @returns Result containing the final decoder token ids (including the seed + * prompt and generated tokens), or an error. 
+ */ + ::executorch::runtime::Result> transcribe( + ::executorch::extension::TensorPtr preprocessed_features, + WhisperTranscribeConfig config = {}, + std::function token_callback = {}); + + private: + ::executorch::runtime::Error ensure_tokenizer(); + const std::unordered_set& eos_token_ids() const; + + /** + * Sample the next token from the logits tensor. + * @param logits_tensor The logits tensor. + * @param temperature The temperature parameter used to control randomness in + * sampling. + * @return The next token. + */ + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + const float temperature = 0.0f) { + int32_t result = 0; + + // Create a minimal context for error handling in ET_SWITCH + struct { + [[noreturn]] void fail(torch::executor::Error /* error */) { + ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token"); + } + } ctx; + + ET_SWITCH_FOUR_TYPES( + Float, + Half, + BFloat16, + UInt16, + logits_tensor.scalar_type(), + ctx, + "logits_to_token", + CTYPE, + [&]() { + // If the logit_tensor rank is 3, the shape is [batch, seq_length, + // vocab_size], get the last logits, sample and return. Else the model + // outputs the last logit, directly sample and return. + auto* logits = logits_tensor.mutable_data_ptr(); + ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + if (logits_tensor.dim() == 3) { + auto num_tokens = logits_tensor.size(1); + logits += (num_tokens - 1) * vocab_size; + } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + Sampler sampler(vocab_size, temperature); + result = sampler.sample(logits); + }); + return result; + } + + std::string module_path_; + std::string data_path_; + std::string tokenizer_path_; + + std::unique_ptr module_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_set eos_token_ids_; + + bool encoder_method_loaded_ = false; + bool decoder_method_loaded_ = false; + + Stats stats_; +}; + +} // namespace example diff --git a/extension/llm/runner/wav_loader.h b/extension/llm/runner/wav_loader.h index f49a4d1723e..eba37947fff 100644 --- a/extension/llm/runner/wav_loader.h +++ b/extension/llm/runner/wav_loader.h @@ -168,18 +168,29 @@ inline std::vector load_wav_audio_data(const std::string& fp) { size_t data_offset = header->dataOffset; size_t data_size = header->Subchunk2Size; int bits_per_sample = header->bitsPerSample; + int audio_format = header->AudioFormat; std::vector audio_data; if (bits_per_sample == 32) { size_t num_samples = data_size / 4; audio_data.resize(num_samples); - const int32_t* input_buffer = - reinterpret_cast(data + data_offset); - for (size_t i = 0; i < num_samples; ++i) { - audio_data[i] = static_cast( - static_cast(input_buffer[i]) * kOneOverIntMax); + if (audio_format == 3) { + // IEEE float format - read directly as floats + const float* input_buffer = + reinterpret_cast(data + data_offset); + for (size_t i = 0; i < num_samples; ++i) { + audio_data[i] = input_buffer[i]; + } + } else { + // PCM integer format - normalize from int32 + const int32_t* input_buffer = + reinterpret_cast(data + data_offset); + for (size_t i = 0; i < num_samples; ++i) { + audio_data[i] = static_cast( + static_cast(input_buffer[i]) * kOneOverIntMax); + } } } else if (bits_per_sample == 16) { size_t num_samples = data_size / 2; diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index dab1a8ab176..9748a36d6e8 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -79,27 +79,27 @@ TensorPtr make_tensor_ptr( }); } } - 
std::vector<executorch::aten::StridesType> computed_strides(dim);
-
-  auto error = runtime::dim_order_to_stride(
-      sizes.data(), dim_order.data(), dim, computed_strides.data());
-  ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
-
-  if (!strides.empty()) {
-    for (size_t i = 0; i < dim; i++) {
-      ET_CHECK_MSG(
-          strides[i] == computed_strides[i] || sizes[i] == 1,
-          "invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
-          "!= %" ET_PRI_SIZES_AND_STRIDES
-          " while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
-          i,
-          strides[i],
-          computed_strides[i],
-          sizes[i]);
-    }
-  }
-
-  strides = std::move(computed_strides);
+  // std::vector<executorch::aten::StridesType> computed_strides(dim);
+
+  // auto error = runtime::dim_order_to_stride(
+  //     sizes.data(), dim_order.data(), dim, computed_strides.data());
+  // ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
+
+  // if (!strides.empty()) {
+  //   for (size_t i = 0; i < dim; i++) {
+  //     ET_CHECK_MSG(
+  //         strides[i] == computed_strides[i] || sizes[i] == 1,
+  //         "invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
+  //         "!= %" ET_PRI_SIZES_AND_STRIDES
+  //         " while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
+  //         i,
+  //         strides[i],
+  //         computed_strides[i],
+  //         sizes[i]);
+  //   }
+  // }
+
+  // strides = std::move(computed_strides);
 #ifndef USE_ATEN_LIB
   executorch::aten::TensorImpl tensor_impl(