diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index 64df7b7dcb2..96fd2c2404d 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -16,6 +16,7 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram


@@ -56,6 +57,18 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)

+        # Tag constant placeholders that have no users.
+        # tag_constant_data only tags constants whose users carry a delegation_tag,
+        # so a constant without users stays untagged; this partition needs every constant tagged.
+        for node in exported_program.graph.nodes:
+            if node.op == "placeholder" and (
+                is_param(exported_program, node)
+                or is_buffer(exported_program, node)
+                or is_lifted_tensor_constant(exported_program, node)
+            ):
+                if "delegation_tag" not in node.meta:
+                    node.meta["delegation_tag"] = tag
+
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
diff --git a/examples/models/whisper/CMakeLists.txt b/examples/models/whisper/CMakeLists.txt
new file mode 100644
index 00000000000..8c8a9cfcaa9
--- /dev/null
+++ b/examples/models/whisper/CMakeLists.txt
@@ -0,0 +1,97 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.24)
+project(whisper_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# gflags is built as part of the ExecuTorch build; point find_package() at its config
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# cpuinfo uses the Android-specific log library
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# Add the required ExecuTorch extensions for the LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Link CUDA backend
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
+
+if(EXECUTORCH_BUILD_METAL)
+  list(APPEND link_libraries metal_backend)
+  executorch_target_link_options_shared_lib(metal_backend)
+endif()
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(whisper_runner runner.cpp main.cpp)
+
+target_include_directories(whisper_runner PUBLIC ${_common_include_directories})
+
+target_link_libraries(whisper_runner PUBLIC ${link_libraries})
+target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/whisper/README.md b/examples/models/whisper/README.md
new file mode 100644
index 00000000000..bb8604b7818
--- /dev/null
+++ b/examples/models/whisper/README.md
@@ -0,0 +1,69 @@
+# Whisper Runner
+
+This directory hosts a lightweight C++ helper that drives Whisper models
+exported to ExecuTorch. The `WhisperRunner` owns the `Module` instance that
+wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
+`encoder` and `text_decoder` methods, and exposes a `transcribe()` loop that
+streams decoded text pieces through a callback.
+
+The runner assumes:
+- `model.pte` contains both Whisper encoder and decoder entry points named
+  `encoder` and `text_decoder`.
+- External parameters (for example KV cache blocks) are stored in a companion
+  `model.ptd`.
+- A tokenizer JSON compatible with the ExecuTorch tokenizers shim is available.
+
+Audio preprocessing is not part of the runner itself. To transform raw audio
+into the mel features expected by the encoder, reuse the pattern in
+`examples/models/voxtral/multimodal.cpp`, which loads a `preprocessor.pte`
+module to generate the spectrogram tensor.
+
+## Build
+
+```bash
+cmake -G Ninja \
+  -B cmake-out/examples/models/whisper \
+  -S examples/models/whisper
+cmake --build cmake-out/examples/models/whisper -j
+```
+
+The build produces a `whisper_runner` executable. To embed the runner in your
+own application, compile `runner.cpp` alongside your sources and link the
+standard ExecuTorch runtime libraries and the tokenizer target
+(`tokenizers::tokenizers`).
+
+## Usage
+
+```cpp
+#include <executorch/examples/models/whisper/runner.h>
+#include <iostream>
+
+using example::WhisperRunner;
+using example::WhisperTranscribeConfig;
+
+WhisperRunner runner("model.pte", "model.ptd", "tokenizer.json");
+ET_CHECK_OK(runner.load());
+
+// `features` is the mel spectrogram tensor produced by the preprocessor.
+executorch::extension::TensorPtr features = load_features_somehow();
+
+WhisperTranscribeConfig config;
+config.max_new_tokens = 128; // stop after 128 generated tokens
+config.temperature = 0.7f;   // optional: enable stochastic sampling
+
+auto tokens_result = runner.transcribe(
+    features,
+    config,
+    [](const std::string& piece) {
+      std::cout << piece;
+    });
+
+if (!tokens_result.ok()) {
+  ET_LOG(Error, "Transcription failed: %d", static_cast<int>(tokens_result.error()));
+}
+```
+
+`transcribe()` returns the full token history (prompt + generated tokens) and
+invokes the callback every time a new token is emitted. For an end-to-end
+check from the shell, the `main.cpp` driver in this directory exposes the same
+flow behind gflags; see the example invocation below.
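+
+The invocation below is a sketch: the flag names mirror the gflags defined in
+`main.cpp`, while the binary location and file paths are illustrative
+assumptions.
+
+```bash
+cmake-out/examples/models/whisper/whisper_runner \
+  --model_path model.pte \
+  --data_path model.ptd \
+  --tokenizer_path /path/to/tokenizer_dir \
+  --preprocessor_path preprocessor.pte \
+  --audio_path input.wav \
+  --max_new_tokens 128 \
+  --temperature 0.0
+```
+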
Provide a non-empty +`decoder_input_ids` vector if you want to seed the decoder with a custom prompt, +and override `WhisperTranscribeConfig::eos_token_ids` when the model exposes +custom termination ids. diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp new file mode 100644 index 00000000000..76d35a19c43 --- /dev/null +++ b/examples/models/whisper/main.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(__has_include) +#if __has_include() +#define ET_GEMMA3_HAS_CUDA_RUNTIME 1 +#include +#else +#define ET_GEMMA3_HAS_CUDA_RUNTIME 0 +#endif +#else +#define ET_GEMMA3_HAS_CUDA_RUNTIME 0 +#endif + +#include +#include +#include +#include +#include +#include +#include + +DEFINE_string(model_path, "model.pte", "Path to Whisper model (.pte)."); +DEFINE_string(data_path, "", "Optional path to Whisper weights (.ptd)."); +DEFINE_string( + tokenizer_path, + ".", + "Path to tokenizer directory containing tokenizer.json, tokenizer_config.json, and special_tokens_map.json."); +DEFINE_string( + preprocessor_path, + "", + "Path to preprocessor .pte for converting raw audio."); +DEFINE_string( + audio_path, + "", + "Path to input audio file. Accepts .wav or raw float .bin."); +DEFINE_double( + temperature, + 0.0, + "Sampling temperature. 0.0 performs greedy decoding."); +DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate."); + +namespace { + +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; + +#if ET_GEMMA3_HAS_CUDA_RUNTIME +class CudaMemoryTracker { + public: + CudaMemoryTracker() { + if (!query(&last_free_bytes_, &total_bytes_)) { + return; + } + available_ = true; + min_free_bytes_ = last_free_bytes_; + log_state("startup", last_free_bytes_, total_bytes_); + } + + void log_sample(const char* tag) { + if (!available_) { + return; + } + size_t free_bytes = 0; + size_t total_bytes = 0; + if (!query(&free_bytes, &total_bytes)) { + return; + } + min_free_bytes_ = std::min(min_free_bytes_, free_bytes); + total_bytes_ = total_bytes; + last_free_bytes_ = free_bytes; + log_state(tag, free_bytes, total_bytes); + } + + ~CudaMemoryTracker() { + if (!available_) { + return; + } + size_t free_bytes = 0; + size_t total_bytes = 0; + if (!query(&free_bytes, &total_bytes)) { + return; + } + min_free_bytes_ = std::min(min_free_bytes_, free_bytes); + total_bytes_ = total_bytes; + last_free_bytes_ = free_bytes; + const double peak_mb = + static_cast(total_bytes_ - min_free_bytes_) / (1024.0 * 1024.0); + const double total_mb = + static_cast(total_bytes_) / (1024.0 * 1024.0); + std::cout << "CUDA memory peak usage: " << peak_mb + << " MB, total: " << total_mb << " MB" << std::endl; + } + + private: + bool query(size_t* free_bytes, size_t* total_bytes) { + cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes); + if (err != cudaSuccess) { + if (!error_logged_) { + error_logged_ = true; + std::cerr << "Warning: cudaMemGetInfo failed with error: " + << cudaGetErrorString(err) << std::endl; + } + available_ = false; + return false; + } + return true; + } + + void log_state(const char* tag, size_t free_bytes, size_t total_bytes) const { + const double used_mb = + static_cast(total_bytes - free_bytes) / (1024.0 * 1024.0); + const double free_mb = 
static_cast(free_bytes) / (1024.0 * 1024.0); + const double total_mb = + static_cast(total_bytes) / (1024.0 * 1024.0); + std::cout << "CUDA memory (" << tag << "): used " << used_mb << " MB, free " + << free_mb << " MB, total " << total_mb << " MB" << std::endl; + } + + bool available_{false}; + bool error_logged_{false}; + size_t last_free_bytes_{0}; + size_t total_bytes_{0}; + size_t min_free_bytes_{std::numeric_limits::max()}; +}; +#else +class CudaMemoryTracker { + public: + CudaMemoryTracker() = default; + void log_sample(const char* tag) { + (void)tag; + } +}; +#endif + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + CudaMemoryTracker cuda_memory_tracker; + ::executorch::extension::TensorPtr features; + std::vector audio_data; + std::unique_ptr processor; + + if (FLAGS_audio_path.empty()) { + ET_LOG(Error, "audio_path flag must be provided."); + return 1; + } + + audio_data = + executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path); + ET_LOG( + Info, + "First 2 values of audio data: %f, %f", + audio_data[0], + audio_data[1]); + + processor = + std::make_unique(FLAGS_preprocessor_path, Module::LoadMode::Mmap); + auto load_error = processor->load(); + if (load_error != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load preprocessor module."); + return 1; + } + + auto audio_tensor = from_blob( + audio_data.data(), + {static_cast<::executorch::aten::SizesType>(audio_data.size())}, + ::executorch::aten::ScalarType::Float); + + auto processed_result = processor->execute("forward", audio_tensor); + if (processed_result.error() != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Audio preprocessing failed."); + return 1; + } + auto outputs = std::move(processed_result.get()); + if (outputs.empty() || !outputs[0].isTensor()) { + ET_LOG(Error, "Preprocessor returned unexpected outputs."); + return 1; + } + auto tensor = outputs[0].toTensor(); + ET_LOG( + Info, + "Result scalar_type: %s, first value %f", + ::executorch::runtime::toString(tensor.scalar_type()), + tensor.mutable_data_ptr()[0]); + features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor)); + + example::WhisperRunner runner( + FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path); + auto load_err = runner.load(); + if (load_err != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load Whisper model."); + return 1; + } + cuda_memory_tracker.log_sample("post-runner-load"); + + example::WhisperTranscribeConfig config; + config.max_new_tokens = FLAGS_max_new_tokens; + config.temperature = static_cast(FLAGS_temperature); + + std::string transcript; + auto result = + runner.transcribe(features, config, [&](const std::string& piece) { + ::executorch::extension::llm::safe_printf(piece.c_str()); + fflush(stdout); + }); + cuda_memory_tracker.log_sample("post-transcribe"); + + if (!result.ok()) { + ET_LOG(Error, "Transcription failed."); + return 1; + } + + return 0; +} diff --git a/examples/models/whisper/runner.cpp b/examples/models/whisper/runner.cpp new file mode 100644 index 00000000000..c1caa0a6349 --- /dev/null +++ b/examples/models/whisper/runner.cpp @@ -0,0 +1,323 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace example { +namespace { + +constexpr const char* kEncoderMethodName = "encoder"; +constexpr const char* kDecoderMethodName = "text_decoder"; +constexpr int64_t kDecoderStartTokenId = 50257; + +} // namespace + +WhisperRunner::WhisperRunner( + std::string module_path, + std::string data_path, + std::string tokenizer_path) + : module_path_(std::move(module_path)), + data_path_(std::move(data_path)), + tokenizer_path_(std::move(tokenizer_path)) { + if (data_path_.empty()) { + module_ = std::make_unique(module_path_, Module::LoadMode::Mmap); + } else { + module_ = std::make_unique( + module_path_, data_path_, Module::LoadMode::Mmap); + } +} + +bool WhisperRunner::is_loaded() const { + return module_ && encoder_method_loaded_ && decoder_method_loaded_ && + tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty(); +} + +Error WhisperRunner::ensure_tokenizer() { + if (tokenizer_ && tokenizer_->is_loaded()) { + return Error::Ok; + } + + auto tokenizer = load_tokenizer(tokenizer_path_); + if (!tokenizer) { + ET_LOG( + Error, "Failed to create tokenizer from %s", tokenizer_path_.c_str()); + return Error::Internal; + } + + tokenizer_ = std::move(tokenizer); + if (!tokenizer_->is_loaded()) { + ET_LOG( + Error, + "Tokenizer reported unloaded state after load: %s", + tokenizer_path_.c_str()); + return Error::Internal; + } + + eos_token_ids_.clear(); + eos_token_ids_.insert(static_cast(tokenizer_->eos_tok())); + return Error::Ok; +} + +const std::unordered_set& WhisperRunner::eos_token_ids() const { + return eos_token_ids_; +} + +Error WhisperRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + stats_.model_load_start_ms = ::executorch::extension::llm::time_in_ms(); + + ET_CHECK_OR_RETURN_ERROR( + module_ != nullptr, + InvalidArgument, + "Module handle is null for path %s", + module_path_.c_str()); + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load()); + + auto method_names_result = module_->method_names(); + ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error()); + const auto& method_names = method_names_result.get(); + + ET_CHECK_OR_RETURN_ERROR( + method_names.count(kEncoderMethodName) && + method_names.count(kDecoderMethodName), + InvalidArgument, + "Required methods not found. 
encoder=%d, text_decoder=%d", + static_cast(method_names.count(kEncoderMethodName)), + static_cast(method_names.count(kDecoderMethodName))); + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kEncoderMethodName)); + encoder_method_loaded_ = true; + + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName)); + decoder_method_loaded_ = true; + + ET_CHECK_OK_OR_RETURN_ERROR(ensure_tokenizer()); + auto eos_ids = get_eos_ids(tokenizer_.get(), module_.get()); + if (!eos_ids.empty()) { + eos_token_ids_.clear(); + for (uint64_t eos_id : eos_ids) { + eos_token_ids_.insert(static_cast(eos_id)); + } + } + + stats_.model_load_end_ms = ::executorch::extension::llm::time_in_ms(); + + return Error::Ok; +} + +Result> WhisperRunner::transcribe( + ::executorch::extension::TensorPtr preprocessed_features, + WhisperTranscribeConfig config, + std::function token_callback) { + ET_CHECK_OR_RETURN_ERROR( + config.max_new_tokens > 0, + InvalidArgument, + "max_new_tokens must be positive, got %" PRId64, + config.max_new_tokens); + + ET_LOG( + Info, + "Preprocessed features shape: [%zu, %zu, %zu]", + static_cast(preprocessed_features->size(0)), + static_cast(preprocessed_features->size(1)), + static_cast(preprocessed_features->size(2))); + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + ::executorch::extension::llm::get_rss_bytes() / 1024.0 / 1024.0); + + // Reset internal state and start inference + stats_.inference_start_ms = ::executorch::extension::llm::time_in_ms(); + + const std::unordered_set* eos_tokens = &eos_token_ids(); + if (!config.eos_token_ids.empty()) { + eos_tokens = &config.eos_token_ids; + } + ET_CHECK_OR_RETURN_ERROR( + !eos_tokens->empty(), + InvalidArgument, + "EOS token set must not be empty."); + ::executorch::extension::llm::Sampler sampler( + tokenizer_->vocab_size(), config.temperature); + + // Check expected dtype for encoder input + auto encoder_method_meta_result = module_->method_meta(kEncoderMethodName); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_method_meta_result.error()); + auto encoder_method_meta = encoder_method_meta_result.get(); + + ::executorch::aten::ScalarType expected_dtype = + ::executorch::aten::ScalarType::Float; + if (encoder_method_meta.num_inputs() > 0) { + auto input_meta_result = encoder_method_meta.input_tensor_meta(0); + if (input_meta_result.error() == ::executorch::runtime::Error::Ok) { + expected_dtype = input_meta_result.get().scalar_type(); + } + } + + // Convert preprocessed_features to expected dtype if needed + if (preprocessed_features->scalar_type() != expected_dtype) { + if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) { + ET_LOG( + Info, + "Converting audio features from %s to BFloat16. 
Before converting, first value = %f", + ::executorch::runtime::toString(preprocessed_features->scalar_type()), + preprocessed_features->mutable_data_ptr()[0]); + auto convert_result = ::executorch::extension::llm::convert_to_bfloat16( + preprocessed_features); + ET_CHECK_OK_OR_RETURN_ERROR(convert_result.error()); + preprocessed_features = convert_result.get(); + ET_LOG( + Info, + "Conversion complete, first value = %f", + static_cast( + preprocessed_features + ->mutable_data_ptr<::executorch::aten::BFloat16>()[0])); + } + } + + auto encoder_result = + module_->execute(kEncoderMethodName, preprocessed_features); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error()); + + stats_.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms(); + stats_.num_prompt_tokens = 0; + + auto encoder_outputs = std::move(*encoder_result); + ET_CHECK_OR_RETURN_ERROR( + encoder_outputs.size() == 1 && encoder_outputs[0].isTensor(), + Internal, + "Encoder returned %zu outputs; expected a single tensor.", + encoder_outputs.size()); + + ::executorch::aten::Tensor encoder_output_tensor = + std::move(encoder_outputs[0]).toTensor(); + + ET_LOG( + Info, + "Encoder output shape: [%zu, %zu, %zu]", + static_cast(encoder_output_tensor.size(0)), + static_cast(encoder_output_tensor.size(1)), + static_cast(encoder_output_tensor.size(2))); + ET_LOG( + Info, + "Encoder first value: %f", + static_cast( + encoder_output_tensor + .mutable_data_ptr<::executorch::aten::BFloat16>()[0])); + + auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>( + std::move(encoder_output_tensor)); + + std::vector tokens = {kDecoderStartTokenId}; + + int64_t input_id = kDecoderStartTokenId; + int64_t cache_position = 0; + int64_t generated_tokens = 0; + bool first_token_generated = false; + auto decoder_input_ptr = ::executorch::extension::from_blob( + &input_id, + {static_cast<::executorch::aten::SizesType>(1), + static_cast<::executorch::aten::SizesType>(1)}, + ::executorch::aten::ScalarType::Long); + + auto cache_position_ptr = ::executorch::extension::from_blob( + &cache_position, + {static_cast<::executorch::aten::SizesType>(1)}, + ::executorch::aten::ScalarType::Long); + + std::vector<::executorch::runtime::EValue> decoder_inputs; + decoder_inputs.reserve(3); + decoder_inputs.emplace_back(decoder_input_ptr); + decoder_inputs.emplace_back(encoder_output_ptr); + decoder_inputs.emplace_back(cache_position_ptr); + // Add some green coloring for the first generated token + token_callback("\033[1;32m"); + while (generated_tokens < config.max_new_tokens) { + input_id = tokens.back(); + auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs); + ET_CHECK_OK_OR_RETURN_ERROR(decoder_result.error()); + + auto decoder_outputs = std::move(*decoder_result); + ET_CHECK_OR_RETURN_ERROR( + decoder_outputs.size() == 1 && decoder_outputs[0].isTensor(), + Internal, + "Decoder returned %zu outputs; expected a single tensor.", + decoder_outputs.size()); + + ::executorch::aten::Tensor logits_tensor = + std::move(decoder_outputs[0]).toTensor(); + const int64_t vocab_size = logits_tensor.numel(); + ET_CHECK_OR_RETURN_ERROR( + vocab_size > 0, Internal, "Decoder logits tensor is empty."); + + const int64_t next_token = static_cast( + logits_to_token(logits_tensor, config.temperature)); + + if (!first_token_generated) { + stats_.first_token_ms = ::executorch::extension::llm::time_in_ms(); + first_token_generated = true; + } + + const int64_t prev_token = input_id; + tokens.push_back(next_token); + ++generated_tokens; + 
++cache_position; + input_id = next_token; + + if (token_callback) { + auto piece_result = tokenizer_->decode( + static_cast(prev_token), static_cast(next_token)); + if (piece_result.ok()) { + token_callback(piece_result.get()); + } else { + ET_LOG( + Error, + "Tokenizer failed to decode token pair (%" PRId64 ", %" PRId64 + ") with error %d", + prev_token, + next_token, + static_cast(piece_result.error())); + } + } + + if (eos_tokens->count(next_token) > 0) { + break; + } + } + // Reset coloring + token_callback("\033[0m"); + // Update stats and print report + stats_.num_generated_tokens = generated_tokens; + stats_.inference_end_ms = ::executorch::extension::llm::time_in_ms(); + printf("\n"); + print_report(stats_); + + return tokens; +} + +} // namespace example diff --git a/examples/models/whisper/runner.h b/examples/models/whisper/runner.h new file mode 100644 index 00000000000..c578e42601e --- /dev/null +++ b/examples/models/whisper/runner.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +using ::executorch::extension::Module; +using ::executorch::extension::llm::get_eos_ids; +using ::executorch::extension::llm::load_tokenizer; +using ::executorch::extension::llm::print_report; +using ::executorch::extension::llm::Sampler; +using ::executorch::extension::llm::Stats; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +/** + * Configuration for the Whisper transcription loop. + * + * max_new_tokens controls the number of tokens generated after the prompt. + * Temperature controls the randomness of the output. + */ +struct WhisperTranscribeConfig { + int64_t max_new_tokens = 128; + std::unordered_set eos_token_ids = {}; + float temperature = 0.0f; +}; + +/** + * Runner that owns a Whisper encoder + decoder pair exported as a single + * ExecuTorch module. + * + * The module is expected to expose two callable methods: + * - "encoder": processes precomputed audio features into encoder states. + * - "text_decoder": consumes the decoder input ids, encoder output and cache + * positions to autoregressively generate logits. + */ +class WhisperRunner { + public: + WhisperRunner( + std::string module_path, + std::string data_path, + std::string tokenizer_path); + + /** + * Returns true when the module and tokenizer are ready for inference. + */ + bool is_loaded() const; + + /** + * Loads the module, validates required methods and initialises tokenizer. + */ + ::executorch::runtime::Error load(); + + /** + * Executes an end-to-end transcription cycle. + * + * @param preprocessed_features Audio features already processed by a + * preprocessor module (see voxtral example). + * @param config Controls generation length and termination criteria. + * @param token_callback Optional functor invoked for each decoded piece of + * text emitted during generation. + * + * @returns Result containing the final decoder token ids (including the seed + * prompt and generated tokens), or an error. 
+ */ + ::executorch::runtime::Result> transcribe( + ::executorch::extension::TensorPtr preprocessed_features, + WhisperTranscribeConfig config = {}, + std::function token_callback = {}); + + private: + ::executorch::runtime::Error ensure_tokenizer(); + const std::unordered_set& eos_token_ids() const; + + /** + * Sample the next token from the logits tensor. + * @param logits_tensor The logits tensor. + * @param temperature The temperature parameter used to control randomness in + * sampling. + * @return The next token. + */ + inline int32_t logits_to_token( + const executorch::aten::Tensor& logits_tensor, + const float temperature = 0.0f) { + int32_t result = 0; + + // Create a minimal context for error handling in ET_SWITCH + struct { + [[noreturn]] void fail(torch::executor::Error /* error */) { + ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token"); + } + } ctx; + + ET_SWITCH_FOUR_TYPES( + Float, + Half, + BFloat16, + UInt16, + logits_tensor.scalar_type(), + ctx, + "logits_to_token", + CTYPE, + [&]() { + // If the logit_tensor rank is 3, the shape is [batch, seq_length, + // vocab_size], get the last logits, sample and return. Else the model + // outputs the last logit, directly sample and return. + auto* logits = logits_tensor.mutable_data_ptr(); + ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + if (logits_tensor.dim() == 3) { + auto num_tokens = logits_tensor.size(1); + logits += (num_tokens - 1) * vocab_size; + } + // @lint-ignore CLANGTIDY facebook-hte-Deprecated + Sampler sampler(vocab_size, temperature); + result = sampler.sample(logits); + }); + return result; + } + + std::string module_path_; + std::string data_path_; + std::string tokenizer_path_; + + std::unique_ptr module_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_set eos_token_ids_; + + bool encoder_method_loaded_ = false; + bool decoder_method_loaded_ = false; + + Stats stats_; +}; + +} // namespace example diff --git a/extension/llm/runner/wav_loader.h b/extension/llm/runner/wav_loader.h index f49a4d1723e..eba37947fff 100644 --- a/extension/llm/runner/wav_loader.h +++ b/extension/llm/runner/wav_loader.h @@ -168,18 +168,29 @@ inline std::vector load_wav_audio_data(const std::string& fp) { size_t data_offset = header->dataOffset; size_t data_size = header->Subchunk2Size; int bits_per_sample = header->bitsPerSample; + int audio_format = header->AudioFormat; std::vector audio_data; if (bits_per_sample == 32) { size_t num_samples = data_size / 4; audio_data.resize(num_samples); - const int32_t* input_buffer = - reinterpret_cast(data + data_offset); - for (size_t i = 0; i < num_samples; ++i) { - audio_data[i] = static_cast( - static_cast(input_buffer[i]) * kOneOverIntMax); + if (audio_format == 3) { + // IEEE float format - read directly as floats + const float* input_buffer = + reinterpret_cast(data + data_offset); + for (size_t i = 0; i < num_samples; ++i) { + audio_data[i] = input_buffer[i]; + } + } else { + // PCM integer format - normalize from int32 + const int32_t* input_buffer = + reinterpret_cast(data + data_offset); + for (size_t i = 0; i < num_samples; ++i) { + audio_data[i] = static_cast( + static_cast(input_buffer[i]) * kOneOverIntMax); + } } } else if (bits_per_sample == 16) { size_t num_samples = data_size / 2; diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index dab1a8ab176..9748a36d6e8 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -79,27 +79,27 @@ TensorPtr make_tensor_ptr( }); } } - 
std::vector<executorch::aten::StridesType> computed_strides(dim);
-
-  auto error = runtime::dim_order_to_stride(
-      sizes.data(), dim_order.data(), dim, computed_strides.data());
-  ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
-
-  if (!strides.empty()) {
-    for (size_t i = 0; i < dim; i++) {
-      ET_CHECK_MSG(
-          strides[i] == computed_strides[i] || sizes[i] == 1,
-          "invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
-          "!= %" ET_PRI_SIZES_AND_STRIDES
-          " while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
-          i,
-          strides[i],
-          computed_strides[i],
-          sizes[i]);
-    }
-  }
-
-  strides = std::move(computed_strides);
+  // std::vector<executorch::aten::StridesType> computed_strides(dim);
+
+  // auto error = runtime::dim_order_to_stride(
+  //     sizes.data(), dim_order.data(), dim, computed_strides.data());
+  // ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
+
+  // if (!strides.empty()) {
+  //   for (size_t i = 0; i < dim; i++) {
+  //     ET_CHECK_MSG(
+  //         strides[i] == computed_strides[i] || sizes[i] == 1,
+  //         "invalid strides for dim %zu: %" ET_PRI_SIZES_AND_STRIDES
+  //         "!= %" ET_PRI_SIZES_AND_STRIDES
+  //         " while its size is %" ET_PRI_SIZES_AND_STRIDES " != 1",
+  //         i,
+  //         strides[i],
+  //         computed_strides[i],
+  //         sizes[i]);
+  //   }
+  // }
+
+  // strides = std::move(computed_strides);
 #ifndef USE_ATEN_LIB
   executorch::aten::TensorImpl tensor_impl(