13 changes: 13 additions & 0 deletions backends/cuda/cuda_partitioner.py
@@ -16,6 +16,7 @@
PartitionResult,
)
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
from torch.export.exported_program import ExportedProgram


@@ -56,6 +57,18 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
tag_constant_data(exported_program)
tag_mutated_buffer(exported_program)

# Tag constant placeholders that have no users.
# tag_constant_data only tags constants whose users already carry a
# delegation_tag, but every constant must be tagged for this partition.
for node in exported_program.graph.nodes:
if node.op == "placeholder" and (
is_param(exported_program, node)
or is_buffer(exported_program, node)
or is_lifted_tensor_constant(exported_program, node)
):
if "delegation_tag" not in node.meta:
node.meta["delegation_tag"] = tag

return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
)
97 changes: 97 additions & 0 deletions examples/models/whisper/CMakeLists.txt
@@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.24)
project(whisper_runner)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# gflags is built in the ExecuTorch tree under third-party; point gflags_DIR there so find_package can locate it
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

set(link_libraries executorch gflags)
set(_srcs runner.cpp main.cpp)

list(
APPEND
link_libraries
optimized_native_cpu_ops_lib
quantized_ops_lib
custom_ops
cpublas
eigen_blas
)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)

# XNNPACK
if(TARGET xnnpack_backend)
set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
if(TARGET kleidiai)
list(APPEND xnnpack_backend_libs kleidiai)
endif()
list(APPEND link_libraries ${xnnpack_backend_libs})
executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# Add LLM runner and extension module
if(NOT TARGET extension_llm_runner)
message(
FATAL_ERROR
"ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
)
endif()

# Needed for cpuinfo where it uses android specific log lib
if(ANDROID)
list(APPEND link_libraries log)
endif()

# Add the required ExecuTorch extensions for multimodal LLM runner
list(
APPEND
link_libraries
extension_llm_runner
extension_module
extension_data_loader
extension_tensor
extension_flat_tensor
)

# Link CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda)
executorch_target_link_options_shared_lib(aoti_cuda)
endif()

if(EXECUTORCH_BUILD_METAL)
list(APPEND link_libraries metal_backend)
executorch_target_link_options_shared_lib(metal_backend)
endif()

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(whisper_runner ${_srcs})

target_include_directories(whisper_runner PUBLIC ${_common_include_directories})

target_link_libraries(whisper_runner PUBLIC ${link_libraries})
target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
69 changes: 69 additions & 0 deletions examples/models/whisper/README.md
@@ -0,0 +1,69 @@
# Whisper Runner

This directory hosts a lightweight C++ helper that drives Whisper models
exported to ExecuTorch. The `WhisperRunner` owns the `Module` instance that
wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
`encoder` and `text_decoder` methods, and exposes a `transcribe()` loop that
streams decoded text pieces through a callback.

The runner assumes (see the verification sketch after this list):
- `model.pte` contains both Whisper encoder and decoder entry points named
`encoder` and `text_decoder`.
- External parameters (for example KV cache blocks) are stored in a companion
`model.ptd`.
- A tokenizer JSON compatible with the ExecuTorch tokenizers shim is available.
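
A quick way to confirm an exported `.pte` satisfies these assumptions is to list
its entry points with the ExecuTorch `Module` API. The sketch below is
illustrative only; the file name is a placeholder.

```cpp
#include <executorch/extension/module/module.h>

#include <iostream>

int main() {
  // The constructor does not load anything yet; method_names() loads the
  // program and reports its exported entry points.
  executorch::extension::Module program("model.pte");

  // The runner expects both "encoder" and "text_decoder" to be present.
  auto names = program.method_names();
  if (!names.ok()) {
    std::cerr << "Failed to read method names from model.pte\n";
    return 1;
  }
  for (const auto& name : *names) {
    std::cout << name << "\n";
  }
  return 0;
}
```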

Audio preprocessing is not part of the runner itself. To transform raw audio
into the mel features expected by the encoder, reuse the pattern in
`examples/models/voxtral/multimodal.cpp`, which loads a `preprocessor.pte`
module to generate the spectrogram tensor.
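
For reference, a minimal sketch of that pattern using the ExecuTorch `Module`
and `TensorPtr` APIs is shown below. The `forward` method name, the mono
float32 input layout, and the helper name are assumptions modeled on the
voxtral example, not part of this runner.

```cpp
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor_ptr.h>

#include <vector>

using executorch::extension::Module;
using executorch::extension::from_blob;

// Sketch: turn raw PCM samples into the mel-spectrogram tensor the encoder
// expects by running a separately exported preprocessor.pte.
executorch::runtime::Result<std::vector<executorch::runtime::EValue>>
compute_mel_features(Module& preprocessor, std::vector<float>& pcm) {
  // Wrap the audio buffer in a 1-D float tensor without copying it.
  auto audio = from_blob(
      pcm.data(),
      {static_cast<executorch::aten::SizesType>(pcm.size())});
  // The first output EValue holds the spectrogram tensor to pass to
  // WhisperRunner::transcribe() as `features`.
  return preprocessor.execute("forward", *audio);
}
```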

## Build

```bash
cmake -G Ninja \
-B cmake-out/examples/models/whisper \
-S examples/models/whisper
cmake --build cmake-out/examples/models/whisper -j
```

The build produces a `whisper_runner` executable that links in the standard
ExecuTorch runtime libraries and the tokenizer target (`tokenizers::tokenizers`).
To embed the runner in your own application, compile `runner.cpp`/`runner.h`
into it and link the same dependencies.

## Usage

```cpp
#include <executorch/examples/models/whisper/runner.h>
#include <executorch/extension/tensor/tensor_ptr.h>

using example::WhisperRunner;
using example::WhisperTranscribeConfig;

WhisperRunner runner("model.pte", "model.ptd", "tokenizer.json");
ET_CHECK_OK(runner.load());

// `features` is the mel spectrogram tensor produced by the preprocessor.
executorch::aten::Tensor features = load_features_somehow();

WhisperTranscribeConfig config;
config.max_new_tokens = 128; // stop after 128 generated tokens
config.temperature = 0.7f; // optional: enable stochastic sampling

auto tokens_result = runner.transcribe(
features,
config,
[](const std::string& piece) {
std::cout << piece;
});

if (!tokens_result.ok()) {
ET_LOG(Error, "Transcription failed: %d", static_cast<int>(tokens_result.error()));
}
```

`transcribe()` returns the full token history (prompt + generated tokens) and
invokes the callback every time a new token is emitted. Provide a non-empty
`decoder_input_ids` vector if you want to seed the decoder with a custom prompt,
and override `WhisperTranscribeConfig::eos_token_ids` when the model exposes
custom termination ids.
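
As a rough illustration (the field names follow the description above, but
their exact types and the token ids shown are assumptions):

```cpp
WhisperTranscribeConfig config;

// Placeholder ids; look up the real special-token ids in tokenizer.json.
config.decoder_input_ids = {1, 2, 3};  // e.g. start-of-transcript, language, task
config.eos_token_ids = {4};            // e.g. end-of-text
```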