tetherto · GustavoA1604 · Apr 22, 2026 · Apr 20, 2026 · Apr 21, 2026 · Apr 22, 2026
@@ -5,10 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [0.8.5]
+
+Performance improvements for the Chatterbox TTS pipeline: reference-audio encoding is done once during `load()` and cached for every `synthesize()` call, the CFG multilingual path runs with a single batched KV cache instead of two separate sessions, and the ONNX Runtime intra-op thread count is now configurable at construction time. On a 4-core CPU the English q4 model drops from RTF ≈ 20.9 to ≈ 15.7 (~25% faster), and every `synthesize()` call sees the same per-call cost — no more first-call penalty.
+
+### Added
+
+- **`numThreads` option** on the `ONNXTTS` constructor. Configures `intraOpThreads` for all Chatterbox ONNX sessions (speech encoder, embed tokens, conditional decoder, language model). Defaults to `0`, which preserves the previous behavior (1 intra-op thread); values ≤ 0 or non-numeric strings also fall back to 1 thread. Setting it to e.g. `4` on a 4-core machine yields ~25% total RTF reduction (LM generation −22%, conditional decoder −30%, speech encoder −25%). Threaded through JS options → `AddonJs` (as string) → `TTSModel::createChatterboxConfig` → `ChatterboxConfig.numThreads` → `OnnxInferSession` constructor.
+- **Speech encoder output caching** in `ChatterboxEngine`. New `SpeechEncoderCache` struct stores audio features, prompt tokens, speaker embeddings, and speaker features produced from the reference audio. The encoder runs once during `load()` and every subsequent `synthesize()` call reuses the cached outputs, so per-call latency is uniform instead of spiking by ~2.7s on the first call. Cache is cleared on `unload()`.
+- **Per-phase timing instrumentation** for Chatterbox. `synthesize()` now emits `QLOG(INFO)` lines with measured durations for speech encoder, LM generation, conditional decoder, and total, so RTF breakdowns can be read straight from logs.
+- **`tensor_ops::concatBatch` / `tensor_ops::duplicateBatch`** tensor helpers in `ChatterboxEngine.hpp`, plus unit tests in `ChatterboxEngineMethodsTest.cpp`.
 
 ### Changed
 
+- **Chatterbox CFG multilingual path** now runs conditional and unconditional branches as a single batched ONNX session call (`[2N, ...]` batch) with one shared KV cache, replacing the previous pair of separate session calls and two KV caches. `generateSpeechTokensWithCfg`, `runInitialCfgStep`, `runCfgGenerationLoop`, and `initEmptyKvCache` were refactored accordingly. Pure performance change — math and output are bit-identical to the pre-batched path.
+- **`prepareCfgEmbeddings`** now prepends audio features via `reserve` + single-pass build instead of `std::vector::insert(begin, ...)`, eliminating O(n) element shifting for every embedding vector built per CFG step.
+- **`trimPromptFromWaveform`** uses `std::move` + `resize` instead of `std::vector::erase(begin, begin + N)`, avoiding an unnecessary copy of the full waveform on every `synthesize()`.
 - Fixed a bug where using the multilingual model for English inference bypassed the model configuration and allowed input tokens to leak into the output
 
 ### Fixed

@@ -75,6 +75,7 @@ getTTSConfigMap(js_env_t *env, js::Object configurationParams) {
   addBool("useGPU");
   addBool("lazySessionLoading");
   addBool("supertonicMultilingual");
+  addString("numThreads");
 
   // LavaSR enhancement config
   addBool("enhance");

@@ -17,6 +17,45 @@ template <typename T> struct TensorData {
   std::vector<T> data;
 };
 
+namespace tensor_ops {
+
+// Joins two tensors along the batch dimension (axis 0): the result holds all
+// of `a`'s elements followed by all of `b`'s.
+// Precondition (not checked): a.shape[i] == b.shape[i] for every i > 0.
+// Both tensors must have at least one dimension; a rank-0 operand raises
+// std::out_of_range (via std::vector::at) instead of indexing out of bounds.
+// Result shape: [a.shape[0] + b.shape[0], ...rest of a.shape].
+template <typename T>
+TensorData<T> concatBatch(const TensorData<T> &a, const TensorData<T> &b) {
+  // Read both batch sizes with bounds checking before building the result.
+  const auto batchTotal = a.shape.at(0) + b.shape.at(0);
+
+  TensorData<T> out;
+  out.shape = a.shape;
+  out.shape[0] = batchTotal;
+  out.data.reserve(a.data.size() + b.data.size());
+  out.data.insert(out.data.end(), a.data.begin(), a.data.end());
+  out.data.insert(out.data.end(), b.data.begin(), b.data.end());
+  return out;
+}
+
+// Doubles a tensor along the batch dimension (axis 0): input [N, ...] becomes
+// [2N, ...], with the input's data laid out twice back to back.
+// The tensor must have at least one dimension; a rank-0 input raises
+// std::out_of_range (via std::vector::at) instead of indexing out of bounds.
+template <typename T> TensorData<T> duplicateBatch(const TensorData<T> &a) {
+  // Bounds-checked read of the batch size before the result is built.
+  const auto doubledBatch = a.shape.at(0) * 2;
+
+  TensorData<T> out;
+  out.shape = a.shape;
+  out.shape[0] = doubledBatch;
+  out.data.reserve(a.data.size() * 2);
+  for (int copy = 0; copy < 2; ++copy) {
+    out.data.insert(out.data.end(), a.data.begin(), a.data.end());
+  }
+  return out;
+}
+
+} // namespace tensor_ops
+
+// Holds the four speech encoder outputs kept for reuse between calls, plus a
+// flag recording whether they are populated.
+struct SpeechEncoderCache {
+  // Cached audio feature tensor.
+  TensorData<float> audioFeatures;
+  // Cached prompt token tensor.
+  TensorData<int64_t> promptToken;
+  // Cached speaker embedding tensor.
+  TensorData<float> speakerEmbeddings;
+  // Cached speaker feature tensor.
+  TensorData<float> speakerFeatures;
+  // False by default, i.e. a new cache starts out unpopulated.
+  bool valid{false};
+};
+
 class ChatterboxEngine : public IChatterboxEngine {
 protected:
   // Only for testing
@@ -91,6 +130,13 @@ class ChatterboxEngine : public IChatterboxEngine {
   void ensureSession(std::unique_ptr<IOnnxInferSession> &session,
                      const std::string &modelPath);
   void releaseSession(std::unique_ptr<IOnnxInferSession> &session);
+  void runSpeechEncoderAndCache();
+
+protected:
+  bool hasSpeechEncoderCache() const;
+  void clearSpeechEncoderCache();
+
+private:
   void loadCangjieTableIfNeeded(const std::string &tokenizerPath);
   void loadTextEmbWeight(const std::string &embedTokensPath);
 
@@ -109,11 +155,11 @@ class ChatterboxEngine : public IChatterboxEngine {
   int64_t runInitialCfgStep(
       const TensorData<float> &condEmbs, const TensorData<float> &uncondEmbs,
       TensorData<int64_t> &positionIds, TensorData<int64_t> &attentionMask,
-      std::unordered_map<std::string, TensorData<float>> &condKv,
-      std::unordered_map<std::string, TensorData<float>> &uncondKv,
+      std::unordered_map<std::string, TensorData<float>> &batchedKv,
       std::vector<int64_t> &generatedTokens);
 
-  std::unordered_map<std::string, TensorData<float>> initEmptyKvCache();
+  std::unordered_map<std::string, TensorData<float>>
+  initEmptyKvCache(int64_t batchSize = 1);
 
   void collectKvShapes(
       std::vector<std::vector<int64_t>> &inputShapes,
@@ -135,8 +181,7 @@ class ChatterboxEngine : public IChatterboxEngine {
   void runCfgGenerationLoop(
       std::vector<int64_t> &generatedTokens, TensorData<int64_t> &positionIds,
       TensorData<int64_t> &attentionMask,
-      std::unordered_map<std::string, TensorData<float>> &condKv,
-      std::unordered_map<std::string, TensorData<float>> &uncondKv,
+      std::unordered_map<std::string, TensorData<float>> &batchedKv,
       int maxSpeechTokens);
 
   std::vector<int64_t> generateSpeechTokensWithCfg(
@@ -161,6 +206,9 @@ class ChatterboxEngine : public IChatterboxEngine {
   int64_t textEmbRows_ = 0;
   int64_t textEmbDim_ = 0;
   std::mt19937 rng_{std::random_device{}()};
+
+protected:
+  SpeechEncoderCache speechEncoderCache_;
 };
 
 } // namespace qvac::ttslib::chatterbox
@@ -18,6 +18,7 @@ struct ChatterboxConfig {
   std::string languageModelPath;
   bool lazySessionLoading = false;
   bool useGPU = false;
+  int numThreads = 0;
 };
 
 class IChatterboxEngine {

@@ -77,12 +77,13 @@ OrtElementType onnxTypeToOurType(ONNXTensorElementDataType onnxType) {
 
 } // namespace
 
-OnnxInferSession::OnnxInferSession(const std::string &modelPath, bool useGPU) {
+OnnxInferSession::OnnxInferSession(const std::string &modelPath, bool useGPU,
+                                   int numThreads) {
   onnx_addon::SessionConfig sessionCfg;
   sessionCfg.provider = useGPU ? onnx_addon::ExecutionProvider::AUTO_GPU
                                : onnx_addon::ExecutionProvider::CPU;
   sessionCfg.optimization = onnx_addon::GraphOptimizationLevel::EXTENDED;
-  sessionCfg.intraOpThreads = 1;
+  sessionCfg.intraOpThreads = numThreads > 0 ? numThreads : 1;
 
   Ort::SessionOptions options = onnx_addon::buildSessionOptions(sessionCfg);
 

@@ -8,8 +8,8 @@ namespace qvac::ttslib::chatterbox {
 
 class OnnxInferSession : public IOnnxInferSession {
 public:
-  explicit OnnxInferSession(const std::string &modelPath,
-                            bool useGPU = false);
+  explicit OnnxInferSession(const std::string &modelPath, bool useGPU = false,
+                            int numThreads = 0);
   ~OnnxInferSession() override = default;
 
   void run() override;

@@ -114,6 +114,14 @@ qvac::ttslib::chatterbox::ChatterboxConfig TTSModel::createChatterboxConfig(
     config.useGPU = gpuIt->second == "true";
   }
 
+  auto threadsIt = configMap.find("numThreads");
+  if (threadsIt != configMap.end() && !threadsIt->second.empty()) {
+    try {
+      config.numThreads = std::stoi(threadsIt->second);
+    } catch (...) {
+    }
+  }
+
   std::stringstream ss;
   ss << "Chatterbox config values: language='" << config.language << "'"
      << "' referenceAudio.size()=" << config.referenceAudio.size()

@@ -14,8 +14,12 @@ class TestableChatterboxEngine : public ChatterboxEngine {
   using ChatterboxEngine::advancePositionIds;
   using ChatterboxEngine::assembleSpeechTokenSequence;
   using ChatterboxEngine::buildInitialPositionIds;
+  using ChatterboxEngine::clearSpeechEncoderCache;
   using ChatterboxEngine::convertToAudioResult;
+  using ChatterboxEngine::hasSpeechEncoderCache;
   using ChatterboxEngine::selectNextToken;
+
+  SpeechEncoderCache &getMutableCache() { return speechEncoderCache_; }
 };
 
 class BuildInitialPositionIdsTest : public ::testing::Test {
@@ -229,4 +233,143 @@ TEST_F(SelectNextTokenTest, appliesRepetitionPenalty) {
   EXPECT_EQ(token, 2);
 }
 
+class SpeechEncoderCacheTest : public ::testing::Test { // fixture shared by the speech encoder cache tests
+protected:
+  TestableChatterboxEngine engine_; // test subclass that exposes the cache helpers and the cache itself
+};
+
+// A freshly constructed engine must not report a speech encoder cache.
+TEST_F(SpeechEncoderCacheTest, cacheIsInitiallyInvalid) {
+  const bool hasCache = engine_.hasSpeechEncoderCache();
+  EXPECT_FALSE(hasCache);
+}
+
+// Filling every cached tensor and raising the valid flag must make the engine
+// report that it has a cache.
+TEST_F(SpeechEncoderCacheTest, cacheBecomesValidWhenPopulated) {
+  SpeechEncoderCache &populated = engine_.getMutableCache();
+
+  populated.audioFeatures.shape = {1, 2, 1};
+  populated.audioFeatures.data = {1.0f, 2.0f};
+
+  populated.promptToken.shape = {1, 2};
+  populated.promptToken.data = {100, 200};
+
+  populated.speakerEmbeddings.shape = {1, 1};
+  populated.speakerEmbeddings.data = {0.5f};
+
+  populated.speakerFeatures.shape = {1, 1};
+  populated.speakerFeatures.data = {0.3f};
+
+  populated.valid = true;
+
+  const bool hasCache = engine_.hasSpeechEncoderCache();
+  EXPECT_TRUE(hasCache);
+}
+
+// clearSpeechEncoderCache() must drop both the valid flag and the stored audio
+// feature data.
+TEST_F(SpeechEncoderCacheTest, clearCacheResetsValidity) {
+  {
+    // Seed the cache so there is something to clear.
+    SpeechEncoderCache &seeded = engine_.getMutableCache();
+    seeded.audioFeatures.data = {1.0f};
+    seeded.valid = true;
+  }
+
+  engine_.clearSpeechEncoderCache();
+
+  const SpeechEncoderCache &cleared = engine_.getMutableCache();
+  EXPECT_FALSE(engine_.hasSpeechEncoderCache());
+  EXPECT_TRUE(cleared.audioFeatures.data.empty());
+}
+
+// A default-constructed cache is flagged invalid and carries no tensor data.
+TEST_F(SpeechEncoderCacheTest, defaultCacheStructHasEmptyData) {
+  SpeechEncoderCache fresh;
+
+  EXPECT_FALSE(fresh.valid);
+
+  const auto &audioData = fresh.audioFeatures.data;
+  const auto &promptData = fresh.promptToken.data;
+  const auto &embeddingData = fresh.speakerEmbeddings.data;
+  const auto &featureData = fresh.speakerFeatures.data;
+  EXPECT_TRUE(audioData.empty());
+  EXPECT_TRUE(promptData.empty());
+  EXPECT_TRUE(embeddingData.empty());
+  EXPECT_TRUE(featureData.empty());
+}
+
+class TensorOpsConcatBatchTest : public ::testing::Test {}; // stateless fixture grouping the tensor_ops::concatBatch tests
+
+// Two [1, 2, 3] float tensors concatenate into one [2, 2, 3] tensor whose data
+// is the first operand's values followed by the second's.
+TEST_F(TensorOpsConcatBatchTest, concatenatesFloatTensorsAlongBatchDim) {
+  TensorData<float> first;
+  first.data = {1, 2, 3, 4, 5, 6};
+  first.shape = {1, 2, 3};
+
+  TensorData<float> second;
+  second.data = {7, 8, 9, 10, 11, 12};
+  second.shape = {1, 2, 3};
+
+  const TensorData<float> joined = tensor_ops::concatBatch(first, second);
+
+  const std::vector<int64_t> expectedShape{2, 2, 3};
+  const std::vector<float> expectedData{1, 2, 3, 4,  5,  6,
+                                        7, 8, 9, 10, 11, 12};
+  EXPECT_EQ(joined.shape, expectedShape);
+  EXPECT_EQ(joined.data, expectedData);
+}
+
+// Same concatenation contract as the float case, exercised with int64_t data:
+// two [1, 3] tensors become one [2, 3] tensor.
+TEST_F(TensorOpsConcatBatchTest, concatenatesInt64TensorsAlongBatchDim) {
+  TensorData<int64_t> first;
+  first.data = {10, 20, 30};
+  first.shape = {1, 3};
+
+  TensorData<int64_t> second;
+  second.data = {40, 50, 60};
+  second.shape = {1, 3};
+
+  const TensorData<int64_t> joined = tensor_ops::concatBatch(first, second);
+
+  const std::vector<int64_t> expectedShape{2, 3};
+  const std::vector<int64_t> expectedData{10, 20, 30, 40, 50, 60};
+  EXPECT_EQ(joined.shape, expectedShape);
+  EXPECT_EQ(joined.data, expectedData);
+}
+
+// Operands with a zero-length dimension (and therefore no data) still get
+// their batch sizes summed, and the result stays empty.
+TEST_F(TensorOpsConcatBatchTest, handlesEmptySequenceDimension) {
+  TensorData<float> first;
+  TensorData<float> second;
+  first.shape = {1, 16, 0, 64};
+  second.shape = {1, 16, 0, 64};
+
+  const TensorData<float> joined = tensor_ops::concatBatch(first, second);
+
+  const std::vector<int64_t> expectedShape{2, 16, 0, 64};
+  EXPECT_EQ(joined.shape, expectedShape);
+  EXPECT_TRUE(joined.data.empty());
+}
+
+// Operands whose batch size is already greater than one are joined as-is:
+// [2, 1] plus [3, 1] gives [5, 1].
+TEST_F(TensorOpsConcatBatchTest, preservesMultiBatchInput) {
+  TensorData<float> twoRows;
+  twoRows.data = {1, 2};
+  twoRows.shape = {2, 1};
+
+  TensorData<float> threeRows;
+  threeRows.data = {3, 4, 5};
+  threeRows.shape = {3, 1};
+
+  const TensorData<float> joined = tensor_ops::concatBatch(twoRows, threeRows);
+
+  const std::vector<int64_t> expectedShape{5, 1};
+  const std::vector<float> expectedData{1, 2, 3, 4, 5};
+  EXPECT_EQ(joined.shape, expectedShape);
+  EXPECT_EQ(joined.data, expectedData);
+}
+
+class TensorOpsDuplicateBatchTest : public ::testing::Test {}; // stateless fixture grouping the tensor_ops::duplicateBatch tests
+
+// Duplicating a [1, 2, 2] float tensor yields [2, 2, 2] with the original
+// values repeated twice in order.
+TEST_F(TensorOpsDuplicateBatchTest, doublesBatchDimForFloat) {
+  TensorData<float> source;
+  source.data = {1.5f, 2.5f, 3.5f, 4.5f};
+  source.shape = {1, 2, 2};
+
+  const TensorData<float> doubled = tensor_ops::duplicateBatch(source);
+
+  const std::vector<int64_t> expectedShape{2, 2, 2};
+  const std::vector<float> expectedData{1.5f, 2.5f, 3.5f, 4.5f,
+                                        1.5f, 2.5f, 3.5f, 4.5f};
+  EXPECT_EQ(doubled.shape, expectedShape);
+  EXPECT_EQ(doubled.data, expectedData);
+}
+
+// Duplicating a [1, 4] int64_t tensor of ones yields a [2, 4] tensor of ones.
+TEST_F(TensorOpsDuplicateBatchTest, doublesBatchDimForInt64) {
+  TensorData<int64_t> source;
+  source.data = {1, 1, 1, 1};
+  source.shape = {1, 4};
+
+  const TensorData<int64_t> doubled = tensor_ops::duplicateBatch(source);
+
+  const std::vector<int64_t> expectedShape{2, 4};
+  const std::vector<int64_t> expectedData{1, 1, 1, 1, 1, 1, 1, 1};
+  EXPECT_EQ(doubled.shape, expectedShape);
+  EXPECT_EQ(doubled.data, expectedData);
+}
+
+// A tensor with a zero-length dimension (no data) still has its batch size
+// doubled, and the result stays empty.
+TEST_F(TensorOpsDuplicateBatchTest, preservesEmptyPastSequence) {
+  TensorData<float> source;
+  source.shape = {1, 16, 0, 64};
+
+  const TensorData<float> doubled = tensor_ops::duplicateBatch(source);
+
+  const std::vector<int64_t> expectedShape{2, 16, 0, 64};
+  EXPECT_EQ(doubled.shape, expectedShape);
+  EXPECT_TRUE(doubled.data.empty());
+}
+
 } // namespace qvac::ttslib::chatterbox::testing
@@ -108,6 +108,13 @@ declare interface ONNXTTSOptions {
   speed?: number
   numInferenceSteps?: number
   supertonicMultilingual?: boolean
+  /**
+   * Chatterbox: ONNX Runtime intra-op thread count for all sessions (speech encoder,
+   * embed tokens, language model, conditional decoder). `0` (default) preserves the
+   * previous behavior of 1 intra-op thread; higher values trade CPU for throughput
+   * (e.g. `4` yields ~25% lower RTF on a 4-core machine).
+   */
+  numThreads?: number
   opts?: object
   exclusiveRun?: boolean
 }

@@ -142,6 +142,7 @@ class ONNXTTS {
       speed,
       numInferenceSteps,
       supertonicMultilingual,
+      numThreads,
       opts,
       exclusiveRun
     } = options
@@ -241,6 +242,7 @@ class ONNXTTS {
         this._languageModelPath = normalizedFiles.languageModel
       }
       this._referenceAudio = referenceAudio
+      this._numThreads = numThreads != null ? numThreads : 0
     } else {
       this._modelDir = normalizedFiles.modelDir
       this._voiceName = voiceName ?? 'F1'
@@ -678,7 +680,8 @@ class ONNXTTS {
         languageModelPath: this._languageModelPath || '',
         language: this._config?.language || 'en',
         useGPU: this._config?.useGPU || false,
-        lazySessionLoading: this._lazySessionLoading
+        lazySessionLoading: this._lazySessionLoading,
+        numThreads: String(this._numThreads || 0)
       }
       if (this._referenceAudio != null) {
         ttsParams.referenceAudio = this._referenceAudio
@@ -947,7 +950,8 @@ class ONNXTTS {
         languageModelPath: this._languageModelPath || '',
         language: this._config?.language || 'en',
         useGPU: this._config?.useGPU || false,
-        lazySessionLoading: this._lazySessionLoading
+        lazySessionLoading: this._lazySessionLoading,
+        numThreads: String(this._numThreads || 0)
       }
       if (this._referenceAudio != null) {
         ttsParams.referenceAudio = this._referenceAudio

@@ -1,6 +1,6 @@
 {
   "name": "@qvac/tts-onnx",
-  "version": "0.8.4",
+  "version": "0.8.5",
   "description": "Text to Speech (TTS) addon for qvac",
   "addon": true,
   "engines": {