diff --git a/packages/qvac-lib-infer-whispercpp/CHANGELOG.md b/packages/qvac-lib-infer-whispercpp/CHANGELOG.md index 49f197ae6d..812a2fb15d 100644 --- a/packages/qvac-lib-infer-whispercpp/CHANGELOG.md +++ b/packages/qvac-lib-infer-whispercpp/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.2] + +### Changed +- Fixed chunking issue re-introduced in 0.6.0 in which the inference output was not streamed but instead returned as a single batched result at the end. + +## [0.6.1] + +### Changed + +- Changed `@qvac/transcription-whispercpp` package visibility on NPM from private to public + ## [0.6.0] This release is a significant interface modernisation. The constructor switches to a local-files map, model download is removed from the load path, concurrent inference runs are serialised instead of rejected, and the class no longer extends `BaseInference`. @@ -52,6 +63,11 @@ When `exclusiveRun` is enabled (the default), a second call to `run()` or `runSt `TranscriptionWhispercppFiles` and `InferenceClientState` are now exported from the `TranscriptionWhispercpp` namespace. Lifecycle methods (`load`, `unload`, `destroy`, `cancel`, `pause`, `unpause`, `stop`, `status`, `getState`) are now explicitly declared in `index.d.ts`. +## [0.5.6] + +### Changed +- Fixed chunking issue introduced in 0.5.0 in which the inference output was not streamed but instead returned as a single batched result at the end. 
+ ## [0.5.5] ### Changed diff --git a/packages/qvac-lib-infer-whispercpp/addon/src/addon/AddonJs.hpp b/packages/qvac-lib-infer-whispercpp/addon/src/addon/AddonJs.hpp index d30029b11a..4492c3e6fd 100644 --- a/packages/qvac-lib-infer-whispercpp/addon/src/addon/AddonJs.hpp +++ b/packages/qvac-lib-infer-whispercpp/addon/src/addon/AddonJs.hpp @@ -155,7 +155,14 @@ inline js_value_t* runJob(js_env_t* env, js_callback_info_t* info) try { vector audioBytes = js::TypedArray(env, jsInput).as>(env); auto samples = WhisperModel::preprocessAudioData(audioBytes, audioFormat); - return instance.runJob(std::any(std::move(samples))); + + WhisperModel::AnyInput anyInput; + anyInput.input = std::move(samples); + anyInput.outputCallback = [&instance](const Transcript& transcript) { + instance.addonCpp->outputQueue->queueResult(std::any(transcript)); + }; + + return instance.runJob(std::any(std::move(anyInput))); } JSCATCH diff --git a/packages/qvac-lib-infer-whispercpp/addon/src/model-interface/whisper.cpp/WhisperModel.cpp b/packages/qvac-lib-infer-whispercpp/addon/src/model-interface/whisper.cpp/WhisperModel.cpp index 2b78ce06bd..bc84d158e1 100644 --- a/packages/qvac-lib-infer-whispercpp/addon/src/model-interface/whisper.cpp/WhisperModel.cpp +++ b/packages/qvac-lib-infer-whispercpp/addon/src/model-interface/whisper.cpp/WhisperModel.cpp @@ -389,6 +389,7 @@ std::any WhisperModel::process(const std::any& input) { if (shouldOverrideCallback) { on_segment_ = previousOutputCallback; + return Output{}; } return output_; diff --git a/packages/qvac-lib-infer-whispercpp/package.json b/packages/qvac-lib-infer-whispercpp/package.json index 0992a07e0e..5bfeb082b8 100644 --- a/packages/qvac-lib-infer-whispercpp/package.json +++ b/packages/qvac-lib-infer-whispercpp/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/transcription-whispercpp", - "version": "0.6.0", + "version": "0.6.2", "description": "transcription addon for qvac", "addon": true, "engines": { diff --git 
a/packages/qvac-lib-infer-whispercpp/test/integration/audio-ctx-chunking.test.js b/packages/qvac-lib-infer-whispercpp/test/integration/audio-ctx-chunking.test.js index 54650117d7..7869c02f7e 100644 --- a/packages/qvac-lib-infer-whispercpp/test/integration/audio-ctx-chunking.test.js +++ b/packages/qvac-lib-infer-whispercpp/test/integration/audio-ctx-chunking.test.js @@ -14,19 +14,21 @@ async function transcribeChunk (model, audioStream, offsetMs, durationMs, audioC } }) - // audioStream is provided by caller to avoid reading the whole file inside this function - const response = await model.run(audioStream) const results = [] + let updateCallCount = 0 + let maxBatchSize = 0 response.onUpdate((outputArr) => { + updateCallCount++ const items = Array.isArray(outputArr) ? outputArr : [outputArr] + if (items.length > maxBatchSize) maxBatchSize = items.length results.push(...items) }) await response.await() - return results + return { results, updateCallCount, maxBatchSize } } const { modelPath } = getTestPaths() @@ -99,6 +101,7 @@ test('Audio context chunking - 10 minute audio file with 30s chunks', { skip: is const allResults = [] let errorCount = 0 let chunksWithSegments = 0 + let batchedDeliveryCount = 0 // Process each chunk - always pass full audio, only change offset_ms, duration_ms, audio_ctx let currentOffsetSeconds = 0 @@ -110,9 +113,9 @@ test('Audio context chunking - 10 minute audio file with 30s chunks', { skip: is const fullAudioStream = createAudioStream(fullAudioBuffer) - let results = [] + let chunk = { results: [], updateCallCount: 0, maxBatchSize: 0 } try { - results = await transcribeChunk( + chunk = await transcribeChunk( model, fullAudioStream, currentOffsetSeconds * 1000, @@ -126,11 +129,16 @@ test('Audio context chunking - 10 minute audio file with 30s chunks', { skip: is currentOffsetSeconds += chunkDuration - if (results.length > 0) { - const text = results.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim() + if (chunk.results.length > 0) { + 
const text = chunk.results.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim() + console.log(` → segments=${chunk.results.length} updates=${chunk.updateCallCount} maxBatch=${chunk.maxBatchSize}`) console.log(` → ${text}\n`) - allResults.push(...results) + allResults.push(...chunk.results) chunksWithSegments++ + + if (chunk.results.length > 1 && chunk.updateCallCount === 1) { + batchedDeliveryCount++ + } } else { console.log(' → [no output]\n') } @@ -141,6 +149,7 @@ test('Audio context chunking - 10 minute audio file with 30s chunks', { skip: is console.log(`Total chunks processed: ${totalChunks}`) console.log(`Chunks with segments: ${chunksWithSegments}`) console.log(`Chunk errors: ${errorCount}`) + console.log(`Batched deliveries (regression): ${batchedDeliveryCount}`) console.log(`Duration processed: ${totalDurationSeconds.toFixed(1)}s`) // Assertions @@ -148,6 +157,11 @@ test('Audio context chunking - 10 minute audio file with 30s chunks', { skip: is t.is(chunksWithSegments, totalChunks, 'Should transcribe exactly totalChunks chunks') t.is(errorCount, 0, 'No chunk errors or exceptions') + t.is( + batchedDeliveryCount, 0, + 'Segments must be streamed incrementally (not all batched into a single onUpdate call)' + ) + // Verify segments have required properties if (allResults.length > 0) { const firstSegment = allResults[0] diff --git a/packages/qvac-lib-infer-whispercpp/test/unit/addon.inference.test.js b/packages/qvac-lib-infer-whispercpp/test/unit/addon.inference.test.js index 10eb05d288..d2619e07ba 100644 --- a/packages/qvac-lib-infer-whispercpp/test/unit/addon.inference.test.js +++ b/packages/qvac-lib-infer-whispercpp/test/unit/addon.inference.test.js @@ -117,7 +117,7 @@ test('Streaming transcript output preserves segment ordering', async (t) => { const outputEvents = events.filter(e => e.event === 'Output' && e.jobId === 1) t.alike( - outputEvents.map(e => e.output.text), + outputEvents.map(e => e.output[0].text), ['segment-0', 'segment-1', 'segment-2'], 
'Output segments should keep original ordering' ) diff --git a/packages/qvac-lib-infer-whispercpp/test/unit/vad.test.js b/packages/qvac-lib-infer-whispercpp/test/unit/vad.test.js index 9403d44344..2f45d73dc6 100644 --- a/packages/qvac-lib-infer-whispercpp/test/unit/vad.test.js +++ b/packages/qvac-lib-infer-whispercpp/test/unit/vad.test.js @@ -78,9 +78,10 @@ test('VAD mode processes audio with voice activity detection', async (t) => { if (outputEvents.length > 0) { t.ok(outputEvents[0].output, 'Should have transcription output') - t.is(typeof outputEvents[0].output, 'object', 'Output should be transcript object') - t.ok(outputEvents[0].output.text.includes('Mock transcription') || - outputEvents[0].output.text.includes('Silent audio detected'), + t.ok(Array.isArray(outputEvents[0].output), 'Output should be wrapped in array') + const transcript = outputEvents[0].output[0] + t.ok(transcript.text.includes('Mock transcription') || + transcript.text.includes('Silent audio detected'), 'Should contain mock transcription or silence detection text') } diff --git a/packages/qvac-lib-infer-whispercpp/whisper.js b/packages/qvac-lib-infer-whispercpp/whisper.js index b2d6bc658a..fd34a4ae85 100644 --- a/packages/qvac-lib-infer-whispercpp/whisper.js +++ b/packages/qvac-lib-infer-whispercpp/whisper.js @@ -90,6 +90,22 @@ class WhisperInterface { if (mappedEvent === 'Output') { this._setState(state.PROCESSING) + if (this._outputCb != null) { + const isTranscriptArray = Array.isArray(data) && data.length > 0 && + typeof data[0]?.text === 'string' + const isSingleTranscript = !Array.isArray(data) && + data && typeof data === 'object' && typeof data.text === 'string' + if (isTranscriptArray) { + for (const segment of data) { + this._outputCb(addon, 'Output', jobId, [segment], null) + } + } else if (isSingleTranscript) { + this._outputCb(addon, 'Output', jobId, [data], null) + } else { + this._outputCb(addon, 'Output', jobId, data, null) + } + } + return } if (this._outputCb != null) {