Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,16 @@ extern "C" {
const float * samples,
int n_samples);

// Like whisper_vad_detect_speech, but does not reset LSTM state.
// Use for streaming: call whisper_vad_reset_state() between utterances.
//
// The LSTM hidden/cell state left by the previous call is carried over into
// this one; whisper_vad_detect_speech is equivalent to calling
// whisper_vad_reset_state() followed by this function.
//
//   vctx      - VAD context whose state is used and updated
//   samples   - input audio samples
//   n_samples - number of entries in `samples`
//
// NOTE(review): returns true on the success path; the failure conditions are
// not documented here - confirm against the implementation.
WHISPER_API bool whisper_vad_detect_speech_no_reset(
struct whisper_vad_context * vctx,
const float * samples,
int n_samples);

// Reset LSTM hidden/cell states to zero.
// Intended to be called between utterances when streaming with
// whisper_vad_detect_speech_no_reset(), which does not reset the state itself.
WHISPER_API void whisper_vad_reset_state(struct whisper_vad_context * vctx);

// Accessors for the probability results stored in the VAD context.
// NOTE(review): presumably one probability per processed chunk, filled in by
// the detect-speech calls, with whisper_vad_n_probs() giving the length of the
// array returned by whisper_vad_probs() - confirm against the implementation.
WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx);
WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx);

Expand Down
17 changes: 13 additions & 4 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5083,7 +5083,11 @@ struct whisper_vad_context * whisper_vad_init_with_params(
return vctx;
}

bool whisper_vad_detect_speech(
void whisper_vad_reset_state(whisper_vad_context * vctx) {
ggml_backend_buffer_clear(vctx->buffer, 0);
}

bool whisper_vad_detect_speech_no_reset(
struct whisper_vad_context * vctx,
const float * samples,
int n_samples) {
Expand All @@ -5095,9 +5099,6 @@ bool whisper_vad_detect_speech(
WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
WHISPER_LOG_INFO("%s: n_chunks: %d\n", __func__, n_chunks);

// Reset LSTM hidden/cell states
ggml_backend_buffer_clear(vctx->buffer, 0);

vctx->probs.resize(n_chunks);
WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);

Expand Down Expand Up @@ -5165,6 +5166,14 @@ bool whisper_vad_detect_speech(
return true;
}

// Run speech detection on `samples`, starting from a freshly reset LSTM state.
//
// This is the non-streaming entry point: the state is cleared first, then the
// work is delegated to whisper_vad_detect_speech_no_reset(), whose result is
// returned unchanged.
bool whisper_vad_detect_speech(
        struct whisper_vad_context * vctx,
        const float * samples,
        int n_samples) {
    // Discard any state left over from a previous call.
    whisper_vad_reset_state(vctx);

    const bool detected_ok = whisper_vad_detect_speech_no_reset(vctx, samples, n_samples);
    return detected_ok;
}

// Return the number of entries held in `segments->data`, converted to int.
int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments) {
    const auto n_entries = segments->data.size();
    return static_cast<int>(n_entries);
}
Expand Down