-
Notifications
You must be signed in to change notification settings - Fork 348
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add VoiceActivityDetector base class (#199)
* Add VoiceActivityDetector base class Add base class to allow different VAD implementations * fix spaces
- Loading branch information
Andrey Leonov
authored
Sep 5, 2024
1 parent
59aaa4e
commit c03017f
Showing
3 changed files
with
126 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// For licensing see accompanying LICENSE.md file. | ||
// Copyright © 2024 Argmax, Inc. All rights reserved. | ||
|
||
import Foundation | ||
|
||
/// Voice activity detection based on energy threshold | ||
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) | ||
final class EnergyVAD: VoiceActivityDetector { | ||
var energyThreshold: Float | ||
|
||
/// Initialize a new EnergyVAD instance | ||
/// - Parameters: | ||
/// - sampleRate: Audio sample rate | ||
/// - frameLength: Frame length in seconds | ||
/// - frameOverlap: frame overlap in seconds, this will include `frameOverlap` length audio into the `frameLength` and is helpful to catch audio that starts exactly at chunk boundaries | ||
/// - energyThreshold: minimal energy threshold | ||
convenience init( | ||
sampleRate: Int = WhisperKit.sampleRate, | ||
frameLength: Float = 0.1, | ||
frameOverlap: Float = 0.0, | ||
energyThreshold: Float = 0.02 | ||
) { | ||
self.init( | ||
sampleRate: sampleRate, | ||
// Compute frame length and overlap in number of samples | ||
frameLengthSamples: Int(frameLength * Float(sampleRate)), | ||
frameOverlapSamples: Int(frameOverlap * Float(sampleRate)), | ||
energyThreshold: energyThreshold | ||
) | ||
} | ||
|
||
required init( | ||
sampleRate: Int = 16000, | ||
frameLengthSamples: Int, | ||
frameOverlapSamples: Int = 0, | ||
energyThreshold: Float = 0.02 | ||
) { | ||
self.energyThreshold = energyThreshold | ||
super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples) | ||
} | ||
|
||
override func voiceActivity(in waveform: [Float]) -> [Bool] { | ||
let chunkRatio = Double(waveform.count) / Double(frameLengthSamples) | ||
|
||
// Round up if uneven, the final chunk will not be a full `frameLengthSamples` long | ||
let count = Int(chunkRatio.rounded(.up)) | ||
|
||
let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks( | ||
of: waveform, | ||
chunkCount: count, | ||
frameLengthSamples: frameLengthSamples, | ||
frameOverlapSamples: frameOverlapSamples, | ||
energyThreshold: energyThreshold | ||
) | ||
|
||
return chunkedVoiceActivity | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters