Skip to content

Commit

Permalink
Fix audio processing edge case (#237)
Browse files Browse the repository at this point in the history
* Fix xcconfig tracking

* Add package.swift docs to readme

* Fix edge case where framePosition does not align with actual frame count of AVAudioFile

* Upgrade github runner macos version

* Update remaining github runner versions

* Use WERUtils to check vad accuracy

* Reduce calls to framePosition

* Fix xcode version for runner
  • Loading branch information
ZachNagengast authored Nov 1, 2024
1 parent a9b92c4 commit dd2eb73
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 29 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/development-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ jobs:
name: "Build and Test"
uses: ./.github/workflows/unit-tests.yml
with:
ios-version: "17.2"
macos-runner: "macos-14"
ios-version: "18.1"
macos-runner: "macos-15"

check-approvals:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pre-release-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ jobs:
include:
- os: macos-13-xlarge
ios-version: "16.1" # Oldest available version
- os: macos-14
ios-version: "17.2" # Latest available version
- os: macos-15
ios-version: "18.1" # Latest available version
uses: ./.github/workflows/unit-tests.yml
with:
ios-version: ${{ matrix.ios-version }}
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ jobs:
name: "iOS",
condition: true,
clean-destination: "generic/platform=iOS",
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 15",
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 16",
}
- {
name: "watchOS",
condition: "${{ inputs.macos-runner == 'macos-14' }}",
condition: "${{ inputs.macos-runner == 'macos-15' }}",
clean-destination: "generic/platform=watchOS",
test-destination: "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)",
test-destination: "platform=watchOS Simulator,OS=11.1,name=Apple Watch Ultra 2 (49mm)",
}
- {
name: "visionOS",
condition: "${{ inputs.macos-runner == 'macos-14' }}",
condition: "${{ inputs.macos-runner == 'macos-15' }}",
clean-destination: "generic/platform=visionOS",
test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
}
Expand All @@ -46,7 +46,7 @@ jobs:
- uses: actions/checkout@v4
- uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: "15.2"
xcode-version: latest-stable
- name: Setup environment
run: make setup
- name: Setup Cache
Expand All @@ -66,7 +66,7 @@ jobs:
echo "Destinations for testing:"
xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
- name: Boot Simulator and Wait
if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-14' }}
if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-15' }}
# Slower runners require some time to fully boot the simulator
# Parse the simulator name from the destination string, boot it, and wait
run: |
Expand Down
12 changes: 7 additions & 5 deletions Sources/WhisperKit/Core/Audio/AudioProcessor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
}

let inputBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: maxReadFrameSize)!

while audioFile.framePosition < endFramePosition {
let remainingFrames = AVAudioFrameCount(endFramePosition - audioFile.framePosition)
var nextPosition = inputStartFrame
while nextPosition < endFramePosition {
let framePosition = audioFile.framePosition
let remainingFrames = AVAudioFrameCount(endFramePosition - framePosition)
let framesToRead = min(remainingFrames, maxReadFrameSize)
nextPosition = framePosition + Int64(framesToRead)

let currentPositionInSeconds = Double(audioFile.framePosition) / inputSampleRate
let nextPositionInSeconds = (Double(audioFile.framePosition) + Double(framesToRead)) / inputSampleRate
let currentPositionInSeconds = Double(framePosition) / inputSampleRate
let nextPositionInSeconds = Double(nextPosition) / inputSampleRate
Logging.debug("Resampling \(String(format: "%.2f", currentPositionInSeconds))s - \(String(format: "%.2f", nextPositionInSeconds))s")

do {
Expand Down
10 changes: 10 additions & 0 deletions Tests/WhisperKitTests/Evaluate/WERUtils.swift
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,14 @@ enum WERUtils {
let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
return diff
}

/// Renders a word-level diff as a single space-separated string.
///
/// Each row is read as `[word, changeType]`. Rows whose word is missing, `nil`, or a
/// single space are skipped. When a change type is present it is prefixed directly to
/// the word; otherwise the word is emitted unchanged.
///
/// - Parameter diff: Diff rows, e.g. the second element of the tuple returned by `evaluate`.
/// - Returns: The retained words (with any change-type prefix) joined by single spaces,
///   or an empty string when no rows are retained.
static func diffString(from diff: [[String?]]) -> String {
    return diff.compactMap { entry -> String? in
        // `first` yields `String??`; flattening it skips both an empty row and a nil word
        // instead of trapping on `entry[0]`.
        guard let word = entry.first ?? nil, word != " " else { return nil }
        // Rows shorter than two elements carry no change marker; do not index past the end.
        guard entry.count > 1, let changeType = entry[1] else { return word }
        return "\(changeType)\(word)"
    }.joined(separator: " ")
}
}
28 changes: 14 additions & 14 deletions Tests/WhisperKitTests/UnitTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1416,31 +1416,31 @@ final class UnitTests: XCTestCase {
}

func testVADAudioChunkerAccuracy() async throws {
let testResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: DecodingOptions(), audioFile: "ted_60.m4a"),
let options = DecodingOptions(temperatureFallbackCount: 0, chunkingStrategy: .vad)

let chunkedResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
"Failed to transcribe"
)

let options = DecodingOptions(chunkingStrategy: .vad)
let clipTimestamps = chunkedResult.compactMap(\.seekTime)
XCTAssertEqual(clipTimestamps, [0, 22.9, 39], "Clip timestamps should match the expected values, found \(clipTimestamps)")

let chunkedResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
// Run the test using same seek values for accuracy comparison
let testResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: DecodingOptions(temperatureFallbackCount: 0, clipTimestamps: [0, 22.9, 22.9, 39, 39, 60]), audioFile: "ted_60.m4a"),
"Failed to transcribe"
)

XCTAssertFalse(testResult.text.isEmpty, "The test text should not be empty")
XCTAssertFalse(chunkedResult.text.isEmpty, "The chunked text should not be empty")

// Select few sentences to compare at VAD border
// TODO: test that WER is in acceptable range
// XCTAssertTrue(testResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(testResult.text.normalized)")
// XCTAssertTrue(chunkedResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
//
// XCTAssertTrue(testResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(testResult.text.normalized)")
// XCTAssertTrue(chunkedResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
// Check WER for the full audio and the chunked audio
let (wer, diff) = WERUtils.evaluate(originalTranscript: testResult.text, generatedTranscript: chunkedResult.text)

let diffDescription = WERUtils.diffString(from: diff)

XCTAssertTrue(testResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(testResult.text.normalized)")
XCTAssertTrue(chunkedResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
XCTAssertEqual(wer, 0.0, "Transcripts should match with a WER of 0, found \(wer). Full diff: \(diffDescription)")
}

#if !os(watchOS) // FIXME: This test times out on watchOS when run on low compute runners
Expand Down

0 comments on commit dd2eb73

Please sign in to comment.