Add Faster Whisper (#400)
chidiwilliams authored Apr 10, 2023
1 parent b807302 commit 86626aa
Showing 7 changed files with 419 additions and 57 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -10,7 +10,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
 [![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)

 <blockquote>
-<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more for $9.99.</p>
+<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more.</p>
 <a href="https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://tools.applemediaservices.com/api/badges/download-on-the-mac-app-store/black/en-us?size=250x83&amp;releaseDate=1679529600" alt="Download on the Mac App Store" /></a>
 </blockquote>

@@ -23,7 +23,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
 - Import audio and video files and export transcripts to TXT, SRT, and
   VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
 - Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
-  [Whisper.cpp](https://github.com/ggerganov/whisper.cpp),
+  [Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
   [Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
   the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
 - Available on Mac, Windows, and Linux
8 changes: 7 additions & 1 deletion buzz/gui.py
@@ -188,6 +188,11 @@ class DownloadModelProgressDialog(QProgressDialog):
     def __init__(self, parent: Optional[QWidget], *args) -> None:
         super().__init__(_('Downloading model (0%, unknown time remaining)'),
                          _('Cancel'), 0, 100, parent, *args)
+
+        # Set a high minimum duration to avoid showing the dialog for models that
+        # are checked locally but set progress to 0 immediately, e.g. Hugging Face
+        # or Faster Whisper models
+        self.setMinimumDuration(10_000)
+
         self.setWindowModality(Qt.WindowModality.ApplicationModal)
         self.start_time = datetime.now()
         self.setFixedSize(self.size())
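This hunk leans on how Qt throttles progress dialogs. A minimal sketch of that behaviour, assuming PyQt6 (which Buzz already uses) and an illustrative label:

```python
import sys

from PyQt6.QtWidgets import QApplication, QProgressDialog

app = QApplication(sys.argv)

dialog = QProgressDialog('Downloading model...', 'Cancel', 0, 100)
# Qt only pops the dialog up once minimumDuration has elapsed and the task
# still looks like it will run longer than that, so a generous value here
# suppresses the dialog entirely for fast, locally cached checks.
dialog.setMinimumDuration(10_000)

dialog.setValue(0)    # a cached model reports 0%...
dialog.setValue(100)  # ...then jumps to 100% at once: the dialog never shows
```

Setting the minimum duration back to 0 would instead show the dialog as soon as any progress is set.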
@@ -1373,7 +1378,8 @@ def reset_visible_rows(self):
         model_type = self.transcription_options.model.model_type
         self.form_layout.setRowVisible(self.hugging_face_search_line_edit, model_type == ModelType.HUGGING_FACE)
         self.form_layout.setRowVisible(self.whisper_model_size_combo_box,
-                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP))
+                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP) or (
+                                               model_type == ModelType.FASTER_WHISPER))
         self.form_layout.setRowVisible(self.openai_access_token_edit, model_type == ModelType.OPEN_AI_WHISPER_API)

     def on_model_type_changed(self, text: str):
7 changes: 7 additions & 0 deletions buzz/model_loader.py
@@ -6,6 +6,7 @@
 from dataclasses import dataclass
 from typing import Optional

+import faster_whisper
 import requests
 import whisper
 from PyQt6.QtCore import QObject, pyqtSignal, pyqtSlot
@@ -26,6 +27,7 @@ class ModelType(enum.Enum):
     WHISPER = 'Whisper'
     WHISPER_CPP = 'Whisper.cpp'
     HUGGING_FACE = 'Hugging Face'
+    FASTER_WHISPER = 'Faster Whisper'
     OPEN_AI_WHISPER_API = 'OpenAI Whisper API'


@@ -99,6 +101,11 @@ def run(self):
         elif self.model_type == ModelType.OPEN_AI_WHISPER_API:
             file_path = ""

+        elif self.model_type == ModelType.FASTER_WHISPER:
+            self.progress.emit((0, 100))
+            file_path = faster_whisper.download_model(size=self.whisper_model_size.value)
+            self.progress.emit((100, 100))
+
         else:
             raise Exception("Invalid model type: " + self.model_type.value)

141 changes: 100 additions & 41 deletions buzz/transcriber.py
@@ -18,12 +18,15 @@
 from random import randint
 from threading import Thread
 from typing import Any, List, Optional, Tuple, Union, Set

+import faster_whisper
 import openai

 import ffmpeg
 import numpy as np
 import sounddevice
 import stable_whisper
+import tqdm
 import whisper
 from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread
 from sounddevice import PortAudioError
@@ -431,7 +434,7 @@ def transcribe(self) -> List[Segment]:

         recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)

-        self.current_process = multiprocessing.Process(target=transcribe_whisper,
+        self.current_process = multiprocessing.Process(target=self.transcribe_whisper,
                                                        args=(send_pipe, self.transcription_task))
         if not self.stopped:
             self.current_process.start()
@@ -457,6 +460,97 @@ def transcribe(self) -> List[Segment]:

         return self.segments

+    @classmethod
+    def transcribe_whisper(cls, stderr_conn: Connection, task: FileTranscriptionTask) -> None:
+        with pipe_stderr(stderr_conn):
+            if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
+                segments = cls.transcribe_hugging_face(task)
+            elif task.transcription_options.model.model_type == ModelType.FASTER_WHISPER:
+                segments = cls.transcribe_faster_whisper(task)
+            elif task.transcription_options.model.model_type == ModelType.WHISPER:
+                segments = cls.transcribe_openai_whisper(task)
+            else:
+                raise Exception(f"Invalid model type: {task.transcription_options.model.model_type}")
+
+            segments_json = json.dumps(
+                segments, ensure_ascii=True, default=vars)
+            sys.stderr.write(f'segments = {segments_json}\n')
+            sys.stderr.write(
+                WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')
+
+    @classmethod
+    def transcribe_hugging_face(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = transformers_whisper.load_model(task.model_path)
+        language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
+        result = model.transcribe(audio=task.file_path, language=language,
+                                  task=task.transcription_options.task.value, verbose=False)
+        return [
+            Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in result.get('segments')]
+
+    @classmethod
+    def transcribe_faster_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = faster_whisper.WhisperModel(
+            model_size_or_path=task.transcription_options.model.whisper_model_size.value)
+        whisper_segments, info = model.transcribe(audio=task.file_path,
+                                                  language=task.transcription_options.language,
+                                                  task=task.transcription_options.task.value,
+                                                  temperature=task.transcription_options.temperature,
+                                                  initial_prompt=task.transcription_options.initial_prompt,
+                                                  word_timestamps=task.transcription_options.word_level_timings)
+        segments = []
+        with tqdm.tqdm(total=round(info.duration, 2), unit=' seconds') as pbar:
+            for segment in list(whisper_segments):
+                # Segment will contain words if word-level timings is True
+                if segment.words:
+                    for word in segment.words:
+                        segments.append(Segment(
+                            start=int(word.start * 1000),
+                            end=int(word.end * 1000),
+                            text=word.word
+                        ))
+                else:
+                    segments.append(Segment(
+                        start=int(segment.start * 1000),
+                        end=int(segment.end * 1000),
+                        text=segment.text
+                    ))
+
+                pbar.update(segment.end - segment.start)
+        return segments
+
+    @classmethod
+    def transcribe_openai_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = whisper.load_model(task.model_path)
+
+        if task.transcription_options.word_level_timings:
+            stable_whisper.modify_model(model)
+            result = model.transcribe(
+                audio=task.file_path, language=task.transcription_options.language,
+                task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
+                initial_prompt=task.transcription_options.initial_prompt, pbar=True)
+            segments = stable_whisper.group_word_timestamps(result)
+            return [Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in segments]
+
+        result = model.transcribe(
+            audio=task.file_path, language=task.transcription_options.language,
+            task=task.transcription_options.task.value,
+            temperature=task.transcription_options.temperature,
+            initial_prompt=task.transcription_options.initial_prompt, verbose=False)
+        segments = result.get('segments')
+        return [Segment(
+            start=int(segment.get('start') * 1000),
+            end=int(segment.get('end') * 1000),
+            text=segment.get('text'),
+        ) for segment in segments]

     def stop(self):
         self.stopped = True
         if self.started_process:
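Worth noting alongside `transcribe_faster_whisper` above: `WhisperModel.transcribe` returns a generator plus a `TranscriptionInfo`, and decoding only happens as the generator is consumed, so the `list(...)` call in the hunk materialises every segment before the progress loop starts. A hedged sketch of the lazy alternative, with the model size and file name illustrative:

```python
import faster_whisper
import tqdm

model = faster_whisper.WhisperModel(model_size_or_path='tiny')

# transcribe() returns immediately; segments are decoded lazily.
segments, info = model.transcribe(audio='audio.mp3', word_timestamps=True)

with tqdm.tqdm(total=round(info.duration, 2), unit=' seconds') as pbar:
    for segment in segments:  # decoding happens here, segment by segment
        pbar.update(segment.end - segment.start)
```

Iterating directly keeps the progress bar moving while audio is still being transcribed, at the cost of not knowing the segment count up front.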
@@ -489,44 +583,6 @@ def read_line(self, pipe: Connection):
                 continue


-def transcribe_whisper(stderr_conn: Connection, task: FileTranscriptionTask):
-    with pipe_stderr(stderr_conn):
-        if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
-            model = transformers_whisper.load_model(task.model_path)
-            language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
-            result = model.transcribe(audio=task.file_path, language=language,
-                                      task=task.transcription_options.task.value, verbose=False)
-            whisper_segments = result.get('segments')
-        else:
-            model = whisper.load_model(task.model_path)
-            if task.transcription_options.word_level_timings:
-                stable_whisper.modify_model(model)
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, pbar=True)
-                whisper_segments = stable_whisper.group_word_timestamps(result)
-            else:
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value,
-                    temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, verbose=False)
-                whisper_segments = result.get('segments')
-
-        segments = [
-            Segment(
-                start=int(segment.get('start') * 1000),
-                end=int(segment.get('end') * 1000),
-                text=segment.get('text'),
-            ) for segment in whisper_segments]
-        segments_json = json.dumps(
-            segments, ensure_ascii=True, default=vars)
-        sys.stderr.write(f'segments = {segments_json}\n')
-        sys.stderr.write(
-            WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')
-
-
 def write_output(path: str, segments: List[Segment], output_format: OutputFormat):
     logging.debug(
         'Writing transcription output, path = %s, output format = %s, number of segments = %s', path, output_format,
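Both the removed function and its classmethod replacement hand results back to the parent process by writing a JSON line to the piped stderr, followed by a stop token. A simplified, self-contained sketch of that round-trip; the names and the plain `Connection.send` transport here are illustrative, not Buzz's actual `pipe_stderr` implementation:

```python
import json
import multiprocessing
from multiprocessing.connection import Connection

STOP_TOKEN = '[STOP]'  # hypothetical stand-in for READ_LINE_THREAD_STOP_TOKEN

def child(conn: Connection) -> None:
    # The child serializes its segments behind a known 'segments = ' prefix.
    segments = [{'start': 0, 'end': 1500, 'text': 'hello'}]
    conn.send(f'segments = {json.dumps(segments)}\n')
    conn.send(STOP_TOKEN + '\n')

if __name__ == '__main__':
    recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
    process = multiprocessing.Process(target=child, args=(send_pipe,))
    process.start()
    while True:
        line = recv_pipe.recv().strip()
        if line == STOP_TOKEN:
            break
        if line.startswith('segments = '):
            # The parent scans each line for the prefix and parses the JSON.
            print(json.loads(line[len('segments = '):]))
    process.join()
```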
@@ -681,9 +737,12 @@ def run(self):
                 task=self.current_task)
             elif model_type == ModelType.OPEN_AI_WHISPER_API:
                 self.current_transcriber = OpenAIWhisperAPIFileTranscriber(task=self.current_task)
+            elif model_type == ModelType.HUGGING_FACE or \
+                    model_type == ModelType.WHISPER or \
+                    model_type == ModelType.FASTER_WHISPER:
+                self.current_transcriber = WhisperFileTranscriber(task=self.current_task)
             else:
-                self.current_transcriber = WhisperFileTranscriber(
-                    task=self.current_task)
+                raise Exception(f'Unknown model type: {model_type}')

             self.current_transcriber_thread = QThread(self)
Expand Down
Loading

0 comments on commit 86626aa

Please sign in to comment.