Add Faster Whisper #400

Merged: 9 commits, Apr 10, 2023
README.md (4 changes: 2 additions & 2 deletions)

@@ -10,7 +10,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)

<blockquote>
-<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more for $9.99.</p>
+<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more.</p>
<a href="https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://tools.applemediaservices.com/api/badges/download-on-the-mac-app-store/black/en-us?size=250x83&amp;releaseDate=1679529600" alt="Download on the Mac App Store" /></a>
</blockquote>

@@ -23,7 +23,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
- Import audio and video files and export transcripts to TXT, SRT, and
VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
- Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
-[Whisper.cpp](https://github.com/ggerganov/whisper.cpp),
+[Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
- Available on Mac, Windows, and Linux
buzz/gui.py (8 changes: 7 additions & 1 deletion)

@@ -188,6 +188,11 @@ class DownloadModelProgressDialog(QProgressDialog):
    def __init__(self, parent: Optional[QWidget], *args) -> None:
        super().__init__(_('Downloading model (0%, unknown time remaining)'),
                         _('Cancel'), 0, 100, parent, *args)
+
+        # Setting this to a high value to avoid showing the dialog for models that
+        # are checked locally but set progress to 0 immediately, i.e. Hugging Face or Faster Whisper models
+        self.setMinimumDuration(10_000)
+
        self.setWindowModality(Qt.WindowModality.ApplicationModal)
        self.start_time = datetime.now()
        self.setFixedSize(self.size())
@@ -1373,7 +1378,8 @@ def reset_visible_rows(self):
        model_type = self.transcription_options.model.model_type
        self.form_layout.setRowVisible(self.hugging_face_search_line_edit, model_type == ModelType.HUGGING_FACE)
        self.form_layout.setRowVisible(self.whisper_model_size_combo_box,
-                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP))
+                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP) or (
+                                               model_type == ModelType.FASTER_WHISPER))
        self.form_layout.setRowVisible(self.openai_access_token_edit, model_type == ModelType.OPEN_AI_WHISPER_API)

    def on_model_type_changed(self, text: str):
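The new setMinimumDuration(10_000) call above relies on QProgressDialog suppressing itself for work that finishes before the minimum duration has elapsed. A minimal PyQt6 sketch of that behaviour, separate from the diff and with illustrative labels:

import sys

from PyQt6.QtWidgets import QApplication, QProgressDialog

app = QApplication(sys.argv)

dialog = QProgressDialog('Downloading model...', 'Cancel', 0, 100)
dialog.setMinimumDuration(10_000)  # only appear if the work outlives 10 seconds

# A model that is already cached locally reports 0% and then 100% almost
# immediately, so the dialog is never shown at all.
dialog.setValue(0)
dialog.setValue(100)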
buzz/model_loader.py (7 changes: 7 additions & 0 deletions)

@@ -6,6 +6,7 @@
from dataclasses import dataclass
from typing import Optional

+import faster_whisper
import requests
import whisper
from PyQt6.QtCore import QObject, pyqtSignal, pyqtSlot
@@ -26,6 +27,7 @@ class ModelType(enum.Enum):
    WHISPER = 'Whisper'
    WHISPER_CPP = 'Whisper.cpp'
    HUGGING_FACE = 'Hugging Face'
+    FASTER_WHISPER = 'Faster Whisper'
    OPEN_AI_WHISPER_API = 'OpenAI Whisper API'


@@ -99,6 +101,11 @@ def run(self):
        elif self.model_type == ModelType.OPEN_AI_WHISPER_API:
            file_path = ""

+        elif self.model_type == ModelType.FASTER_WHISPER:
+            self.progress.emit((0, 100))
+            file_path = faster_whisper.download_model(size=self.whisper_model_size.value)
+            self.progress.emit((100, 100))
+
        else:
            raise Exception("Invalid model type: " + self.model_type.value)

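A standalone sketch of the download step the new FASTER_WHISPER branch performs; the size= keyword mirrors the call in the diff (newer faster-whisper releases name the parameter size_or_id), and the model size here is illustrative:

import faster_whisper

# Downloads (or reuses a cached copy of) the CTranslate2 conversion of the
# Whisper model and returns the local directory it was saved to.
model_path = faster_whisper.download_model(size='tiny')
print(model_path)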
buzz/transcriber.py (141 changes: 100 additions & 41 deletions)

@@ -18,12 +18,15 @@
from random import randint
from threading import Thread
from typing import Any, List, Optional, Tuple, Union, Set

+import faster_whisper
import openai

import ffmpeg
import numpy as np
import sounddevice
import stable_whisper
import tqdm
import whisper
from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread
from sounddevice import PortAudioError
@@ -431,7 +434,7 @@ def transcribe(self) -> List[Segment]:

        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)

-        self.current_process = multiprocessing.Process(target=transcribe_whisper,
+        self.current_process = multiprocessing.Process(target=self.transcribe_whisper,
                                                       args=(send_pipe, self.transcription_task))
        if not self.stopped:
            self.current_process.start()
@@ -457,6 +460,97 @@ def transcribe(self) -> List[Segment]:

        return self.segments

+    @classmethod
+    def transcribe_whisper(cls, stderr_conn: Connection, task: FileTranscriptionTask) -> None:
+        with pipe_stderr(stderr_conn):
+            if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
+                segments = cls.transcribe_hugging_face(task)
+            elif task.transcription_options.model.model_type == ModelType.FASTER_WHISPER:
+                segments = cls.transcribe_faster_whisper(task)
+            elif task.transcription_options.model.model_type == ModelType.WHISPER:
+                segments = cls.transcribe_openai_whisper(task)
+            else:
+                raise Exception(f"Invalid model type: {task.transcription_options.model.model_type}")
+
+            segments_json = json.dumps(
+                segments, ensure_ascii=True, default=vars)
+            sys.stderr.write(f'segments = {segments_json}\n')
+            sys.stderr.write(
+                WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')
+
+    @classmethod
+    def transcribe_hugging_face(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = transformers_whisper.load_model(task.model_path)
+        language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
+        result = model.transcribe(audio=task.file_path, language=language,
+                                  task=task.transcription_options.task.value, verbose=False)
+        return [
+            Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in result.get('segments')]
+
+    @classmethod
+    def transcribe_faster_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = faster_whisper.WhisperModel(
+            model_size_or_path=task.transcription_options.model.whisper_model_size.value)
+        whisper_segments, info = model.transcribe(audio=task.file_path,
+                                                  language=task.transcription_options.language,
+                                                  task=task.transcription_options.task.value,
+                                                  temperature=task.transcription_options.temperature,
+                                                  initial_prompt=task.transcription_options.initial_prompt,
+                                                  word_timestamps=task.transcription_options.word_level_timings)
+        segments = []
+        with tqdm.tqdm(total=round(info.duration, 2), unit=' seconds') as pbar:
+            for segment in list(whisper_segments):
+                # Segment will contain words if word-level timings is True
+                if segment.words:
+                    for word in segment.words:
+                        segments.append(Segment(
+                            start=int(word.start * 1000),
+                            end=int(word.end * 1000),
+                            text=word.word
+                        ))
+                else:
+                    segments.append(Segment(
+                        start=int(segment.start * 1000),
+                        end=int(segment.end * 1000),
+                        text=segment.text
+                    ))
+
+                pbar.update(segment.end - segment.start)
+        return segments
+
+    @classmethod
+    def transcribe_openai_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = whisper.load_model(task.model_path)
+
+        if task.transcription_options.word_level_timings:
+            stable_whisper.modify_model(model)
+            result = model.transcribe(
+                audio=task.file_path, language=task.transcription_options.language,
+                task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
+                initial_prompt=task.transcription_options.initial_prompt, pbar=True)
+            segments = stable_whisper.group_word_timestamps(result)
+            return [Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in segments]
+
+        result = model.transcribe(
+            audio=task.file_path, language=task.transcription_options.language,
+            task=task.transcription_options.task.value,
+            temperature=task.transcription_options.temperature,
+            initial_prompt=task.transcription_options.initial_prompt, verbose=False)
+        segments = result.get('segments')
+        return [Segment(
+            start=int(segment.get('start') * 1000),
+            end=int(segment.get('end') * 1000),
+            text=segment.get('text'),
+        ) for segment in segments]

    def stop(self):
        self.stopped = True
        if self.started_process:
@@ -489,44 +583,6 @@ def read_line(self, pipe: Connection):
continue


-def transcribe_whisper(stderr_conn: Connection, task: FileTranscriptionTask):
-    with pipe_stderr(stderr_conn):
-        if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
-            model = transformers_whisper.load_model(task.model_path)
-            language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
-            result = model.transcribe(audio=task.file_path, language=language,
-                                      task=task.transcription_options.task.value, verbose=False)
-            whisper_segments = result.get('segments')
-        else:
-            model = whisper.load_model(task.model_path)
-            if task.transcription_options.word_level_timings:
-                stable_whisper.modify_model(model)
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, pbar=True)
-                whisper_segments = stable_whisper.group_word_timestamps(result)
-            else:
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value,
-                    temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, verbose=False)
-                whisper_segments = result.get('segments')
-
-        segments = [
-            Segment(
-                start=int(segment.get('start') * 1000),
-                end=int(segment.get('end') * 1000),
-                text=segment.get('text'),
-            ) for segment in whisper_segments]
-        segments_json = json.dumps(
-            segments, ensure_ascii=True, default=vars)
-        sys.stderr.write(f'segments = {segments_json}\n')
-        sys.stderr.write(
-            WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')


def write_output(path: str, segments: List[Segment], output_format: OutputFormat):
    logging.debug(
        'Writing transcription output, path = %s, output format = %s, number of segments = %s', path, output_format,
@@ -681,9 +737,12 @@ def run(self):
                task=self.current_task)
        elif model_type == ModelType.OPEN_AI_WHISPER_API:
            self.current_transcriber = OpenAIWhisperAPIFileTranscriber(task=self.current_task)
+        elif model_type == ModelType.HUGGING_FACE or \
+                model_type == ModelType.WHISPER or \
+                model_type == ModelType.FASTER_WHISPER:
+            self.current_transcriber = WhisperFileTranscriber(task=self.current_task)
        else:
-            self.current_transcriber = WhisperFileTranscriber(
-                task=self.current_task)
+            raise Exception(f'Unknown model type: {model_type}')

        self.current_transcriber_thread = QThread(self)

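For reference, a standalone sketch of the faster-whisper calls that transcribe_faster_whisper above wraps; the audio path and model size are placeholders, and word_timestamps corresponds to Buzz's word-level timings option:

import faster_whisper

model = faster_whisper.WhisperModel('tiny')

# transcribe() returns a TranscriptionInfo object plus a generator of segments;
# the segments are produced lazily as the generator is consumed, so the
# transcriber above materializes them with list() before looping.
segments, info = model.transcribe('audio.mp3', task='transcribe', word_timestamps=True)

for segment in segments:
    for word in segment.words or []:
        print(f'{word.start:.2f}-{word.end:.2f}: {word.word}')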