Add Faster Whisper (#400)
chidiwilliams authored Apr 10, 2023
1 parent b807302 commit 86626aa
Showing 7 changed files with 419 additions and 57 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -10,7 +10,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
 [![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)

 <blockquote>
-<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more for $9.99.</p>
+<p>Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more.</p>
 <a href="https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://tools.applemediaservices.com/api/badges/download-on-the-mac-app-store/black/en-us?size=250x83&amp;releaseDate=1679529600" alt="Download on the Mac App Store" /></a>
 </blockquote>

@@ -23,7 +23,7 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
 - Import audio and video files and export transcripts to TXT, SRT, and
   VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
 - Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
-  [Whisper.cpp](https://github.com/ggerganov/whisper.cpp),
+  [Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
   [Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
   the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
 - Available on Mac, Windows, and Linux
8 changes: 7 additions & 1 deletion buzz/gui.py
@@ -188,6 +188,11 @@ class DownloadModelProgressDialog(QProgressDialog):
     def __init__(self, parent: Optional[QWidget], *args) -> None:
         super().__init__(_('Downloading model (0%, unknown time remaining)'),
                          _('Cancel'), 0, 100, parent, *args)
+
+        # Set a high minimum duration to avoid showing the dialog for models that
+        # are checked locally but set progress to 0 immediately, e.g. Hugging Face
+        # or Faster Whisper models
+        self.setMinimumDuration(10_000)
+
         self.setWindowModality(Qt.WindowModality.ApplicationModal)
         self.start_time = datetime.now()
         self.setFixedSize(self.size())
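This hunk leans on how Qt throttles progress dialogs. A minimal sketch of that behaviour, assuming PyQt6 (which Buzz already uses) and an illustrative label:

```python
import sys

from PyQt6.QtWidgets import QApplication, QProgressDialog

app = QApplication(sys.argv)

dialog = QProgressDialog('Downloading model...', 'Cancel', 0, 100)
# Qt only pops the dialog up once minimumDuration has elapsed and the task
# still looks like it will run longer than that, so a generous value here
# suppresses the dialog entirely for fast, locally cached checks.
dialog.setMinimumDuration(10_000)

dialog.setValue(0)    # a cached model reports 0%...
dialog.setValue(100)  # ...then jumps to 100% at once: the dialog never shows
```

Setting the minimum duration back to 0 would instead show the dialog as soon as any progress is set.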
@@ -1373,7 +1378,8 @@ def reset_visible_rows(self):
         model_type = self.transcription_options.model.model_type
         self.form_layout.setRowVisible(self.hugging_face_search_line_edit, model_type == ModelType.HUGGING_FACE)
         self.form_layout.setRowVisible(self.whisper_model_size_combo_box,
-                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP))
+                                       (model_type == ModelType.WHISPER) or (model_type == ModelType.WHISPER_CPP) or (
+                                               model_type == ModelType.FASTER_WHISPER))
         self.form_layout.setRowVisible(self.openai_access_token_edit, model_type == ModelType.OPEN_AI_WHISPER_API)

     def on_model_type_changed(self, text: str):
7 changes: 7 additions & 0 deletions buzz/model_loader.py
@@ -6,6 +6,7 @@
 from dataclasses import dataclass
 from typing import Optional

+import faster_whisper
 import requests
 import whisper
 from PyQt6.QtCore import QObject, pyqtSignal, pyqtSlot
@@ -26,6 +27,7 @@ class ModelType(enum.Enum):
     WHISPER = 'Whisper'
     WHISPER_CPP = 'Whisper.cpp'
     HUGGING_FACE = 'Hugging Face'
+    FASTER_WHISPER = 'Faster Whisper'
     OPEN_AI_WHISPER_API = 'OpenAI Whisper API'


@@ -99,6 +101,11 @@ def run(self):
         elif self.model_type == ModelType.OPEN_AI_WHISPER_API:
             file_path = ""

+        elif self.model_type == ModelType.FASTER_WHISPER:
+            self.progress.emit((0, 100))
+            file_path = faster_whisper.download_model(size=self.whisper_model_size.value)
+            self.progress.emit((100, 100))
+
         else:
             raise Exception("Invalid model type: " + self.model_type.value)

141 changes: 100 additions & 41 deletions buzz/transcriber.py
@@ -18,12 +18,15 @@
 from random import randint
 from threading import Thread
 from typing import Any, List, Optional, Tuple, Union, Set

+import faster_whisper
 import openai

 import ffmpeg
 import numpy as np
 import sounddevice
 import stable_whisper
+import tqdm
 import whisper
 from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread
 from sounddevice import PortAudioError
@@ -431,7 +434,7 @@ def transcribe(self) -> List[Segment]:

         recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)

-        self.current_process = multiprocessing.Process(target=transcribe_whisper,
+        self.current_process = multiprocessing.Process(target=self.transcribe_whisper,
                                                        args=(send_pipe, self.transcription_task))
         if not self.stopped:
             self.current_process.start()
@@ -457,6 +460,97 @@ def transcribe(self) -> List[Segment]:

         return self.segments

+    @classmethod
+    def transcribe_whisper(cls, stderr_conn: Connection, task: FileTranscriptionTask) -> None:
+        with pipe_stderr(stderr_conn):
+            if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
+                segments = cls.transcribe_hugging_face(task)
+            elif task.transcription_options.model.model_type == ModelType.FASTER_WHISPER:
+                segments = cls.transcribe_faster_whisper(task)
+            elif task.transcription_options.model.model_type == ModelType.WHISPER:
+                segments = cls.transcribe_openai_whisper(task)
+            else:
+                raise Exception(f"Invalid model type: {task.transcription_options.model.model_type}")
+
+            segments_json = json.dumps(
+                segments, ensure_ascii=True, default=vars)
+            sys.stderr.write(f'segments = {segments_json}\n')
+            sys.stderr.write(
+                WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')
+
+    @classmethod
+    def transcribe_hugging_face(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = transformers_whisper.load_model(task.model_path)
+        language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
+        result = model.transcribe(audio=task.file_path, language=language,
+                                  task=task.transcription_options.task.value, verbose=False)
+        return [
+            Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in result.get('segments')]
+
+    @classmethod
+    def transcribe_faster_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = faster_whisper.WhisperModel(
+            model_size_or_path=task.transcription_options.model.whisper_model_size.value)
+        whisper_segments, info = model.transcribe(audio=task.file_path,
+                                                  language=task.transcription_options.language,
+                                                  task=task.transcription_options.task.value,
+                                                  temperature=task.transcription_options.temperature,
+                                                  initial_prompt=task.transcription_options.initial_prompt,
+                                                  word_timestamps=task.transcription_options.word_level_timings)
+        segments = []
+        with tqdm.tqdm(total=round(info.duration, 2), unit=' seconds') as pbar:
+            for segment in list(whisper_segments):
+                # Segment will contain words if word-level timings is True
+                if segment.words:
+                    for word in segment.words:
+                        segments.append(Segment(
+                            start=int(word.start * 1000),
+                            end=int(word.end * 1000),
+                            text=word.word
+                        ))
+                else:
+                    segments.append(Segment(
+                        start=int(segment.start * 1000),
+                        end=int(segment.end * 1000),
+                        text=segment.text
+                    ))
+
+                pbar.update(segment.end - segment.start)
+        return segments
+
+    @classmethod
+    def transcribe_openai_whisper(cls, task: FileTranscriptionTask) -> List[Segment]:
+        model = whisper.load_model(task.model_path)
+
+        if task.transcription_options.word_level_timings:
+            stable_whisper.modify_model(model)
+            result = model.transcribe(
+                audio=task.file_path, language=task.transcription_options.language,
+                task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
+                initial_prompt=task.transcription_options.initial_prompt, pbar=True)
+            segments = stable_whisper.group_word_timestamps(result)
+            return [Segment(
+                start=int(segment.get('start') * 1000),
+                end=int(segment.get('end') * 1000),
+                text=segment.get('text'),
+            ) for segment in segments]
+
+        result = model.transcribe(
+            audio=task.file_path, language=task.transcription_options.language,
+            task=task.transcription_options.task.value,
+            temperature=task.transcription_options.temperature,
+            initial_prompt=task.transcription_options.initial_prompt, verbose=False)
+        segments = result.get('segments')
+        return [Segment(
+            start=int(segment.get('start') * 1000),
+            end=int(segment.get('end') * 1000),
+            text=segment.get('text'),
+        ) for segment in segments]

     def stop(self):
         self.stopped = True
         if self.started_process:
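Worth noting alongside `transcribe_faster_whisper` above: `WhisperModel.transcribe` returns a generator plus a `TranscriptionInfo`, and decoding only happens as the generator is consumed, so the `list(...)` call in the hunk materialises every segment before the progress loop starts. A hedged sketch of the lazy alternative, with the model size and file name illustrative:

```python
import faster_whisper
import tqdm

model = faster_whisper.WhisperModel(model_size_or_path='tiny')

# transcribe() returns immediately; segments are decoded lazily.
segments, info = model.transcribe(audio='audio.mp3', word_timestamps=True)

with tqdm.tqdm(total=round(info.duration, 2), unit=' seconds') as pbar:
    for segment in segments:  # decoding happens here, segment by segment
        pbar.update(segment.end - segment.start)
```

Iterating directly keeps the progress bar moving while audio is still being transcribed, at the cost of not knowing the segment count up front.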
@@ -489,44 +583,6 @@ def read_line(self, pipe: Connection):
                 continue


-def transcribe_whisper(stderr_conn: Connection, task: FileTranscriptionTask):
-    with pipe_stderr(stderr_conn):
-        if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
-            model = transformers_whisper.load_model(task.model_path)
-            language = task.transcription_options.language if task.transcription_options.language is not None else 'en'
-            result = model.transcribe(audio=task.file_path, language=language,
-                                      task=task.transcription_options.task.value, verbose=False)
-            whisper_segments = result.get('segments')
-        else:
-            model = whisper.load_model(task.model_path)
-            if task.transcription_options.word_level_timings:
-                stable_whisper.modify_model(model)
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value, temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, pbar=True)
-                whisper_segments = stable_whisper.group_word_timestamps(result)
-            else:
-                result = model.transcribe(
-                    audio=task.file_path, language=task.transcription_options.language,
-                    task=task.transcription_options.task.value,
-                    temperature=task.transcription_options.temperature,
-                    initial_prompt=task.transcription_options.initial_prompt, verbose=False)
-                whisper_segments = result.get('segments')
-
-        segments = [
-            Segment(
-                start=int(segment.get('start') * 1000),
-                end=int(segment.get('end') * 1000),
-                text=segment.get('text'),
-            ) for segment in whisper_segments]
-        segments_json = json.dumps(
-            segments, ensure_ascii=True, default=vars)
-        sys.stderr.write(f'segments = {segments_json}\n')
-        sys.stderr.write(
-            WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + '\n')
-
-
 def write_output(path: str, segments: List[Segment], output_format: OutputFormat):
     logging.debug(
         'Writing transcription output, path = %s, output format = %s, number of segments = %s', path, output_format,
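Both the removed function and its classmethod replacement hand results back to the parent process by writing a JSON line to the piped stderr, followed by a stop token. A simplified, self-contained sketch of that round-trip; the names and the plain `Connection.send` transport here are illustrative, not Buzz's actual `pipe_stderr` implementation:

```python
import json
import multiprocessing
from multiprocessing.connection import Connection

STOP_TOKEN = '[STOP]'  # hypothetical stand-in for READ_LINE_THREAD_STOP_TOKEN

def child(conn: Connection) -> None:
    # The child serializes its segments behind a known 'segments = ' prefix.
    segments = [{'start': 0, 'end': 1500, 'text': 'hello'}]
    conn.send(f'segments = {json.dumps(segments)}\n')
    conn.send(STOP_TOKEN + '\n')

if __name__ == '__main__':
    recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
    process = multiprocessing.Process(target=child, args=(send_pipe,))
    process.start()
    while True:
        line = recv_pipe.recv().strip()
        if line == STOP_TOKEN:
            break
        if line.startswith('segments = '):
            # The parent scans each line for the prefix and parses the JSON.
            print(json.loads(line[len('segments = '):]))
    process.join()
```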
@@ -681,9 +737,12 @@ def run(self):
                 task=self.current_task)
             elif model_type == ModelType.OPEN_AI_WHISPER_API:
                 self.current_transcriber = OpenAIWhisperAPIFileTranscriber(task=self.current_task)
+            elif model_type == ModelType.HUGGING_FACE or \
+                    model_type == ModelType.WHISPER or \
+                    model_type == ModelType.FASTER_WHISPER:
+                self.current_transcriber = WhisperFileTranscriber(task=self.current_task)
             else:
-                self.current_transcriber = WhisperFileTranscriber(
-                    task=self.current_task)
+                raise Exception(f'Unknown model type: {model_type}')

             self.current_transcriber_thread = QThread(self)
Expand Down
Loading

0 comments on commit 86626aa

Please sign in to comment.