From 26be62ca0ce162244001e69244d68f0570344623 Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Tue, 18 Jun 2024 22:48:06 +0300 Subject: [PATCH 1/3] Adding large v2 and v3 models --- buzz/model_loader.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/buzz/model_loader.py b/buzz/model_loader.py index 70031a547f..7a7023a396 100644 --- a/buzz/model_loader.py +++ b/buzz/model_loader.py @@ -43,10 +43,17 @@ class WhisperModelSize(str, enum.Enum): SMALL = "small" MEDIUM = "medium" LARGE = "large" + LARGEV2 = "large-v2" + LARGEV3 = "large-v3" def to_faster_whisper_model_size(self) -> str: if self == WhisperModelSize.LARGE: - return "large-v2" + return "large-v1" + return self.value + + def to_whisper_cpp_model_size(self) -> str: + if self == WhisperModelSize.LARGE: + return "large-v1" return self.value def __str__(self): @@ -201,7 +208,9 @@ def get_local_model_path(self) -> Optional[str]: "base": "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe", "small": "1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b", "medium": "6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208", - "large": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2", + "large-v1": "7d99f41a10525d0206bddadd86760181fa920438b6b33237e3118ff6c83bb53d", + "large-v2": "9a423fe4d40c82774b6af34115b8b935f34152246eb19e80e376071d3f999487", + "large-v3": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2", } @@ -318,7 +327,12 @@ def download_faster_whisper_model( % (size, ", ".join(faster_whisper.utils._MODELS)) ) - repo_id = "guillaumekln/faster-whisper-%s" % size + logging.debug("Downloading Faster Whisper model: %s", size) + + if size == WhisperModelSize.LARGEV3: + repo_id = "Systran/faster-whisper-large-v3" + else: + repo_id = "guillaumekln/faster-whisper-%s" % size allow_patterns = [ "model.bin", # largest by size first @@ -357,7 +371,7 @@ def __init__(self, model: TranscriptionModel): def 
run(self) -> None: if self.model.model_type == ModelType.WHISPER_CPP: - model_name = self.model.whisper_model_size.value + model_name = self.model.whisper_model_size.value.to_whisper_cpp_model_size() url = huggingface_hub.hf_hub_url( repo_id="ggerganov/whisper.cpp", filename=f"ggml-{model_name}.bin", From 90f73478d551fd8c7157498e48eb94bcfd604340 Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Tue, 18 Jun 2024 22:58:09 +0300 Subject: [PATCH 2/3] Adding notes on model selection --- docs/docs/faq.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/docs/faq.md b/docs/docs/faq.md index 0cb5a12c31..05cb61926a 100644 --- a/docs/docs/faq.md +++ b/docs/docs/faq.md @@ -19,4 +19,10 @@ sidebar_position: 5 Relevant tools: - Mac OS - [BlackHole](https://github.com/ExistentialAudio/BlackHole). - Windows - [VB CABLE](https://vb-audio.com/Cable/) - - Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound) \ No newline at end of file + - Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound) + +4. **What model should I use?** + + The model size to use will depend on your hardware and use case. Smaller models will work faster but will have more inaccuracies. Larger models will be more accurate but will require more powerful hardware or a longer time to transcribe. + + When choosing among large models, consider the following. "Large" is the first released, older model. "Large-V2" is a later, updated model with better accuracy; for some languages it is considered the most robust and stable. "Large-V3" is the latest model with the best accuracy in many cases, but it can sometimes hallucinate or invent words that were never in the audio. The only sure way to know which model best suits your needs is to test them all in your language. 
From 7510558958ef796951aa3e64292045332c2bf669 Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Tue, 18 Jun 2024 23:14:28 +0300 Subject: [PATCH 3/3] Fix for whisper cpp model size conversion --- buzz/model_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buzz/model_loader.py b/buzz/model_loader.py index 7a7023a396..c5e2742bbe 100644 --- a/buzz/model_loader.py +++ b/buzz/model_loader.py @@ -371,7 +371,7 @@ def __init__(self, model: TranscriptionModel): def run(self) -> None: if self.model.model_type == ModelType.WHISPER_CPP: - model_name = self.model.whisper_model_size.value.to_whisper_cpp_model_size() + model_name = self.model.whisper_model_size.to_whisper_cpp_model_size() url = huggingface_hub.hf_hub_url( repo_id="ggerganov/whisper.cpp", filename=f"ggml-{model_name}.bin",