Will print progress while processing with Huggingface (#870)

raivisdejus authored Aug 3, 2024
1 parent db6d8b7 commit 326ffbc

Showing 5 changed files with 163 additions and 8 deletions.
7 changes: 6 additions & 1 deletion CONTRIBUTING.md
@@ -100,7 +100,12 @@ pip install $whlFile

#### GPU Support

GPU support on Windows is possible for Buzz installed from the source code or with `pip`.
Use the instructions above to install Buzz from the source code or run `pip install buzz-captions`,
and then follow the instructions below to enable CUDA GPU support.

To enable GPU support, first ensure CUDA 12.1 is installed - https://developer.nvidia.com/cuda-12-1-0-download-archive
Other versions of CUDA 12 should also work.

Switch the torch library to its GPU version. It must match the installed CUDA version; see https://pytorch.org/get-started/locally/ .
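The exact install command depends on your platform; as a hedged example, the PyTorch selector at https://pytorch.org/get-started/locally/ suggests a command of this shape for the CUDA 12.1 wheels (verify against the selector before running):

```
pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
```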
1 change: 0 additions & 1 deletion buzz/transcriber/whisper_file_transcriber.py
@@ -90,7 +90,6 @@ def transcribe_whisper(
) -> None:
    with pipe_stderr(stderr_conn):
        if task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
            # TODO Find a way to emmit real progress
            sys.stderr.write("0%\n")
            segments = cls.transcribe_hugging_face(task)
            sys.stderr.write("100%\n")
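The `0%`/`100%` markers above, together with the intermediate percentages emitted by the new pipeline in the next file, form a simple line-oriented protocol on stderr. As a minimal illustration (not Buzz's actual reader code; the worker command is hypothetical), a parent process could consume them like this:

```
import re
import subprocess

# Hypothetical worker invocation; Buzz pipes stderr from its own transcription process.
proc = subprocess.Popen(
    ["python", "transcribe_worker.py"],
    stderr=subprocess.PIPE,
    text=True,
)

percent_line = re.compile(r"^(\d{1,3})%$")
for line in proc.stderr:
    match = percent_line.match(line.strip())
    if match:
        print(f"progress: {match.group(1)}%")  # e.g. drive a progress bar
```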
149 changes: 147 additions & 2 deletions buzz/transformers_whisper.py
@@ -1,8 +1,152 @@
import os
import sys
import numpy as np
import torch
import requests
from typing import Optional, Union
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.pipelines import AutomaticSpeechRecognitionPipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from transformers.pipelines.automatic_speech_recognition import is_torchaudio_available


class PipelineWithProgress(AutomaticSpeechRecognitionPipeline):  # pragma: no cover
    # Copy of transformers `AutomaticSpeechRecognitionPipeline.chunk_iter` method with custom progress output
    @staticmethod
    def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
        inputs_len = inputs.shape[0]
        step = chunk_len - stride_left - stride_right
        for chunk_start_idx in range(0, inputs_len, step):
            # Print progress to stderr
            progress = int((chunk_start_idx / inputs_len) * 100)
            sys.stderr.write(f"{progress}%\n")

            chunk_end_idx = chunk_start_idx + chunk_len
            chunk = inputs[chunk_start_idx:chunk_end_idx]
            processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
            if dtype is not None:
                processed = processed.to(dtype=dtype)
            _stride_left = 0 if chunk_start_idx == 0 else stride_left
            # all right strides must be full, otherwise it is the last item
            is_last = chunk_end_idx > inputs_len if stride_right > 0 else chunk_end_idx >= inputs_len
            _stride_right = 0 if is_last else stride_right

            chunk_len = chunk.shape[0]
            stride = (chunk_len, _stride_left, _stride_right)
            if chunk.shape[0] > _stride_left:
                yield {"is_last": is_last, "stride": stride, **processed}
            if is_last:
                break

    # Copy of transformers `AutomaticSpeechRecognitionPipeline.preprocess` method with call to custom `chunk_iter`
    def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
                # like http_huggingface_co.png
                inputs = requests.get(inputs).content
            else:
                with open(inputs, "rb") as f:
                    inputs = f.read()

        if isinstance(inputs, bytes):
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        stride = None
        extra = {}
        if isinstance(inputs, dict):
            stride = inputs.pop("stride", None)
            # Accepting `"array"` which is the key defined in `datasets` for
            # better integration
            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
                raise ValueError(
                    "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
                    "containing the sampling_rate associated with that array"
                )

            _inputs = inputs.pop("raw", None)
            if _inputs is None:
                # Remove path which will not be used from `datasets`.
                inputs.pop("path", None)
                _inputs = inputs.pop("array", None)
            in_sampling_rate = inputs.pop("sampling_rate")
            extra = inputs
            inputs = _inputs
            if in_sampling_rate != self.feature_extractor.sampling_rate:
                if is_torchaudio_available():
                    from torchaudio import functional as F
                else:
                    raise ImportError(
                        "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
                        "The torchaudio package can be installed through: `pip install torchaudio`."
                    )

                inputs = F.resample(
                    torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
                ).numpy()
                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
            else:
                ratio = 1
            if stride is not None:
                if stride[0] + stride[1] > inputs.shape[0]:
                    raise ValueError("Stride is too large for input")

                # Stride needs to get the chunk length here, it's going to get
                # swallowed by the `feature_extractor` later, and then batching
                # can add extra data in the inputs, so we need to keep track
                # of the original length in the stride so we can cut properly.
                stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
        if not isinstance(inputs, np.ndarray):
            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")

        if chunk_length_s:
            if stride_length_s is None:
                stride_length_s = chunk_length_s / 6

            if isinstance(stride_length_s, (int, float)):
                stride_length_s = [stride_length_s, stride_length_s]

            # XXX: Carefuly, this variable will not exist in `seq2seq` setting.
            # Currently chunking is not possible at this level for `seq2seq` so
            # it's ok.
            align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
            chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)

            if chunk_len < stride_left + stride_right:
                raise ValueError("Chunk length must be superior to stride length")

            # Will use our custom chunk_iter with progress
            for item in self.chunk_iter(
                inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
            ):
                yield item
        else:
            if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
                processed = self.feature_extractor(
                    inputs,
                    sampling_rate=self.feature_extractor.sampling_rate,
                    truncation=False,
                    padding="longest",
                    return_tensors="pt",
                )
            else:
                processed = self.feature_extractor(
                    inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
                )

            if self.torch_dtype is not None:
                processed = processed.to(dtype=self.torch_dtype)
            if stride is not None:
                if self.type == "seq2seq":
                    raise ValueError("Stride is only usable with CTC models, try removing it !")

                processed["stride"] = stride
            yield {"is_last": True, **processed, **extra}


class TransformersWhisper:
@@ -34,6 +178,7 @@ def transcribe(

        pipe = pipeline(
            "automatic-speech-recognition",
            pipeline_class=PipelineWithProgress,
            generate_kwargs={"language": language, "task": task},
            model=model,
            tokenizer=processor.tokenizer,
@@ -50,8 +195,8 @@
            start, end = chunk['timestamp']
            text = chunk['text']
            segments.append({
                "start": start,
                "end": end,
                "start": 0 if start is None else start,
                "end": 0 if end is None else end,
                "text": text,
                "translation": ""
            })
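For context, `pipeline_class` is the standard `transformers` hook for substituting a custom pipeline implementation, which is how the commit swaps in `PipelineWithProgress` above. A minimal standalone sketch of the same wiring (model name and audio path are placeholders, not values from this commit):

```
from transformers import pipeline

from buzz.transformers_whisper import PipelineWithProgress

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",          # placeholder model
    pipeline_class=PipelineWithProgress,  # the progress-printing subclass
)

# Chunking must be enabled for the custom chunk_iter (and its progress
# output on stderr) to run at all.
result = pipe("audio.wav", chunk_length_s=30, return_timestamps=True)
print(result["text"])
```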
6 changes: 6 additions & 0 deletions docs/docs/faq.md
@@ -28,3 +28,9 @@ sidebar_position: 5
Which model size to use depends on your hardware and use case. Smaller models work faster but make more mistakes; larger models are more accurate but need more powerful hardware or more time to transcribe.

When choosing among the large models, consider the following: "Large" is the first-released, oldest model; "Large-V2" is a later, updated model with better accuracy that for some languages is considered the most robust and stable; "Large-V3" is the latest model with the best accuracy in many cases, but it can sometimes hallucinate words that were never in the audio. The only sure way to know which model best suits your needs is to test them all in your language.

4. **How to get GPU acceleration for faster transcription?**

   On Linux, GPU acceleration is supported out of the box on Nvidia GPUs with [CUDA installed](https://developer.nvidia.com/cuda-downloads).

   On Windows, see [this note](https://github.com/chidiwilliams/buzz/blob/main/CONTRIBUTING.md#gpu-support) on enabling CUDA GPU support.
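
A quick way to confirm that torch can actually see the GPU (standard PyTorch API, nothing Buzz-specific):

```
import torch

# True means transcription can run on the GPU; False means CPU fallback.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```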
8 changes: 4 additions & 4 deletions tests/transformers_whisper_test.py
@@ -5,11 +5,11 @@
from tests.audio import test_audio_path


@pytest.mark.skipif(
    platform.system() == "Darwin",
    reason="Not supported on Darwin",
)
class TestTransformersWhisper:
    @pytest.mark.skipif(
        platform.system() == "Darwin",
        reason="Not supported on Darwin",
    )
    def test_should_transcribe(self):
        model = TransformersWhisper("openai/whisper-tiny")
        result = model.transcribe(