lhotse-speech · pzelasko · Jan 24, 2023 · Jan 23, 2023 · Jan 23, 2023 · Jan 24, 2023
diff --git a/lhotse/workflows/whisper.py b/lhotse/workflows/whisper.py
@@ -17,10 +17,11 @@
 
 def annotate_with_whisper(
     manifest: Union[RecordingSet, CutSet],
-    language: Optional[str] = None,
     model_name: str = "base",
     device: str = "cpu",
     force_nonoverlapping: bool = False,
+    download_root: Optional[str] = None,
+    **decode_options,
 ) -> Generator[MonoCut, None, None]:
     """
     Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
@@ -48,29 +49,30 @@ def annotate_with_whisper(
 
     if isinstance(manifest, RecordingSet):
         yield from _annotate_recordings(
-            manifest, language, model_name, device, force_nonoverlapping
+            manifest, model_name, device, force_nonoverlapping, download_root, **decode_options
         )
     elif isinstance(manifest, CutSet):
         yield from _annotate_cuts(
-            manifest, language, model_name, device, force_nonoverlapping
+            manifest, model_name, device, force_nonoverlapping, download_root, **decode_options
         )
     else:
         raise ValueError("The ``manifest`` must be either a RecordingSet or a CutSet.")
 
 
 def _annotate_recordings(
     recordings: RecordingSet,
-    language: str,
     model_name: str,
     device: str,
     force_nonoverlapping: bool,
+    download_root: Optional[str] = None,
+    **decode_options,
 ):
     """
     Helper function that annotates a RecordingSet with Whisper.
     """
     import whisper
 
-    model = whisper.load_model(model_name, device=device)
+    model = whisper.load_model(model_name, device=device, download_root=download_root)
 
     for recording in recordings:
         if recording.num_channels > 1:
@@ -80,7 +82,7 @@ def _annotate_recordings(
             )
             continue
         audio = torch.from_numpy(recording.resample(16000).load_audio()).squeeze(0)
-        result = whisper.transcribe(model=model, audio=audio, language=language)
+        result = whisper.transcribe(model=model, audio=audio, **decode_options)
         # Create supervisions from segments while filtering out those with negative duration.
         supervisions = [
             SupervisionSegment(
@@ -113,17 +115,18 @@ def _annotate_recordings(
 
 def _annotate_cuts(
     cuts: CutSet,
-    language: str,
     model_name: str,
     device: str,
     force_nonoverlapping: bool,
+    download_root: Optional[str] = None,
+    **decode_options,
 ):
     """
     Helper function that annotates a CutSet with Whisper.
     """
     import whisper
 
-    model = whisper.load_model(model_name, device=device)
+    model = whisper.load_model(model_name, device=device, download_root=download_root)
 
     for cut in cuts:
         if cut.num_channels > 1:
@@ -133,7 +136,7 @@ def _annotate_cuts(
             )
             continue
         audio = torch.from_numpy(cut.resample(16000).load_audio()).squeeze(0)
-        result = whisper.transcribe(model=model, audio=audio, language=language)
+        result = whisper.transcribe(model=model, audio=audio, **decode_options)
         # Create supervisions from segments while filtering out those with negative duration.
         supervisions = [
             SupervisionSegment(