diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx
index 2e86c497c057..0a3b00b1d5dd 100644
--- a/docs/source/en/model_doc/speech_to_text.mdx
+++ b/docs/source/en/model_doc/speech_to_text.mdx
@@ -47,25 +47,19 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> import torch
 >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
 
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
 
->>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
-...     return batch
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 
-
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> ds = ds.map(map_to_array)
-
->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
+>>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
 
 >>> transcription = processor.batch_decode(generated_ids)
+>>> transcription
+['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel']
 ```
 
 - Multilingual speech translation
@@ -80,29 +74,22 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> import torch
 >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
 
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 
->>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
-...     return batch
-
-
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> ds = ds.map(map_to_array)
-
->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
 >>> generated_ids = model.generate(
-...     input_ids=inputs["input_features"],
+...     inputs["input_features"],
 ...     attention_mask=inputs["attention_mask"],
 ...     forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
 ... )
 
 >>> translation = processor.batch_decode(generated_ids)
+>>> translation
+["<lang:fr> (Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."]
 ```
 
 See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints.
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index f88974ed434e..115026b2da1f 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -1,5 +1,6 @@
 docs/source/en/quicktour.mdx
 docs/source/en/task_summary.mdx
+docs/source/en/model_doc/speech_to_text.mdx
 src/transformers/generation_utils.py
 src/transformers/models/bart/modeling_bart.py
 src/transformers/models/beit/modeling_beit.py