diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
index 40ed561adda0..a21214cc47c6 100644
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@@ -133,30 +133,7 @@ For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] m
 >>> # Not using whisper, as it cannot provide timestamps.
 >>> generator = pipeline(model="facebook/wav2vec2-large-960h-lv60-self", return_timestamps="word")
 >>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
-{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP AND LIVE OUT THE TRUE MEANING OF ITS CREED',
- 'chunks': [
-   {'text': 'I', 'timestamp': (1.22, 1.24)},
-   {'text': 'HAVE', 'timestamp': (1.42, 1.58)},
-   {'text': 'A', 'timestamp': (1.66, 1.68)},
-   {'text': 'DREAM', 'timestamp': (1.76, 2.14)},
-   {'text': 'BUT', 'timestamp': (3.68, 3.8)},
-   {'text': 'ONE', 'timestamp': (3.94, 4.06)},
-   {'text': 'DAY', 'timestamp': (4.16, 4.3)},
-   {'text': 'THIS', 'timestamp': (6.36, 6.54)},
-   {'text': 'NATION', 'timestamp': (6.68, 7.1)},
-   {'text': 'WILL', 'timestamp': (7.32, 7.56)},
-   {'text': 'RISE', 'timestamp': (7.8, 8.26)},
-   {'text': 'UP', 'timestamp': (8.38, 8.48)},
-   {'text': 'AND', 'timestamp': (10.08, 10.18)},
-   {'text': 'LIVE', 'timestamp': (10.26, 10.48)},
-   {'text': 'OUT', 'timestamp': (10.58, 10.7)},
-   {'text': 'THE', 'timestamp': (10.82, 10.9)},
-   {'text': 'TRUE', 'timestamp': (10.98, 11.18)},
-   {'text': 'MEANING', 'timestamp': (11.26, 11.58)},
-   {'text': 'OF', 'timestamp': (11.66, 11.7)},
-   {'text': 'ITS', 'timestamp': (11.76, 11.88)},
-   {'text': 'CREED', 'timestamp': (12.0, 12.38)}
-]}
+{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP AND LIVE OUT THE TRUE MEANING OF ITS CREED', 'chunks': [{'text': 'I', 'timestamp': (1.22, 1.24)}, {'text': 'HAVE', 'timestamp': (1.42, 1.58)}, {'text': 'A', 'timestamp': (1.66, 1.68)}, {'text': 'DREAM', 'timestamp': (1.76, 2.14)}, {'text': 'BUT', 'timestamp': (3.68, 3.8)}, {'text': 'ONE', 'timestamp': (3.94, 4.06)}, {'text': 'DAY', 'timestamp': (4.16, 4.3)}, {'text': 'THIS', 'timestamp': (6.36, 6.54)}, {'text': 'NATION', 'timestamp': (6.68, 7.1)}, {'text': 'WILL', 'timestamp': (7.32, 7.56)}, {'text': 'RISE', 'timestamp': (7.8, 8.26)}, {'text': 'UP', 'timestamp': (8.38, 8.48)}, {'text': 'AND', 'timestamp': (10.08, 10.18)}, {'text': 'LIVE', 'timestamp': (10.26, 10.48)}, {'text': 'OUT', 'timestamp': (10.58, 10.7)}, {'text': 'THE', 'timestamp': (10.82, 10.9)}, {'text': 'TRUE', 'timestamp': (10.98, 11.18)}, {'text': 'MEANING', 'timestamp': (11.26, 11.58)}, {'text': 'OF', 'timestamp': (11.66, 11.7)}, {'text': 'ITS', 'timestamp': (11.76, 11.88)}, {'text': 'CREED', 'timestamp': (12.0, 12.38)}]}
 ```
 
 As you can see, the model inferred the text and also outputted **when** the various words were pronounced
@@ -250,9 +227,7 @@ Using a [`pipeline`] for NLP tasks is practically identical.
 ...     "I have a problem with my iphone that needs to be resolved asap!!",
 ...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
 ... )
-{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!',
- 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
- 'scores': [0.504,0.479,0.013,0.003,0.002]}
+{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
 ```
 
 ### Multimodal pipeline
@@ -269,5 +244,5 @@ For example, if you use this [invoice image](https://huggingface.co/spaces/impir
 ...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
 ...     question="What is the invoice number?",
 ... )
-[{'score': 0.635722279548645, 'answer': '1110212019', 'start': 22, 'end': 22}]
+[{'score': 0.42514941096305847, 'answer': 'us-001', 'start': 16, 'end': 16}]
 ```