From 95dec59d3f4c85c16f5945e9fac5b6a1d36e452a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 5 Apr 2022 18:27:17 +0200
Subject: [PATCH 1/2] [Speech2Text Doc] Fix docs

---
 docs/source/en/model_doc/speech_to_text.mdx | 25 ++++-----------------
 utils/documentation_tests.txt               |  1 +
 2 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx
index 2e86c497c057..7d1fc7626e00 100644
--- a/docs/source/en/model_doc/speech_to_text.mdx
+++ b/docs/source/en/model_doc/speech_to_text.mdx
@@ -47,23 +47,15 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> import torch
 >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
 
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
 
->>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
-...     return batch
-
-
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> ds = ds.map(map_to_array)
->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
+>>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
 
 >>> transcription = processor.batch_decode(generated_ids)
 ```
 
@@ -80,24 +72,15 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> import torch
 >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
 
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-
->>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
-...     return batch
-
-
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> ds = ds.map(map_to_array)
->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+>>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
 
 >>> generated_ids = model.generate(
-...     input_ids=inputs["input_features"],
+...     inputs["input_features"],
 ...     attention_mask=inputs["attention_mask"],
 ...     forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
 ... )
 
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index f88974ed434e..115026b2da1f 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -1,5 +1,6 @@
 docs/source/en/quicktour.mdx
 docs/source/en/task_summary.mdx
+docs/source/en/model_doc/speech_to_text.mdx
 src/transformers/generation_utils.py
 src/transformers/models/bart/modeling_bart.py
 src/transformers/models/beit/modeling_beit.py

From 42f6394ec377eee92f8a80322153a0fb23ba2b48 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 6 Apr 2022 12:13:10 +0200
Subject: [PATCH 2/2] apply ydshiehs suggestions

---
 docs/source/en/model_doc/speech_to_text.mdx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx
index 7d1fc7626e00..0a3b00b1d5dd 100644
--- a/docs/source/en/model_doc/speech_to_text.mdx
+++ b/docs/source/en/model_doc/speech_to_text.mdx
@@ -52,12 +52,14 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
 
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 
 >>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
 >>> generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
 
 >>> transcription = processor.batch_decode(generated_ids)
+>>> transcription
+['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel']
 ```
 
 - Multilingual speech translation
@@ -76,7 +78,7 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 
 >>> inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
 
 >>> generated_ids = model.generate(
@@ -86,6 +88,8 @@ be installed as follows: `apt install libsndfile1-dev`
 ... )
 
 >>> translation = processor.batch_decode(generated_ids)
+>>> translation
+[" (Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."]
 ```
 
 See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints.