diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index b826eb048ed9..37b0af0a0e12 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -603,7 +603,7 @@ def decode(
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("common_voice", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)
@@ -626,10 +626,10 @@ def decode(
         ...     }
         ...     for d in outputs.word_offsets
         ... ]
-        >>> # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer:
-        >>> # https://huggingface.co/datasets/common_voice/viewer/en/train
+        >>> # compare word offsets with audio `en_train_0/common_voice_en_19121553.mp3` online on the dataset viewer:
+        >>> # https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/en
         >>> word_offsets[:3]
-        [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.64, 'end_time': 1.9}, {'word': 'MILISANDRA', 'start_time': 2.26, 'end_time': 2.9}]
+        [{'word': 'THE', 'start_time': 0.7, 'end_time': 0.78}, {'word': 'TRICK', 'start_time': 0.88, 'end_time': 1.08}, {'word': 'APPEARS', 'start_time': 1.2, 'end_time': 1.64}]
         ```"""
         # Convert inputs to python lists
         token_ids = to_py_obj(token_ids)
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index e331da14e810..fdc06a806880 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -538,7 +538,7 @@ def decode(
         >>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("common_voice", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)
@@ -561,10 +561,10 @@ def decode(
         ...     }
         ...     for d in outputs.word_offsets
         ... ]
-        >>> # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer:
-        >>> # https://huggingface.co/datasets/common_voice/viewer/en/train
+        >>> # compare word offsets with audio `en_train_0/common_voice_en_19121553.mp3` online on the dataset viewer:
+        >>> # https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/en
         >>> word_offsets[:4]
-        [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.66, 'end_time': 1.9}, {'word': 'MILISANDRA', 'start_time': 2.26, 'end_time': 2.9}, {'word': 'LOOK', 'start_time': 3.0, 'end_time': 3.16}]
+        [{'word': 'THE', 'start_time': 0.68, 'end_time': 0.78}, {'word': 'TRACK', 'start_time': 0.88, 'end_time': 1.1}, {'word': 'APPEARS', 'start_time': 1.18, 'end_time': 1.66}, {'word': 'ON', 'start_time': 1.86, 'end_time': 1.92}]
         ```"""

         from pyctcdecode.constants import (
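
For context, the hunks above only touch the dataset name, viewer link, and expected output inside a larger doctest. A minimal end-to-end sketch of the flow those hunks slot into is shown below; it is not part of the diff. The checkpoint and dataset names are taken from the diff itself, the offset-to-seconds conversion mirrors the comprehension visible in the hunk context, and it assumes the gated Common Voice 11.0 dataset terms have been accepted on the Hub and that you are logged in (e.g. via `huggingface-cli login`).

```python
# Minimal sketch (not part of the diff) of the docstring example around the updated hunks.
import datasets
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForCTC, AutoTokenizer

model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# stream the first English sample of Common Voice 11.0 and resample it to 16 kHz
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
sample = next(iter(dataset))

# greedy CTC prediction, then decode with word offsets
input_values = feature_extractor(
    sample["audio"]["array"], sampling_rate=16_000, return_tensors="pt"
).input_values
with torch.no_grad():
    logits = model(input_values).logits[0]
pred_ids = torch.argmax(logits, dim=-1)
outputs = tokenizer.decode(pred_ids, output_word_offsets=True)

# convert model-frame offsets to seconds, as in the surrounding docstring
time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate
word_offsets = [
    {
        "word": d["word"],
        "start_time": round(d["start_offset"] * time_offset, 2),
        "end_time": round(d["end_offset"] * time_offset, 2),
    }
    for d in outputs.word_offsets
]
print(word_offsets[:3])
```

The `wav2vec2_with_lm` hunk follows the same pattern, except that `processor.decode` runs a pyctcdecode beam search over the logits instead of the greedy `argmax` step above, which is why its expected word offsets differ slightly.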