From eed2d38f4bd05eece12edc5ddd9a7ebb7042ec71 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 19 Jan 2024 10:35:57 +0100 Subject: [PATCH] Update speech recognition example --- .../run_speech_recognition_ctc.py | 28 +++++++++++-- .../run_speech_recognition_ctc.txt | 39 ++++++++++--------- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 911a2e9c4f..c97b5d97be 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -139,6 +139,20 @@ class ModelArguments: ctc_loss_reduction: Optional[str] = field( default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} ) + ctc_zero_infinity: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly" + " occur when the inputs are too short to be aligned to the targets." + }, + ) + add_adapter: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very " + "useful to downsample the output length." + }, + ) @dataclass @@ -315,11 +329,14 @@ class DataCollatorCTCWithPadding: padding: Union[bool, str] = "longest" pad_to_multiple_of: Optional[int] = None pad_to_multiple_of_labels: Optional[int] = None + feature_extractor_input_name: Optional[str] = "input_values" def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: # split inputs and labels since they have to be of different lengths and need # different padding methods - input_features = [{"input_values": feature["input_values"]} for feature in features] + input_features = [ + {self.feature_extractor_input_name: feature[self.feature_extractor_input_name]} for feature in features + ] label_features = [{"input_ids": feature["labels"]} for feature in features] batch = self.processor.pad( @@ -612,9 +629,11 @@ def remove_special_characters(batch): "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, + "ctc_zero_infinity": model_args.ctc_zero_infinity, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, + "add_adapter": model_args.add_adapter, } ) @@ -653,6 +672,7 @@ def remove_special_characters(batch): min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers + feature_extractor_input_name = feature_extractor.model_input_names[0] # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language @@ -664,8 +684,9 @@ def prepare_dataset(batch): sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) - batch["input_values"] = inputs.input_values[0] - batch["input_length"] = len(batch["input_values"]) + batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0] + # take length of raw audio waveform + batch["input_length"] = len(sample["array"].squeeze()) # encode targets additional_kwargs = {} @@ -748,6 +769,7 @@ def compute_metrics(pred): # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding( processor=processor, + feature_extractor_input_name=feature_extractor_input_name, pad_to_multiple_of=int(max_input_length), pad_to_multiple_of_labels=500, ) diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 689b9a2691..ed35e25426 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -30,33 +30,37 @@ > > require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") > -141d147 +145c152 +< "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very" +--- +> "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very " +155d161 < -251c257 +265c271 < "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" --- > "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " -390c396 +407c413 < parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) --- > parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -433a440,445 +450a457,462 > gaudi_config = GaudiConfig.from_pretrained( > training_args.gaudi_config_name, > cache_dir=model_args.cache_dir, > use_auth_token=True if data_args.use_auth_token else None, > ) > -434a447 +451a464 > mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -436,437c449,451 +453,454c466,468 < f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " < f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" --- > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -450,456c464,469 +467,473c481,486 < if training_args.do_train: < raw_datasets["train"] = load_dataset( < data_args.dataset_name, @@ -71,7 +75,7 @@ > split=data_args.train_split_name, > token=data_args.token, > ) -458,463c471,476 +475,480c488,493 < if data_args.audio_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." @@ -85,7 +89,7 @@ > " Make sure to set `--audio_column_name` to the correct audio column - one of" > f" {', '.join(raw_datasets['train'].column_names)}." > ) -465,470c478,483 +482,487c495,500 < if data_args.text_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " @@ -99,33 +103,32 @@ > "Make sure to set `--text_column_name` to the correct text column - one of " > f"{', '.join(raw_datasets['train'].column_names)}." > ) -472,473c485,486 +489,490c502,503 < if data_args.max_train_samples is not None: < raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) --- > if data_args.max_train_samples is not None: > raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) -491c504 +508c521 < f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None --- > f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None -628a642,646 +647a661,665 > raise RuntimeError( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done. The Datasets library does not" > " support it on HPUs yet." > ) -731c749,753 -< data_collator = DataCollatorCTCWithPadding(processor=processor) +753c771,774 +< processor=processor, feature_extractor_input_name=feature_extractor_input_name --- -> data_collator = DataCollatorCTCWithPadding( > processor=processor, +> feature_extractor_input_name=feature_extractor_input_name, > pad_to_multiple_of=int(max_input_length), > pad_to_multiple_of_labels=500, -> ) -734c756 +757c778 < trainer = Trainer( --- > trainer = GaudiTrainer( -735a758 +758a780 > gaudi_config=gaudi_config,