This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Add Speech Recognition Task (Wav2Vec) #586

Merged
48 commits merged on Jul 19, 2021
Changes from 16 commits
Commits (48)
dd92d79 Base files for wav2vec integration (Jul 14, 2021)
2a43fe7 Format code with autopep8 (deepsource-autofix[bot], Jul 14, 2021)
6a39b34 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 14, 2021)
1b48bc1 Closer to working (Jul 14, 2021)
c87dcc2 Format code with autopep8 (deepsource-autofix[bot], Jul 14, 2021)
091da56 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 14, 2021)
2690e9c Refactors (Jul 15, 2021)
1531560 Refactors (Jul 15, 2021)
e8664d6 Cleanups (Jul 15, 2021)
6d0f1c3 Refactor to allow files (Jul 15, 2021)
a9735b2 Get predictions working (Jul 15, 2021)
0901d12 Add licence (Jul 15, 2021)
bce0e10 Merge branch 'master' into feat/speech_recognition (Jul 15, 2021)
1f18f05 Fix loads (Jul 15, 2021)
71cb06d Add check (Jul 15, 2021)
50642f5 Fix imports (Jul 15, 2021)
d271951 Cleanups (Jul 16, 2021)
956ac8e Add backbone API (Jul 16, 2021)
6b132f2 Cleanups (Jul 16, 2021)
3db4dad Fix (Jul 16, 2021)
c54acf1 Add tests (Jul 16, 2021)
62175ae Docs, requirements (Jul 16, 2021)
dc2e72c topic thing (Jul 16, 2021)
8eccdf9 Doc fix (Jul 16, 2021)
dcfa913 test (Jul 16, 2021)
e4f0a69 Add serve (Jul 16, 2021)
541c1fb Merge branch 'master' into feat/speech_recognition (Jul 16, 2021)
14795f3 Fix path (Jul 18, 2021)
1b8eb08 Swap to audio available (Jul 18, 2021)
ab3a437 Small fix (ethanwharris, Jul 19, 2021)
13eb84f Some fixes (ethanwharris, Jul 19, 2021)
af9e0c1 Small fix (ethanwharris, Jul 19, 2021)
4bbc31c Small fix (ethanwharris, Jul 19, 2021)
4336f61 Fix (ethanwharris, Jul 19, 2021)
51c640a Updates (ethanwharris, Jul 19, 2021)
801b752 Fix docs (ethanwharris, Jul 19, 2021)
683f671 Remove duplicate (Jul 19, 2021)
8590052 Add check for audio (Jul 19, 2021)
1c98625 Updates (ethanwharris, Jul 19, 2021)
a208e17 Update CHANGELOG.md (ethanwharris, Jul 19, 2021)
d9d1a0a Updates (ethanwharris, Jul 19, 2021)
9259f44 Update docs (ethanwharris, Jul 19, 2021)
70607a2 Update docs (ethanwharris, Jul 19, 2021)
4e6bce7 Update docs (ethanwharris, Jul 19, 2021)
2d08f21 Add example to CI (ethanwharris, Jul 19, 2021)
0052f1f Fix some tests (ethanwharris, Jul 19, 2021)
0c87f04 Fix some broken tests (ethanwharris, Jul 19, 2021)
bfe8ea6 Fixes (ethanwharris, Jul 19, 2021)
1 change: 1 addition & 0 deletions flash/audio/__init__.py
@@ -0,0 +1 @@
from flash.audio.speech_recognition import SpeechRecognition, SpeechRecognitionData # noqa: F401
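To make the newly exported API concrete, here is a hedged usage sketch in the style of the flash_examples scripts. The from_json constructor, its input_fields/target_fields arguments, the TIMIT manifest paths, and the facebook/wav2vec2-base-960h backbone name are assumptions for illustration, not something fixed by this one-line diff.

import flash
from flash.audio import SpeechRecognition, SpeechRecognitionData

# Assumed JSON manifests with a "file" path and a "text" transcript per sample.
datamodule = SpeechRecognitionData.from_json(
    input_fields="file",
    target_fields="text",
    train_file="data/timit/train.json",
    test_file="data/timit/test.json",
)

# Wav2Vec 2.0 backbone from the Hugging Face Hub (checkpoint name assumed).
model = SpeechRecognition(backbone="facebook/wav2vec2-base-960h")

trainer = flash.Trainer(max_epochs=1)
trainer.finetune(model, datamodule=datamodule, strategy="no_freeze")

# Transcribe raw audio files.
print(model.predict(["data/timit/example.wav"]))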
15 changes: 15 additions & 0 deletions flash/audio/speech_recognition/__init__.py
@@ -0,0 +1,15 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from flash.audio.speech_recognition.data import SpeechRecognitionData # noqa: F401
from flash.audio.speech_recognition.model import SpeechRecognition # noqa: F401
90 changes: 90 additions & 0 deletions flash/audio/speech_recognition/collate.py
@@ -0,0 +1,90 @@
# Copyright 2020 The PyTorch Lightning team and The HuggingFace Team. All rights reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import torch

from flash.core.utilities.imports import _SPEECH_RECOGNITION_AVAILABLE

if _SPEECH_RECOGNITION_AVAILABLE:
    from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
"""
Data collator that will dynamically pad the inputs received.
Args:
processor (:class:`~transformers.Wav2Vec2Processor`)
The processor used for proccessing the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`,
`optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
max_length_labels (:obj:`int`, `optional`):
Maximum length of the ``labels`` returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        label_features = [{"input_ids": feature.get("labels")} for feature in features]
        # check to ensure labels exist to collate
        labels_exist = not any(x["input_ids"] is None for x in label_features)
        if labels_exist:
            with self.processor.as_target_processor():
                labels_batch = self.processor.pad(
                    label_features,
                    padding=self.padding,
                    max_length=self.max_length_labels,
                    pad_to_multiple_of=self.pad_to_multiple_of_labels,
                    return_tensors="pt",
                )

            # replace padding with -100 to ignore loss correctly
            labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

            batch["labels"] = labels

        return batch
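As a reference for reviewers, a minimal sketch of how DataCollatorCTCWithPadding could be exercised on its own: two waveforms of different lengths are padded to a common length, and label padding positions are replaced with -100 so the CTC loss ignores them. The checkpoint name and the dummy waveforms/label ids are assumptions for illustration.

import torch
from transformers import Wav2Vec2Processor

from flash.audio.speech_recognition.collate import DataCollatorCTCWithPadding

# Checkpoint name assumed for illustration.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Two raw waveforms of different lengths, each with dummy token-id labels.
features = [
    {"input_values": torch.randn(16000).tolist(), "labels": [7, 4, 11, 11, 14]},
    {"input_values": torch.randn(12000).tolist(), "labels": [22, 14, 17, 11, 3]},
]

batch = collator(features)
print(batch["input_values"].shape)  # both inputs padded to the longest waveform
print(batch["labels"])  # label padding replaced with -100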