Fluent Speech Commands dataset, SLU task (#1272)

* Implement slu data prep
* Bug fixes
* Remove commented code, add to corpora list in docs
* Move pandas import to local scope

Signed-off-by: Xinyuan Li <[email protected]>
Co-authored-by: Xinyuan Li <[email protected]>
Co-authored-by: Piotr Żelasko <[email protected]>
1 parent f26ff4b · commit e3fd608
Showing 5 changed files with 146 additions and 0 deletions.
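The recipe added below reads the Fluent Speech Commands metadata CSVs and turns them into Lhotse manifests. As a quick orientation, here is a minimal sketch (not part of the commit) that inspects the columns the recipe relies on; the corpus root is a placeholder:

```python
# Sketch only: check the metadata layout that prepare_slu (in the diff below) expects.
# "/path/to/fluent_speech_commands" is a placeholder for the downloaded corpus root.
import pandas as pd

df = pd.read_csv(
    "/path/to/fluent_speech_commands/data/train_data.csv", index_col=0, header=0
)
print(df.columns.tolist())
# The recipe uses these columns: path, transcription, action, object, location.
print(df.iloc[0][["path", "transcription", "action", "object", "location"]])
```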
@@ -0,0 +1,17 @@
from typing import List, Optional, Sequence, Tuple, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.slu import prepare_slu
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path())
@click.argument("output_dir", type=click.Path())
def slu(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
):
    prepare_slu(corpus_dir=corpus_dir, output_dir=output_dir)
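Because the command is registered with `@prepare.command`, it should surface as a `lhotse prepare slu CORPUS_DIR OUTPUT_DIR` subcommand once wired into the CLI. A minimal sketch (not part of the commit) that exercises it in-process with click's test runner; the import path of the new module and the corpus location are assumptions:

```python
from click.testing import CliRunner

# Assumed module path for the new CLI file; adjust to wherever it lives in lhotse.
from lhotse.bin.modes.recipes.slu import slu

runner = CliRunner()
# Placeholder paths: point at a downloaded Fluent Speech Commands corpus.
result = runner.invoke(slu, ["/path/to/fluent_speech_commands", "manifests/slu"])
print(result.exit_code)  # 0 on success; manifests are written to manifests/slu
```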
@@ -0,0 +1,125 @@
import glob
import json
import logging
from collections import defaultdict
from itertools import groupby
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional, Tuple, Union

from tqdm import tqdm

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import AudioSource, Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, Seconds, is_module_available


def prepare_slu(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:

    import pandas

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    data = {
        "train": pandas.read_csv(
            str(corpus_dir) + "/data/train_data.csv", index_col=0, header=0
        ),
        "valid": pandas.read_csv(
            str(corpus_dir) + "/data/valid_data.csv", index_col=0, header=0
        ),
        "test": pandas.read_csv(
            str(corpus_dir) + "/data/test_data.csv", index_col=0, header=0
        ),
    }
    train_wavs = [
        str(corpus_dir) + "/" + path_to_wav
        for path_to_wav in data["train"]["path"].tolist()
    ]
    valid_wavs = [
        str(corpus_dir) + "/" + path_to_wav
        for path_to_wav in data["valid"]["path"].tolist()
    ]
    test_wavs = [
        str(corpus_dir) + "/" + path_to_wav
        for path_to_wav in data["test"]["path"].tolist()
    ]

    transcripts = {
        "train": data["train"]["transcription"].tolist(),
        "valid": data["valid"]["transcription"].tolist(),
        "test": data["test"]["transcription"].tolist(),
    }

    frames = {
        "train": list(
            i
            for i in zip(
                data["train"]["action"].tolist(),
                data["train"]["object"].tolist(),
                data["train"]["location"].tolist(),
            )
        ),
        "valid": list(
            i
            for i in zip(
                data["valid"]["action"].tolist(),
                data["valid"]["object"].tolist(),
                data["valid"]["location"].tolist(),
            )
        ),
        "test": list(
            i
            for i in zip(
                data["test"]["action"].tolist(),
                data["test"]["object"].tolist(),
                data["test"]["location"].tolist(),
            )
        ),
    }

    manifests = defaultdict(dict)
    for name, dataset in zip(
        ["train", "valid", "test"], [train_wavs, valid_wavs, test_wavs]
    ):
        recordings = []
        for wav in tqdm(dataset):
            recording = Recording.from_file(wav)
            recordings.append(recording)
        recording_set = RecordingSet.from_recordings(recordings)

        supervisions = []
        for id, recording in tqdm(enumerate(recording_set)):
            supervisions.append(
                SupervisionSegment(
                    id=id,
                    recording_id=recording.id,
                    start=0,
                    duration=recording.duration,
                    channel=0,
                    text=transcripts[name][id],
                    custom={"frames": frames[name][id]},
                )
            )
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    if output_dir is not None:
        for name in ["train", "valid", "test"]:
            manifests[name]["recordings"].to_file(
                output_dir / ("slu_recordings_" + name + ".jsonl.gz")
            )
            manifests[name]["supervisions"].to_file(
                output_dir / ("slu_supervisions_" + name + ".jsonl.gz")
            )
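As committed, `prepare_slu` writes the per-split manifests to `output_dir` but does not return the `manifests` dict despite its return annotation, so downstream code would typically reload them from disk. A minimal usage sketch (not part of the commit); the corpus and output paths are placeholders:

```python
from lhotse import CutSet, load_manifest
from lhotse.recipes.slu import prepare_slu

corpus_dir = "/path/to/fluent_speech_commands"  # placeholder corpus root
output_dir = "manifests/slu"                    # placeholder output directory

# Writes slu_{recordings,supervisions}_{train,valid,test}.jsonl.gz to output_dir.
prepare_slu(corpus_dir=corpus_dir, output_dir=output_dir)

# Reload the train split and pair recordings with supervisions as cuts.
recordings = load_manifest(f"{output_dir}/slu_recordings_train.jsonl.gz")
supervisions = load_manifest(f"{output_dir}/slu_supervisions_train.jsonl.gz")
cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)

# Each supervision carries the transcript plus its (action, object, location) frame.
cut = next(iter(cuts))
sup = cut.supervisions[0]
print(sup.text, sup.custom["frames"])
```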