forked from speechbrain/speechbrain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare.py
104 lines (85 loc) · 2.77 KB
/
prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
from speechbrain.dataio.dataio import read_audio
from speechbrain.utils.logger import get_logger
try:
import pandas as pd
except ImportError:
err_msg = (
"The optional dependency pandas must be installed to run this recipe.\n"
)
err_msg += "Install using `pip install pandas`.\n"
raise ImportError(err_msg)
logger = get_logger(__name__)
def prepare_FSC(data_folder, save_folder, skip_prep=False):
"""
This function prepares the Fluent Speech Commands dataset.
data_folder : path to dataset.
save_folder: folder where the manifest files will be stored.
skip_prep: If True, skip data preparation
"""
if skip_prep:
return
splits = [
"train",
"valid",
"test",
]
ID_start = 0 # needed to have a unique ID for each audio
for split in splits:
new_filename = os.path.join(save_folder, split) + ".csv"
if os.path.exists(new_filename):
continue
logger.info("Preparing %s..." % new_filename)
ID = []
duration = []
wav = []
wav_format = []
wav_opts = []
spk_id = []
spk_id_format = []
spk_id_opts = []
semantics = []
semantics_format = []
semantics_opts = []
transcript = []
transcript_format = []
transcript_opts = []
df = pd.read_csv(os.path.join(data_folder, "data", split) + "_data.csv")
for i in range(len(df)):
ID.append(ID_start + i)
signal = read_audio(os.path.join(data_folder, df.path[i]))
duration.append(signal.shape[0] / 16000)
wav.append(os.path.join(data_folder, df.path[i]))
wav_format.append("wav")
wav_opts.append(None)
spk_id.append(df.speakerId[i])
spk_id_format.append("string")
spk_id_opts.append(None)
transcript_ = df.transcription[i]
transcript.append(transcript_)
transcript_format.append("string")
transcript_opts.append(None)
semantics_ = (
'{"action:" "'
+ df.action[i]
+ '"| "object": "'
+ df.object[i]
+ '"| "location": "'
+ df.location[i]
+ '"}'
)
semantics.append(semantics_)
semantics_format.append("string")
semantics_opts.append(None)
new_df = pd.DataFrame(
{
"ID": ID,
"duration": duration,
"wav": wav,
"spk_id": spk_id,
"semantics": semantics,
"transcript": transcript,
}
)
new_df.to_csv(new_filename, index=False)
ID_start += len(df)