forked from CorentinJ/Real-Time-Voice-Cloning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcv_2_speakers.py
112 lines (92 loc) · 3.48 KB
/
cv_2_speakers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
from pathlib import Path
from tqdm import tqdm
import argparse
import csv
import codecs
import subprocess
import random
from multiprocess.pool import ThreadPool
from shutil import rmtree
# English dataset: number of speakers remaining for each --min threshold
# min=4 = 20,443 speakers
# min=5 = 18,949 speakers
# min=10 = 10,884 speakers
# min=12 = 9,621 speakers
# min=14 = 8,460 speakers
# min=20 = 6,093 speakers
# Command-line interface: the Common Voice language to process and the
# per-speaker clip-count bounds.
parser = argparse.ArgumentParser(description='Process common voice dataset for a language.')
# --lang is required: it is interpolated into the dataset path below, and an
# omitted value would otherwise silently produce ".../CommonVoice/None".
parser.add_argument('--lang', help='Language to process', type=str, required=True)
parser.add_argument('--min', help='Minimum number of files per speaker', type=int, default=5)
parser.add_argument('--max', help='Maximum number of files per speaker', type=int, default=40)
args = parser.parse_args()

# Dataset root for the selected language; source clips live in its "clips"
# subdirectory.
base_dir = Path("/datasets_slr/CommonVoice/{0}".format(args.lang))
clips_dir = base_dir.joinpath("clips")
print("Reading Validated.tsv file...")
# Maps client_id -> list of clip file names (the "path" column) for that speaker.
speaker_hash = {}
# Use the built-in open() with newline="" as the csv module asks for. The
# codecs stream reader splits lines with str.splitlines(), which also breaks on
# characters such as U+2028 or U+0085 inside a sentence and would corrupt rows.
# NOTE(review): default csv quoting is still in effect; if sentences can start
# with an unbalanced double quote, quoting=csv.QUOTE_NONE may be needed --
# confirm against the data.
with open(base_dir.joinpath("validated.tsv"), "r", encoding="utf-8", newline="") as val_in:
    tsvin = csv.DictReader(val_in, delimiter='\t')
    # Columns: client_id  path  sentence  up_votes  down_votes  age  gender  accent
    # e.g.:    <hex client_id>  common_voice_it_17544185.mp3  Il vuoto assoluto?  2  1
    for row in tsvin:
        # Group clip names by speaker, creating the list on first sight.
        speaker_hash.setdefault(row["client_id"], []).append(row["path"])
print(" - Found {} speakers...".format(len(speaker_hash)))
print("Pruning speakers with less than {} files...".format(args.min))
# Collect the ids first: entries cannot be deleted from the dict while it is
# being iterated.
speakers_to_remove = [
    speaker_id
    for speaker_id, paths in speaker_hash.items()
    if len(paths) < args.min
]
print(" - Pruning {} speakers...".format(len(speakers_to_remove)))
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the module.
for speaker_id in speakers_to_remove:
    del speaker_hash[speaker_id]
print("Reduced speaker pool to {}".format(len(speaker_hash)))
# Process the remaining speakers in sorted client_id order.
sorted_speakers = sorted(speaker_hash.keys())
# If a "speakers" output directory already exists, remove it so the output
# starts empty.
speakers_dir = base_dir.joinpath("speakers")
if speakers_dir.is_dir():
    rmtree(speakers_dir)
def process_speaker(speaker):
    """Convert one speaker's clips to 16 kHz WAV files with ffmpeg.

    Looks up the speaker's clip names in the module-level ``speaker_hash``,
    keeps at most ``args.max`` of them (a random subset when there are more),
    and converts each clip found under ``clips_dir``. Output files are written
    to ``base_dir/speakers/<first 20 characters of the client_id>/`` with the
    ``.mp3`` part of the clip name replaced by ``.wav``.

    Args:
        speaker: client_id key into ``speaker_hash``.

    Returns:
        None. Clips for which ffmpeg exits with a non-zero status are
        reported on stderr instead of being silently ignored.
    """
    import sys  # local import: only needed for failure reporting

    # Work on a copy so the shared speaker_hash entry is not reordered in place.
    speaker_paths = list(speaker_hash[speaker])
    if len(speaker_paths) > args.max:
        # Too many clips: keep a random selection of args.max of them.
        random.shuffle(speaker_paths)
        speaker_paths = speaker_paths[:args.max]
    if not speaker_paths:
        # Nothing to convert; do not create an empty output directory.
        return

    # NOTE(review): the directory name is the client_id truncated to 20
    # characters; two speakers sharing that prefix would end up in the same
    # directory -- confirm this is acceptable.
    dest_path = base_dir.joinpath("speakers", speaker[:20])
    # The destination is the same for every clip, so create it once up front.
    os.makedirs(dest_path, exist_ok=True)

    for speaker_path in speaker_paths:
        source_path = clips_dir.joinpath(speaker_path)
        new_name = speaker_path.replace(".mp3", "") + ".wav"
        dest_file = dest_path.joinpath(new_name)
        convert_args = [
            "/usr/bin/ffmpeg",
            "-y",  # overwrite an existing output file without prompting
            "-loglevel",
            "fatal",
            "-i",
            str(source_path),
            "-ar",  # output sample rate (previously "24000")
            "16000",
            str(dest_file),
        ]
        return_code = subprocess.call(convert_args)
        if return_code != 0:
            # Surface the failure so missing WAV files can be traced back.
            print(
                "ffmpeg failed (exit code {0}) for {1}".format(return_code, source_path),
                file=sys.stderr,
            )
# Convert the speakers on 8 worker threads, with a progress bar labelled by
# the language and counted in speakers.
with ThreadPool(8) as pool:
    progress = tqdm(
        pool.imap(process_speaker, sorted_speakers),
        desc=args.lang,
        total=len(sorted_speakers),
        unit="speakers",
    )
    # Drain the iterator so every speaker is processed; the per-speaker
    # results are not used.
    for _ in progress:
        pass
print("Done, thanks for playing...")