Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Return local paths to Common Voice #3664

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/air_dialogue/air_dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def _generate_examples(self, filepath, files):
# The key is not important; it is kept mainly for legacy reasons (inherited from tfds)

for path, f in files:
if path == filepath:
if path.endswith(filepath):
for id_, row in enumerate(f):
row = row.decode("utf-8")
data = json.loads(row)
Expand Down
2 changes: 1 addition & 1 deletion datasets/amazon_polarity/amazon_polarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def _split_generators(self, dl_manager):
def _generate_examples(self, filepath, files):
"""Yields examples."""
for path, f in files:
if path == filepath:
if path.endswith(filepath):
lines = (line.decode("utf-8") for line in f)
data = csv.reader(lines, delimiter=",", quoting=csv.QUOTE_ALL)
for id_, row in enumerate(data):
Expand Down
13 changes: 8 additions & 5 deletions datasets/common_voice/common_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
""" Common Voice Dataset"""

import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition
Expand Down Expand Up @@ -723,7 +724,7 @@ def _generate_examples(self, files, filepath, path_to_clips):
all_field_values = {}
metadata_found = False
for path, f in files:
if path == filepath:
if path.endswith(filepath):
metadata_found = True
lines = f.readlines()
headline = lines[0].decode("utf-8")
Expand All @@ -737,12 +738,13 @@ def _generate_examples(self, files, filepath, path_to_clips):
# set full path for mp3 audio file
audio_path = "/".join([path_to_clips, field_values[path_idx]])
all_field_values[audio_path] = field_values
elif path.startswith(path_to_clips):
elif path.rsplit("/", 1)[0].endswith(path_to_clips):
assert metadata_found, "Found audio clips before the metadata TSV file."
if not all_field_values:
break
if path in all_field_values:
field_values = all_field_values[path]
path_within_archive = path_to_clips + "/" + path.split("/")[-1]
if path_within_archive in all_field_values:
field_values = all_field_values[path_within_archive]

# if data is incomplete, fill with empty values
if len(field_values) < len(data_fields):
Expand All @@ -751,6 +753,7 @@ def _generate_examples(self, files, filepath, path_to_clips):
result = {key: value for key, value in zip(data_fields, field_values)}

# set audio feature
result["audio"] = {"path": path, "bytes": f.read()}
result["path"] = path
result["audio"] = {"path": path, "bytes": f.read() if not os.path.isfile(path) else None}

yield path, result
2 changes: 1 addition & 1 deletion datasets/id_nergrit_corpus/id_nergrit_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def _split_generators(self, dl_manager):

def _generate_examples(self, filepath, split, files):
for path, f in files:
if path == filepath:
if path.endswith(filepath):
guid = 0
tokens = []
ner_tags = []
Expand Down
10 changes: 6 additions & 4 deletions datasets/imdb/imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ def _generate_examples(self, files, split, labeled=True):
if labeled:
label_mapping = {"pos": 1, "neg": 0}
for path, f in files:
if path.startswith(f"aclImdb/{split}"):
label = label_mapping.get(path.split("/")[2])
# if path.startswith(f"aclImdb/{split}"):
if path.split("/")[-4:-2] == ["aclImdb", split]:
label = label_mapping.get(path.split("/")[-2])
if label is not None:
yield path, {"text": f.read().decode("utf-8"), "label": label}
else:
for path, f in files:
if path.startswith(f"aclImdb/{split}"):
if path.split("/")[2] == "unsup":
# if path.startswith(f"aclImdb/{split}"):
if path.split("/")[-4:-2] == ["aclImdb", split]:
if path.split("/")[-2] == "unsup":
yield path, {"text": f.read().decode("utf-8"), "label": -1}
12 changes: 6 additions & 6 deletions datasets/klue/klue.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def _split_generators(self, dl_manager):
def _generate_examples(self, data_file, files):
if self.config.name in ["ynat", "sts", "re"]:
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = json.load(f)
for id_, row in enumerate(f):
features = {key: row[key] for key in row if key in self.config.features}
Expand All @@ -404,7 +404,7 @@ def _generate_examples(self, data_file, files):

if self.config.name == "nli":
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = json.load(f)
for id_, row in enumerate(f):
# In train file, "source" is written as "genre"
Expand All @@ -420,7 +420,7 @@ def _generate_examples(self, data_file, files):

if self.config.name == "ner":
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = (line.decode("utf-8") for line in f)
reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
for _ in range(5): # skip headers
Expand All @@ -442,7 +442,7 @@ def _generate_examples(self, data_file, files):

if self.config.name == "dp":
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = (line.decode("utf-8") for line in f)
reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
for _ in range(5): # skip headers
Expand Down Expand Up @@ -481,7 +481,7 @@ def _generate_examples(self, data_file, files):

if self.config.name == "mrc":
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = json.load(f)
id_ = -1
for example in f["data"]:
Expand Down Expand Up @@ -520,7 +520,7 @@ def _generate_examples(self, data_file, files):

if self.config.name == "wos":
for path, f in files:
if path == data_file:
if path.endswith(data_file):
f = json.load(f)
for id_, row in enumerate(f):
guid = row["guid"]
Expand Down
101 changes: 52 additions & 49 deletions datasets/lama/lama.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,8 @@ def _generate_examples(self, filepaths, files, relations_path=None):
id_ = -1
inside_trec_directory = False
for path, f in files:
if any(fnmatch(path, pattern) for pattern in filepaths):
path_inside_archive = "/".join(path.split("/")[-2:])
if any(fnmatch(path_inside_archive, pattern) for pattern in filepaths):
inside_trec_directory = True
for row in f:
data = json.loads(row)
Expand Down Expand Up @@ -254,7 +255,8 @@ def _generate_examples(self, filepaths, files, relations_path=None):
for path, f in files:
if not filepaths:
break
if path in list(filepaths):
path_within_archive = "/".join(path.split("/")[-2:])
if path_within_archive in list(filepaths):
for row in f:
data = json.loads(row)
if data.get("negated") is not None:
Expand All @@ -281,13 +283,14 @@ def _generate_examples(self, filepaths, files, relations_path=None):
"masked_sentence": str(masked_sentence),
"negated": str(""),
}
filepaths.remove(path)
filepaths.remove(path_within_archive)
elif self.config.name == "squad":
id_ = -1
for path, f in files:
if not filepaths:
break
if path in filepaths:
path_within_archive = "/".join(path.split("/")[-2:])
if path_within_archive in filepaths:
for row in f:
data = json.loads(row)
for masked_sentence in data["masked_sentences"]:
Expand All @@ -299,52 +302,52 @@ def _generate_examples(self, filepaths, files, relations_path=None):
"negated": str(data.get("negated", "")),
"masked_sentence": str(masked_sentence),
}
filepaths.remove(path)
filepaths.remove(path_within_archive)
elif self.config.name == "google_re":
id_ = -1
for path, f in files:
if path in filepaths:
if not filepaths:
break
if path in filepaths:
# from https://github.com/facebookresearch/LAMA/blob/master/scripts/run_experiments.py
if "place_of_birth" in path:
pred = {
"relation": "place_of_birth",
"template": "[X] was born in [Y] .",
"template_negated": "[X] was not born in [Y] .",
}
elif "date_of_birth" in path:
pred = {
"relation": "date_of_birth",
"template": "[X] (born [Y]).",
"template_negated": "[X] (not born [Y]).",
}
else:
pred = {
"relation": "place_of_death",
"template": "[X] died in [Y] .",
"template_negated": "[X] did not die in [Y] .",
path_within_archive = "/".join(path.split("/")[-2:])
if not filepaths:
break
if path_within_archive in filepaths:
# from https://github.com/facebookresearch/LAMA/blob/master/scripts/run_experiments.py
if "place_of_birth" in path:
pred = {
"relation": "place_of_birth",
"template": "[X] was born in [Y] .",
"template_negated": "[X] was not born in [Y] .",
}
elif "date_of_birth" in path:
pred = {
"relation": "date_of_birth",
"template": "[X] (born [Y]).",
"template_negated": "[X] (not born [Y]).",
}
else:
pred = {
"relation": "place_of_death",
"template": "[X] died in [Y] .",
"template_negated": "[X] did not die in [Y] .",
}
for row in f:
data = json.loads(row)
for masked_sentence in data["masked_sentences"]:
id_ += 1
yield id_, {
"pred": str(data["pred"]),
"sub": str(data["sub"]),
"obj": str(data["obj"]),
"evidences": str(data["evidences"]),
"judgments": str(data["judgments"]),
"sub_w": str(data["sub_w"]),
"sub_label": str(data["sub_label"]),
"sub_aliases": str(data["sub_aliases"]),
"obj_w": str(data["obj_w"]),
"obj_label": str(data["obj_label"]),
"obj_aliases": str(data["obj_aliases"]),
"uuid": str(data["uuid"]),
"masked_sentence": str(masked_sentence),
"template": str(pred["template"]),
"template_negated": str(pred["template_negated"]),
}
for row in f:
data = json.loads(row)
for masked_sentence in data["masked_sentences"]:
id_ += 1
yield id_, {
"pred": str(data["pred"]),
"sub": str(data["sub"]),
"obj": str(data["obj"]),
"evidences": str(data["evidences"]),
"judgments": str(data["judgments"]),
"sub_w": str(data["sub_w"]),
"sub_label": str(data["sub_label"]),
"sub_aliases": str(data["sub_aliases"]),
"obj_w": str(data["obj_w"]),
"obj_label": str(data["obj_label"]),
"obj_aliases": str(data["obj_aliases"]),
"uuid": str(data["uuid"]),
"masked_sentence": str(masked_sentence),
"template": str(pred["template"]),
"template_negated": str(pred["template_negated"]),
}
filepaths.remove(path)
filepaths.remove(path_within_archive)
6 changes: 3 additions & 3 deletions datasets/lex_glue/lex_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ def _generate_examples(self, filepath, split, files):
else:
SPLIT_RANGES = {"train": (1, 45001), "dev": (45001, 48901), "test": (48901, 52501)}
for path, f in files:
if path == filepath:
if path.endswith(filepath):
f = (line.decode("utf-8") for line in f)
for id_, row in enumerate(list(csv.reader(f))[SPLIT_RANGES[split][0] : SPLIT_RANGES[split][1]]):
yield id_, {
Expand All @@ -661,7 +661,7 @@ def _generate_examples(self, filepath, split, files):
break
elif self.config.multi_label:
for path, f in files:
if path == filepath:
if path.endswith(filepath):
for id_, row in enumerate(f):
data = json.loads(row.decode("utf-8"))
labels = sorted(
Expand All @@ -675,7 +675,7 @@ def _generate_examples(self, filepath, split, files):
break
else:
for path, f in files:
if path == filepath:
if path.endswith(filepath):
for id_, row in enumerate(f):
data = json.loads(row.decode("utf-8"))
if data["data_type"] == split:
Expand Down
7 changes: 4 additions & 3 deletions datasets/openslr/openslr.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,14 +613,15 @@ def _generate_examples(self, path_to_indexs, path_to_datas, archive_files):
if len(field_values) != 2:
continue
filename, sentence = field_values
# set absolute path for audio file
# set absolute path for audio file
path = f"{path_to_data}/{filename}.wav"
sentences[path] = sentence
for path, f in files:
if path.startswith(path_to_data):
if path.rsplit("/", 1)[0].endswith(path_to_data):
counter += 1
path_within_archive = path_to_data + "/" + path.split("/")[-1]
audio = {"path": path, "bytes": f.read()}
yield counter, {"path": path, "audio": audio, "sentence": sentences[path]}
yield counter, {"path": path, "audio": audio, "sentence": sentences[path_within_archive]}
else:
for i, path_to_index in enumerate(path_to_indexs):
with open(path_to_index, encoding="utf-8") as f:
Expand Down
2 changes: 1 addition & 1 deletion datasets/speech_commands/speech_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def _generate_examples(self, archive):
if not path.endswith(".wav"):
continue

word, audio_filename = path.split("/")
word, audio_filename = path.rsplit("/", 2)[-2:]
is_unknown = False

if word == SILENCE:
Expand Down
13 changes: 9 additions & 4 deletions datasets/vivos/vivos.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,17 @@ def _generate_examples(self, prompts_path, path_to_clips, audio_files):
}
inside_clips_dir = False
id_ = 0
for path, f in audio_files:
if path.startswith(path_to_clips):
for path, f in audio_files: # path - full local path
root_dir, speaker_id, audio_filename = path.rsplit("/", 2)
if root_dir.endswith(path_to_clips):
inside_clips_dir = True
if path in examples:
# audio_path_within_archive looks like 'vivos/train/waves/VIVOSSPK01/VIVOSSPK01_R001.wav'
audio_path_within_archive = path_to_clips + "/" + speaker_id + "/" + audio_filename
if audio_path_within_archive in examples:
audio = {"path": path, "bytes": f.read()}
yield id_, {**examples[path], "audio": audio}
result = {**examples[audio_path_within_archive], "audio": audio}
result["path"] = path
yield id_, result
id_ += 1
elif inside_clips_dir:
break
Loading