huggingface · anton-l · Feb 1, 2022 · Feb 8, 2022 · Feb 8, 2022 · Feb 9, 2022
diff --git a/datasets/air_dialogue/air_dialogue.py b/datasets/air_dialogue/air_dialogue.py
@@ -218,7 +218,7 @@ def _generate_examples(self, filepath, files):
         # The key is not important, it's more here for legacy reason (legacy from tfds)
 
         for path, f in files:
-            if path == filepath:
+            if path.endswith(filepath):
                 for id_, row in enumerate(f):
                     row = row.decode("utf-8")
                     data = json.loads(row)

diff --git a/datasets/amazon_polarity/amazon_polarity.py b/datasets/amazon_polarity/amazon_polarity.py
@@ -114,7 +114,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, filepath, files):
         """Yields examples."""
         for path, f in files:
-            if path == filepath:
+            if path.endswith(filepath):
                 lines = (line.decode("utf-8") for line in f)
                 data = csv.reader(lines, delimiter=",", quoting=csv.QUOTE_ALL)
                 for id_, row in enumerate(data):

diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """ Common Voice Dataset"""
 
+import os
 
 import datasets
 from datasets.tasks import AutomaticSpeechRecognition
@@ -723,7 +724,7 @@ def _generate_examples(self, files, filepath, path_to_clips):
         all_field_values = {}
         metadata_found = False
         for path, f in files:
-            if path == filepath:
+            if path.endswith(filepath):
                 metadata_found = True
                 lines = f.readlines()
                 headline = lines[0].decode("utf-8")
@@ -737,12 +738,13 @@ def _generate_examples(self, files, filepath, path_to_clips):
                     # set full path for mp3 audio file
                     audio_path = "/".join([path_to_clips, field_values[path_idx]])
                     all_field_values[audio_path] = field_values
-            elif path.startswith(path_to_clips):
+            elif path.rsplit("/", 1)[0].endswith(path_to_clips):
                 assert metadata_found, "Found audio clips before the metadata TSV file."
                 if not all_field_values:
                     break
-                if path in all_field_values:
-                    field_values = all_field_values[path]
+                path_within_archive = path_to_clips + "/" + path.split("/")[-1]
+                if path_within_archive in all_field_values:
+                    field_values = all_field_values[path_within_archive]
 
                     # if data is incomplete, fill with empty values
                     if len(field_values) < len(data_fields):
@@ -751,6 +753,7 @@ def _generate_examples(self, files, filepath, path_to_clips):
                     result = {key: value for key, value in zip(data_fields, field_values)}
 
                     # set audio feature
-                    result["audio"] = {"path": path, "bytes": f.read()}
+                    result["path"] = path
+                    result["audio"] = {"path": path, "bytes": f.read() if not os.path.isfile(path) else None}
 
                     yield path, result
diff --git a/datasets/id_nergrit_corpus/id_nergrit_corpus.py b/datasets/id_nergrit_corpus/id_nergrit_corpus.py
@@ -208,7 +208,7 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, filepath, split, files):
         for path, f in files:
-            if path == filepath:
+            if path.endswith(filepath):
                 guid = 0
                 tokens = []
                 ner_tags = []

diff --git a/datasets/imdb/imdb.py b/datasets/imdb/imdb.py
@@ -100,12 +100,14 @@ def _generate_examples(self, files, split, labeled=True):
         if labeled:
             label_mapping = {"pos": 1, "neg": 0}
             for path, f in files:
-                if path.startswith(f"aclImdb/{split}"):
-                    label = label_mapping.get(path.split("/")[2])
+                # if path.startswith(f"aclImdb/{split}"):
+                if path.split("/")[-4:-2] == ["aclImdb", split]:
+                    label = label_mapping.get(path.split("/")[-2])
                     if label is not None:
                         yield path, {"text": f.read().decode("utf-8"), "label": label}
         else:
             for path, f in files:
-                if path.startswith(f"aclImdb/{split}"):
-                    if path.split("/")[2] == "unsup":
+                # if path.startswith(f"aclImdb/{split}"):
+                if path.split("/")[-4:-2] == ["aclImdb", split]:
+                    if path.split("/")[-2] == "unsup":
                         yield path, {"text": f.read().decode("utf-8"), "label": -1}
diff --git a/datasets/klue/klue.py b/datasets/klue/klue.py
@@ -395,7 +395,7 @@ def _split_generators(self, dl_manager):
     def _generate_examples(self, data_file, files):
         if self.config.name in ["ynat", "sts", "re"]:
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = json.load(f)
                     for id_, row in enumerate(f):
                         features = {key: row[key] for key in row if key in self.config.features}
@@ -404,7 +404,7 @@ def _generate_examples(self, data_file, files):
 
         if self.config.name == "nli":
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = json.load(f)
                     for id_, row in enumerate(f):
                         # In train file, "source" is written as "genre"
@@ -420,7 +420,7 @@ def _generate_examples(self, data_file, files):
 
         if self.config.name == "ner":
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = (line.decode("utf-8") for line in f)
                     reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
                     for _ in range(5):  # skip headers
@@ -442,7 +442,7 @@ def _generate_examples(self, data_file, files):
 
         if self.config.name == "dp":
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = (line.decode("utf-8") for line in f)
                     reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
                     for _ in range(5):  # skip headers
@@ -481,7 +481,7 @@ def _generate_examples(self, data_file, files):
 
         if self.config.name == "mrc":
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = json.load(f)
                     id_ = -1
                     for example in f["data"]:
@@ -520,7 +520,7 @@ def _generate_examples(self, data_file, files):
 
         if self.config.name == "wos":
             for path, f in files:
-                if path == data_file:
+                if path.endswith(data_file):
                     f = json.load(f)
                     for id_, row in enumerate(f):
                         guid = row["guid"]

diff --git a/datasets/lama/lama.py b/datasets/lama/lama.py
@@ -224,7 +224,8 @@ def _generate_examples(self, filepaths, files, relations_path=None):
             id_ = -1
             inside_trec_directory = False
             for path, f in files:
-                if any(fnmatch(path, pattern) for pattern in filepaths):
+                path_inside_archive = "/".join(path.split("/")[-2:])
+                if any(fnmatch(path_inside_archive, pattern) for pattern in filepaths):
                     inside_trec_directory = True
                     for row in f:
                         data = json.loads(row)
@@ -254,7 +255,8 @@ def _generate_examples(self, filepaths, files, relations_path=None):
             for path, f in files:
                 if not filepaths:
                     break
-                if path in list(filepaths):
+                path_within_archive = "/".join(path.split("/")[-2:])
+                if path_within_archive in list(filepaths):
                     for row in f:
                         data = json.loads(row)
                         if data.get("negated") is not None:
@@ -281,13 +283,14 @@ def _generate_examples(self, filepaths, files, relations_path=None):
                                     "masked_sentence": str(masked_sentence),
                                     "negated": str(""),
                                 }
-                    filepaths.remove(path)
+                    filepaths.remove(path_within_archive)
         elif self.config.name == "squad":
             id_ = -1
             for path, f in files:
                 if not filepaths:
                     break
-                if path in filepaths:
+                path_within_archive = "/".join(path.split("/")[-2:])
+                if path_within_archive in filepaths:
                     for row in f:
                         data = json.loads(row)
                         for masked_sentence in data["masked_sentences"]:
@@ -299,52 +302,52 @@ def _generate_examples(self, filepaths, files, relations_path=None):
                                 "negated": str(data.get("negated", "")),
                                 "masked_sentence": str(masked_sentence),
                             }
-                    filepaths.remove(path)
+                    filepaths.remove(path_within_archive)
         elif self.config.name == "google_re":
             id_ = -1
             for path, f in files:
-                if path in filepaths:
-                    if not filepaths:
-                        break
-                    if path in filepaths:
-                        # from https://github.com/facebookresearch/LAMA/blob/master/scripts/run_experiments.py
-                        if "place_of_birth" in path:
-                            pred = {
-                                "relation": "place_of_birth",
-                                "template": "[X] was born in [Y] .",
-                                "template_negated": "[X] was not born in [Y] .",
-                            }
-                        elif "date_of_birth" in path:
-                            pred = {
-                                "relation": "date_of_birth",
-                                "template": "[X] (born [Y]).",
-                                "template_negated": "[X] (not born [Y]).",
-                            }
-                        else:
-                            pred = {
-                                "relation": "place_of_death",
-                                "template": "[X] died in [Y] .",
-                                "template_negated": "[X] did not die in [Y] .",
+                path_within_archive = "/".join(path.split("/")[-2:])
+                if not filepaths:
+                    break
+                if path_within_archive in filepaths:
+                    # from https://github.com/facebookresearch/LAMA/blob/master/scripts/run_experiments.py
+                    if "place_of_birth" in path:
+                        pred = {
+                            "relation": "place_of_birth",
+                            "template": "[X] was born in [Y] .",
+                            "template_negated": "[X] was not born in [Y] .",
+                        }
+                    elif "date_of_birth" in path:
+                        pred = {
+                            "relation": "date_of_birth",
+                            "template": "[X] (born [Y]).",
+                            "template_negated": "[X] (not born [Y]).",
+                        }
+                    else:
+                        pred = {
+                            "relation": "place_of_death",
+                            "template": "[X] died in [Y] .",
+                            "template_negated": "[X] did not die in [Y] .",
+                        }
+                    for row in f:
+                        data = json.loads(row)
+                        for masked_sentence in data["masked_sentences"]:
+                            id_ += 1
+                            yield id_, {
+                                "pred": str(data["pred"]),
+                                "sub": str(data["sub"]),
+                                "obj": str(data["obj"]),
+                                "evidences": str(data["evidences"]),
+                                "judgments": str(data["judgments"]),
+                                "sub_w": str(data["sub_w"]),
+                                "sub_label": str(data["sub_label"]),
+                                "sub_aliases": str(data["sub_aliases"]),
+                                "obj_w": str(data["obj_w"]),
+                                "obj_label": str(data["obj_label"]),
+                                "obj_aliases": str(data["obj_aliases"]),
+                                "uuid": str(data["uuid"]),
+                                "masked_sentence": str(masked_sentence),
+                                "template": str(pred["template"]),
+                                "template_negated": str(pred["template_negated"]),
                             }
-                        for row in f:
-                            data = json.loads(row)
-                            for masked_sentence in data["masked_sentences"]:
-                                id_ += 1
-                                yield id_, {
-                                    "pred": str(data["pred"]),
-                                    "sub": str(data["sub"]),
-                                    "obj": str(data["obj"]),
-                                    "evidences": str(data["evidences"]),
-                                    "judgments": str(data["judgments"]),
-                                    "sub_w": str(data["sub_w"]),
-                                    "sub_label": str(data["sub_label"]),
-                                    "sub_aliases": str(data["sub_aliases"]),
-                                    "obj_w": str(data["obj_w"]),
-                                    "obj_label": str(data["obj_label"]),
-                                    "obj_aliases": str(data["obj_aliases"]),
-                                    "uuid": str(data["uuid"]),
-                                    "masked_sentence": str(masked_sentence),
-                                    "template": str(pred["template"]),
-                                    "template_negated": str(pred["template_negated"]),
-                                }
-                        filepaths.remove(path)
+                    filepaths.remove(path_within_archive)
diff --git a/datasets/lex_glue/lex_glue.py b/datasets/lex_glue/lex_glue.py
@@ -650,7 +650,7 @@ def _generate_examples(self, filepath, split, files):
             else:
                 SPLIT_RANGES = {"train": (1, 45001), "dev": (45001, 48901), "test": (48901, 52501)}
             for path, f in files:
-                if path == filepath:
+                if path.endswith(filepath):
                     f = (line.decode("utf-8") for line in f)
                     for id_, row in enumerate(list(csv.reader(f))[SPLIT_RANGES[split][0] : SPLIT_RANGES[split][1]]):
                         yield id_, {
@@ -661,7 +661,7 @@ def _generate_examples(self, filepath, split, files):
                     break
         elif self.config.multi_label:
             for path, f in files:
-                if path == filepath:
+                if path.endswith(filepath):
                     for id_, row in enumerate(f):
                         data = json.loads(row.decode("utf-8"))
                         labels = sorted(
@@ -675,7 +675,7 @@ def _generate_examples(self, filepath, split, files):
                     break
         else:
             for path, f in files:
-                if path == filepath:
+                if path.endswith(filepath):
                     for id_, row in enumerate(f):
                         data = json.loads(row.decode("utf-8"))
                         if data["data_type"] == split:

diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
@@ -613,14 +613,15 @@ def _generate_examples(self, path_to_indexs, path_to_datas, archive_files):
                         if len(field_values) != 2:
                             continue
                         filename, sentence = field_values
-                        # set absolute path for audio file
+                        # set absolute path for audio file
                         path = f"{path_to_data}/{filename}.wav"
                         sentences[path] = sentence
                 for path, f in files:
-                    if path.startswith(path_to_data):
+                    if path.rsplit("/", 1)[0].endswith(path_to_data):
                         counter += 1
+                        path_within_archive = path_to_data + "/" + path.split("/")[-1]
                         audio = {"path": path, "bytes": f.read()}
-                        yield counter, {"path": path, "audio": audio, "sentence": sentences[path]}
+                        yield counter, {"path": path, "audio": audio, "sentence": sentences[path_within_archive]}
         else:
             for i, path_to_index in enumerate(path_to_indexs):
                 with open(path_to_index, encoding="utf-8") as f:

diff --git a/datasets/speech_commands/speech_commands.py b/datasets/speech_commands/speech_commands.py
@@ -207,7 +207,7 @@ def _generate_examples(self, archive):
             if not path.endswith(".wav"):
                 continue
 
-            word, audio_filename = path.split("/")
+            word, audio_filename = path.rsplit("/", 2)[-2:]
             is_unknown = False
 
             if word == SILENCE:

diff --git a/datasets/vivos/vivos.py b/datasets/vivos/vivos.py
@@ -127,12 +127,17 @@ def _generate_examples(self, prompts_path, path_to_clips, audio_files):
                 }
         inside_clips_dir = False
         id_ = 0
-        for path, f in audio_files:
-            if path.startswith(path_to_clips):
+        for path, f in audio_files:  # path - full local path
+            root_dir, speaker_id, audio_filename = path.rsplit("/", 2)
+            if root_dir.endswith(path_to_clips):
                 inside_clips_dir = True
-                if path in examples:
+                # audio_path_within_archive looks like 'vivos/train/waves/VIVOSSPK01/VIVOSSPK01_R001.wav'
+                audio_path_within_archive = path_to_clips + "/" + speaker_id + "/" + audio_filename
+                if audio_path_within_archive in examples:
                     audio = {"path": path, "bytes": f.read()}
-                    yield id_, {**examples[path], "audio": audio}
+                    result = {**examples[audio_path_within_archive], "audio": audio}
+                    result["path"] = path
+                    yield id_, result
                     id_ += 1
             elif inside_clips_dir:
                 break