Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Return local paths to Common Voice #3664

Closed
wants to merge 19 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 54 additions & 9 deletions datasets/common_voice/common_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
""" Common Voice Dataset"""


import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition

Expand Down Expand Up @@ -657,55 +659,98 @@ def _info(self):

def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Supports two download modes:
      * streaming: ``dl_manager.download`` returns a remote URL
        (starts with ``https://``), so the archive is iterated on the fly
        with ``dl_manager.iter_archive`` and paths are archive-relative;
      * non-streaming: the archive was downloaded to local disk, so it is
        extracted and absolute local filesystem paths are used instead
        (``archive_iterator`` is ``None`` in that case).

    Args:
        dl_manager: the ``datasets`` download manager for this build.

    Returns:
        A list of ``datasets.SplitGenerator``, one per split
        (train / test / validation / other / invalidated).
    """
    archive_path = dl_manager.download(_DATA_URL.format(self.config.name))
    if archive_path.startswith("https://"):
        # Streaming mode: keep archive-relative POSIX paths and iterate
        # over the remote archive without extracting it.
        path_to_data = "/".join(["cv-corpus-6.1-2020-12-11", self.config.name])
        archive_iterator = dl_manager.iter_archive(archive_path)
    else:
        # Non-streaming mode: extract locally and build absolute paths.
        archive_path = dl_manager.extract(archive_path)
        path_to_data = os.path.join(archive_path, "cv-corpus-6.1-2020-12-11", self.config.name)
        archive_iterator = None
    path_to_clips = "/".join([path_to_data, "clips"])

    # (split name, metadata TSV file) pairs — one SplitGenerator per pair.
    split_to_tsv = [
        (datasets.Split.TRAIN, "train.tsv"),
        (datasets.Split.TEST, "test.tsv"),
        (datasets.Split.VALIDATION, "dev.tsv"),
        ("other", "other.tsv"),
        ("invalidated", "invalidated.tsv"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={
                "archive_iterator": archive_iterator,
                "filepath": "/".join([path_to_data, tsv_name]),
                "path_to_clips": path_to_clips,
            },
        )
        for split_name, tsv_name in split_to_tsv
    ]

def _generate_examples(self, archive_iterator, filepath, path_to_clips):
    """Yields examples.

    Dispatches on ``archive_iterator``: when it is not ``None`` (streaming
    mode), examples are read directly out of the archive by
    ``_generate_examples_streaming``; otherwise ``filepath`` is a local TSV
    metadata file on disk and is parsed here.

    Args:
        archive_iterator: iterator over ``(path, file_obj)`` pairs of the
            archive in streaming mode, or ``None`` in non-streaming mode.
        filepath: path of the split's TSV metadata file (archive-relative
            in streaming mode, absolute local path otherwise).
        path_to_clips: directory containing the mp3 clips, same convention
            as ``filepath``.

    Yields:
        ``(id, example_dict)`` pairs.
    """
    if archive_iterator is not None:
        # Streaming mode: delegate entirely, then stop — the local-disk
        # logic below must NOT run (filepath is not a real file here).
        yield from self._generate_examples_streaming(archive_iterator, filepath, path_to_clips)
        return

    # Non-streaming mode: read the extracted TSV from local disk.
    data_fields = list(self._info().features.keys())

    # "audio" is a derived feature, not a column of the TSV files.
    data_fields.remove("audio")
    path_idx = data_fields.index("path")

    with open(filepath, encoding="utf-8") as f:
        lines = f.readlines()
        headline = lines[0]

        column_names = headline.strip().split("\t")
        assert (
            column_names == data_fields
        ), f"The file should have {data_fields} as column names, but has {column_names}"

        for id_, line in enumerate(lines[1:]):
            field_values = line.strip().split("\t")

            # set absolute path for mp3 audio file
            field_values[path_idx] = os.path.join(path_to_clips, field_values[path_idx])

            # if data is incomplete, fill with empty values
            if len(field_values) < len(data_fields):
                field_values += (len(data_fields) - len(field_values)) * ["''"]

            result = {key: value for key, value in zip(data_fields, field_values)}

            # set audio feature
            result["audio"] = field_values[path_idx]

            yield id_, result

def _generate_examples_streaming(self, archive_iterator, filepath, path_to_clips):
"""Yields examples in streaming mode."""
data_fields = list(self._info().features.keys())

# audio is not a header of the csv files
Expand All @@ -714,7 +759,7 @@ def _generate_examples(self, files, filepath, path_to_clips):

all_field_values = {}
metadata_found = False
for path, f in files:
for path, f in archive_iterator:
if path == filepath:
metadata_found = True
lines = f.readlines()
Expand Down