Skip to content

Commit

Permalink
Air Traffic Control (ATC) corpora - various improvements (#1070)
Browse files Browse the repository at this point in the history
* safe_extract_rar unit test

* atcosim: pandas module availability check

* atcosim: apply fix to supervision durations

* uwb_atcc: sort file names before processing to ensure reproducibility

* resumable_download: add optional completed_file_size argument

Do not bother downloading the file if its size on disk matches the
completed_file_size argument. This saves an HTTP request when the file is
already downloaded or is stored on a read-only file system.

* atcosim,uwb_atcc: call resumable_download with completed file size

---------

Co-authored-by: Piotr Żelasko <[email protected]>
  • Loading branch information
rouseabout and pzelasko authored May 26, 2023
1 parent d430776 commit 3071ade
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 4 deletions.
26 changes: 24 additions & 2 deletions lhotse/recipes/atcosim.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available, resumable_download
from lhotse.utils import (
Pathlike,
Seconds,
compute_num_samples,
is_module_available,
resumable_download,
)


# note: https://www2.spsc.tugraz.at/ does not support Range request header (2023-05-10)
Expand All @@ -42,6 +48,7 @@ def download_atcosim(
resumable_download(
f"https://www2.spsc.tugraz.at/databases/ATCOSIM/.ISO/{dataset_name}.iso",
filename=iso_path,
completed_file_size=2597789696,
force_download=force_download,
)
if (
Expand Down Expand Up @@ -127,6 +134,19 @@ def text_normalize(
return text


def fix_duration(duration: Seconds, sampling_rate: int) -> Seconds:
    """
    Adjust a supervision duration so that it maps to a whole number of samples.

    A handful of supervision durations do not compute to a round number of
    samples at the original recording sampling rate, which causes problems
    later when compute_num_frames() is used. Full description:
    https://github.com/lhotse-speech/lhotse/issues/1064

    :param duration: supervision duration, in seconds.
    :param sampling_rate: sampling rate of the corresponding recording.
    :return: the number of samples that compute_num_samples() derives from
        ``duration``, converted back to seconds at ``sampling_rate``.
    """
    num_samples = compute_num_samples(duration, sampling_rate)
    return num_samples / sampling_rate


def prepare_atcosim(
corpus_dir: Pathlike,
output_dir: Optional[Pathlike] = None,
Expand All @@ -148,6 +168,8 @@ def prepare_atcosim(
:param unknown_sym: str, unknown symbol
:return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
"""
if not is_module_available("pandas"):
raise ImportError("Please 'pip install pandas' first.")
import pandas as pd

corpus_dir = Path(corpus_dir)
Expand Down Expand Up @@ -202,7 +224,7 @@ def prepare_atcosim(
id=f"atcosim_{row.filename}_{0:06d}_{length100:06d}",
recording_id=row.recording_id,
start=0.0,
duration=row.length_sec,
duration=fix_duration(row.length_sec, recording.sampling_rate),
channel=0,
language="English",
text=text,
Expand Down
3 changes: 2 additions & 1 deletion lhotse/recipes/uwb_atcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def download_uwb_atcc(
resumable_download(
f"https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0001-CCA1-0/{dataset_name}.rar",
filename=rar_path,
completed_file_size=584245376,
force_download=force_download,
)
if (
Expand Down Expand Up @@ -504,7 +505,7 @@ def prepare_uwb_atcc(
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

trs_files = list(corpus_dir.glob("*.trs"))
trs_files = sorted(corpus_dir.glob("*.trs"), key=lambda p: p.name)
assert len(trs_files) == 2657

recordings = []
Expand Down
8 changes: 7 additions & 1 deletion lhotse/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,10 @@ def during_docs_build() -> bool:


def resumable_download(
url: str, filename: Pathlike, force_download: bool = False
url: str,
filename: Pathlike,
force_download: bool = False,
completed_file_size: Optional[int] = None,
) -> None:
# Check if the file exists and get its size
if os.path.exists(filename):
Expand All @@ -457,6 +460,9 @@ def resumable_download(
)
os.unlink(filename)
file_size = os.path.getsize(filename)

if completed_file_size and file_size == completed_file_size:
return
else:
file_size = 0

Expand Down
31 changes: 31 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
overlaps,
overspans,
safe_extract,
safe_extract_rar,
streaming_shuffle,
)

Expand Down Expand Up @@ -226,6 +227,36 @@ def test_extract_unsafe_tar_file(unsafe_tar_file):
safe_extract(tar, tmpdir)


# rarfile cannot create archives, so for testing purposes we present a TarFile as a RarFile
class TarInfo2RarInfo:
    """
    Test helper that wraps a tar member and exposes its name as ``filename``.

    The wrapped object is kept on ``tarinfo``; ``filename`` mirrors
    ``tarinfo.name`` at construction time.
    """

    def __init__(self, tarinfo):
        # Keep the original member and copy its name under the new attribute.
        self.tarinfo, self.filename = tarinfo, tarinfo.name


class TarFile2RarFile:
    """
    Test helper that wraps an opened tar archive behind two methods,
    ``infolist()`` and ``extractall()``, both of which delegate to the tar.
    """

    def __init__(self, tar):
        self.tar = tar

    def infolist(self):
        """Return every tar member wrapped in a ``TarInfo2RarInfo``."""
        return list(map(TarInfo2RarInfo, self.tar.getmembers()))

    def extractall(self, path, members):
        """Forward the extraction request to the wrapped tar archive."""
        outcome = self.tar.extractall(path, members)
        return outcome


def test_extract_safe_rar_file(safe_tar_file):
    """Extracting a benign archive through safe_extract_rar creates the expected file."""
    with TemporaryDirectory() as tmpdir:
        with tarfile.open(safe_tar_file) as tar:
            safe_extract_rar(TarFile2RarFile(tar), path=tmpdir)
            # Check while the temporary directory still exists.
            extracted = Path(tmpdir) / "test/fixtures/audio.json"
            assert extracted.is_file()


def test_extract_unsafe_rar_file(unsafe_tar_file):
    """safe_extract_rar must raise when given the unsafe archive fixture."""
    with TemporaryDirectory() as tmpdir:
        with tarfile.open(unsafe_tar_file) as tar, pytest.raises(Exception):
            safe_extract_rar(TarFile2RarFile(tar), tmpdir)


@pytest.mark.parametrize(
["value", "expected"], [(2, None), ("3", 3), ("(4, 5)", (4, 5)), ("[6, 7]", [6, 7])]
)
Expand Down

0 comments on commit 3071ade

Please sign in to comment.