Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Air Traffic Control (ATC) corpora - various improvements #1070

26 changes: 24 additions & 2 deletions lhotse/recipes/atcosim.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available, resumable_download
from lhotse.utils import (
Pathlike,
Seconds,
compute_num_samples,
is_module_available,
resumable_download,
)


# note: https://www2.spsc.tugraz.at/ does not support Range request header (2023-05-10)
Expand All @@ -42,6 +48,7 @@ def download_atcosim(
resumable_download(
f"https://www2.spsc.tugraz.at/databases/ATCOSIM/.ISO/{dataset_name}.iso",
filename=iso_path,
completed_file_size=2597789696,
force_download=force_download,
)
if (
Expand Down Expand Up @@ -127,6 +134,19 @@ def text_normalize(
return text


def fix_duration(duration: Seconds, sampling_rate: int) -> Seconds:
    """
    Snap a duration onto a whole number of samples at ``sampling_rate``.

    A handful of supervision durations do not compute to a round number of
    samples at the original recording sampling rate, which causes problems
    later when using compute_num_frames(). Full description:
    https://github.com/lhotse-speech/lhotse/issues/1064

    :param duration: the supervision duration to adjust.
    :param sampling_rate: sampling rate of the recording the duration refers to.
    :return: duration that computes to a round number of samples.
    """
    # Go through the sample count first, then convert back to a duration.
    num_samples = compute_num_samples(duration, sampling_rate)
    return num_samples / sampling_rate


def prepare_atcosim(
corpus_dir: Pathlike,
output_dir: Optional[Pathlike] = None,
Expand All @@ -148,6 +168,8 @@ def prepare_atcosim(
:param unknown_sym: str, unknown symbol
:return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
"""
if not is_module_available("pandas"):
raise ImportError("Please 'pip install pandas' first.")
import pandas as pd

corpus_dir = Path(corpus_dir)
Expand Down Expand Up @@ -202,7 +224,7 @@ def prepare_atcosim(
id=f"atcosim_{row.filename}_{0:06d}_{length100:06d}",
recording_id=row.recording_id,
start=0.0,
duration=row.length_sec,
duration=fix_duration(row.length_sec, recording.sampling_rate),
channel=0,
language="English",
text=text,
Expand Down
3 changes: 2 additions & 1 deletion lhotse/recipes/uwb_atcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def download_uwb_atcc(
resumable_download(
f"https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0001-CCA1-0/{dataset_name}.rar",
filename=rar_path,
completed_file_size=584245376,
force_download=force_download,
)
if (
Expand Down Expand Up @@ -504,7 +505,7 @@ def prepare_uwb_atcc(
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

trs_files = list(corpus_dir.glob("*.trs"))
trs_files = sorted(corpus_dir.glob("*.trs"), key=lambda p: p.name)
assert len(trs_files) == 2657

recordings = []
Expand Down
8 changes: 7 additions & 1 deletion lhotse/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,10 @@ def during_docs_build() -> bool:


def resumable_download(
url: str, filename: Pathlike, force_download: bool = False
url: str,
filename: Pathlike,
force_download: bool = False,
completed_file_size: Optional[int] = None,
) -> None:
# Check if the file exists and get its size
if os.path.exists(filename):
Expand All @@ -457,6 +460,9 @@ def resumable_download(
)
os.unlink(filename)
file_size = os.path.getsize(filename)

if completed_file_size and file_size == completed_file_size:
return
else:
file_size = 0

Expand Down
31 changes: 31 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
overlaps,
overspans,
safe_extract,
safe_extract_rar,
streaming_shuffle,
)

Expand Down Expand Up @@ -226,6 +227,36 @@ def test_extract_unsafe_tar_file(unsafe_tar_file):
safe_extract(tar, tmpdir)


# rarfile has no create archive implementation, so for testing purposes, present a TarFile as a RarFile
class TarInfo2RarInfo:
    """
    Test-only adapter that presents a tar member with a RarInfo-style attribute.

    The member's ``name`` is exposed as ``filename``; the wrapped member
    itself stays reachable through ``tarinfo``.
    """

    def __init__(self, tarinfo) -> None:
        """
        :param tarinfo: the tar member to wrap; only its ``name`` attribute is read.
        """
        # Keep the original member so the wrapper can be unwrapped again.
        self.tarinfo = tarinfo
        # Mirror the tar member name under the attribute name `filename`.
        self.filename = tarinfo.name


class TarFile2RarFile:
    """
    Test-only adapter that presents an open tar archive through the
    ``infolist()`` / ``extractall()`` methods used on a RarFile.
    """

    def __init__(self, tar) -> None:
        """
        :param tar: an open tar archive; its ``getmembers()`` and
            ``extractall()`` methods are used by this adapter.
        """
        self.tar = tar

    def infolist(self):
        """
        :return: a list with one ``TarInfo2RarInfo`` wrapper per archive member.
        """
        return [TarInfo2RarInfo(m) for m in self.tar.getmembers()]

    def extractall(self, path, members=None):
        """
        Extract the archive (or a subset of it) into ``path``.

        :param path: destination directory.
        :param members: optional subset of members to extract. Entries may be
            wrappers as returned by :meth:`infolist` (anything carrying a
            ``tarinfo`` attribute) or objects the tar archive accepts directly.
            ``None`` extracts everything.
        :return: whatever the wrapped archive's ``extractall()`` returns.
        """
        if members is not None:
            # The tar archive does not know about our wrappers, so hand it
            # back the original members they were built from.
            members = [getattr(m, "tarinfo", m) for m in members]
        return self.tar.extractall(path, members)


def test_extract_safe_rar_file(safe_tar_file):
    """
    Extracting the ``safe_tar_file`` fixture through the RarFile adapter
    must produce the expected file inside the temporary directory.
    """
    with TemporaryDirectory() as tmpdir:
        with tarfile.open(safe_tar_file) as tar:
            rar_like = TarFile2RarFile(tar)
            safe_extract_rar(rar_like, path=tmpdir)
            # Checked while the temporary directory still exists.
            extracted = Path(tmpdir) / "test/fixtures/audio.json"
            assert extracted.is_file()


def test_extract_unsafe_rar_file(unsafe_tar_file):
    """
    Extracting the ``unsafe_tar_file`` fixture through the RarFile adapter
    must make ``safe_extract_rar`` raise.

    NOTE(review): the fixture presumably contains a path-traversal entry;
    confirm against the fixture definition.
    """
    with TemporaryDirectory() as tmpdir:
        with tarfile.open(unsafe_tar_file) as tar:
            # Adapter construction stays inside the `raises` context, as before.
            with pytest.raises(Exception):
                safe_extract_rar(TarFile2RarFile(tar), tmpdir)


@pytest.mark.parametrize(
["value", "expected"], [(2, None), ("3", 3), ("(4, 5)", (4, 5)), ("[6, 7]", [6, 7])]
)
Expand Down