Skip to content

Commit

Permalink
Add recipe for ICASSP2024 ICMC-ASR Grand Challenge (lhotse-speech#1172)
Browse files Browse the repository at this point in the history
* Add ICMC-ASR corpus

* Fix isort

---------

Co-authored-by: yfy62 <[email protected]>
  • Loading branch information
2 people authored and flyingleafe committed Oct 11, 2023
1 parent 8d5c2d4 commit 07f0af0
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_hifitts`
* - HI-MIA (including HI-MIA-CW)
- :func:`lhotse.recipes.prepare_himia`
* - ICMC-ASR
- :func:`lhotse.recipes.prepare_icmcasr`
* - ICSI
- :func:`lhotse.recipes.prepare_icsi`
* - IWSLT22_Ta
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from .heroico import *
from .hifitts import *
from .himia import *
from .icmcasr import *
from .icsi import *
from .iwslt22_ta import *
from .kespeech import *
Expand Down
30 changes: 30 additions & 0 deletions lhotse/bin/modes/recipes/icmcasr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Dict, List, Optional, Tuple, Union

import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.icmcasr import prepare_icmcasr
from lhotse.utils import Pathlike


# Registers ``icmcasr`` as a sub-command of ``prepare``; the two positional
# arguments and the ``-j`` option map one-to-one onto the function parameters.
@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def icmcasr(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
):
    """ICMC-ASR data preparation."""
    # NOTE: click shows the docstring above as the command's help text.
    # Thin wrapper: every argument is forwarded unchanged to the recipe.
    prepare_icmcasr(corpus_dir, output_dir=output_dir, num_jobs=num_jobs)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from .heroico import download_heroico, prepare_heroico
from .hifitts import download_hifitts, prepare_hifitts
from .himia import download_himia, prepare_himia
from .icmcasr import prepare_icmcasr
from .icsi import download_icsi, prepare_icsi
from .iwslt22_ta import prepare_iwslt22_ta
from .kespeech import prepare_kespeech
Expand Down
171 changes: 171 additions & 0 deletions lhotse/recipes/icmcasr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
"""
The ICMC-ASR Grand Challenge dataset is collected in a hybrid electric vehicle with speakers sitting in different positions, including the driver seat and passenger seats. The total number of speakers is over 160 and all of them are native Chinese speakers speaking Mandarin without strong accents. To comprehensively capture speech signals of the entire cockpit, two types of recording devices are used: far-field and near-field recording devices. 8 distributed microphones are placed at four seats in the car, which are the driver's seat (DS01C01, DX01C01), the passenger seat (DS02C01, DX02C01), the rear right seat (DS03C01, DX03C01) and the rear left seat (DS04C01, DX04C01). Additionally, 2 linear microphone arrays, each consisting of 2 microphones, are placed on the display screen (DL01C01, DL02C02) and at the center of the inner sunroof (DL02C01, DL02C02), respectively. All 12 channels of far-field data are time-synchronized and included in the released dataset as far-field data. For transcription purposes, each speaker wears a high-fidelity headphone to record near-field audio, denoted by the seat where the speaker is situated. Specifically, DA01, DA02, DA03, and DA04 represent the driver seat, passenger seat, rear right seat and rear left seat, respectively. The near-field data only have single-channel audio recordings. Additionally, a sizable real noise dataset is provided, following the recording setup of the far-field data but without speakers talking, to facilitate research of in-car scenario data simulation technology.
Participants can obtain the datasets at https://icmcasr.org - please download the datasets manually.
"""

import logging
import os
from collections import defaultdict
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests, validate_recordings_and_supervisions
from lhotse.recipes.utils import manifests_exist
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

# Dataset parts handled by this recipe; each name is looked up as a
# sub-directory of the corpus root.
ICMCASR = ("train",) # TODO: Support all subsets when released
# Near-field (headset) channel names, one per seat: driver, passenger,
# rear right and rear left (see the module docstring). Each is used as the
# stem of a "<name>.wav" / "<name>.TextGrid" file pair.
POSITION = ("DA01", "DA02", "DA03", "DA04")


def _parse_utterance(
    corpus_dir: Pathlike,
    section_path: Pathlike,
) -> Tuple[List[Recording], List[SupervisionSegment]]:
    """
    Build the recordings and supervisions of one session directory.

    For every near-field position in ``POSITION`` that has a
    ``<position>.TextGrid`` file in ``section_path``, one Recording is created
    from the matching ``<position>.wav`` and one SupervisionSegment is created
    per TextGrid interval with non-empty text.

    :param corpus_dir: Pathlike, the corpus root; recording IDs are the path of
        the utterance relative to it, with path separators replaced by "-".
    :param section_path: Pathlike, the directory holding the per-position files.
    :return: a tuple ``(recordings, segments)`` of lists; both are empty when
        the directory contains no TextGrid file.
    """
    corpus_dir = Path(corpus_dir)
    section_path = Path(section_path)

    recordings = []
    segments = []
    for position in POSITION:
        text_path = (section_path / (position + ".TextGrid")).resolve()
        if not text_path.is_file():
            continue

        audio_path = (section_path / (position + ".wav")).resolve()

        # Derive the ID from path components rather than from a hard-coded "/"
        # so that it is the same on every platform.
        utterance_path = section_path / position
        try:
            utterance_path = utterance_path.relative_to(corpus_dir)
        except ValueError:
            # Not located under corpus_dir: fall back to the full path.
            pass
        recording_id = utterance_path.as_posix().replace("/", "-")

        recording = Recording.from_file(path=audio_path, recording_id=recording_id)
        recordings.append(recording)
        recording_duration = float(recording.duration)

        # The transcripts are Chinese text; do not depend on the locale's
        # default encoding. NOTE(review): assumes the TextGrids are UTF-8.
        with open(text_path, encoding="utf-8") as f:
            datalines = f.read().splitlines()

        # Parser state is reset for every file so that nothing leaks from the
        # previously parsed position.
        speaker = None
        start = None
        end = None
        seq = 0
        for dataline in datalines:
            # Lines of interest look like `key = value`. The key is compared
            # exactly, so a transcript that merely contains "name" or "xmin ="
            # is not mistaken for a name / boundary line.
            key, sep, value = dataline.partition("=")
            if not sep:
                continue
            key = key.strip()
            value = value.strip()

            if key == "name":
                speaker = value.split('"')[1].strip()
            elif key == "xmin":
                start = float(value)
            elif key == "xmax":
                end = float(value)
            elif key == "text":
                text = value.split('"')[1].strip()
                if not text:
                    # Empty intervals are silence / untranscribed.
                    continue
                if start is None or end is None:
                    # Malformed file: text seen before any boundaries.
                    continue
                # Clip segments that run past the end of the audio.
                duration = min(end, recording_duration) - start
                segments.append(
                    SupervisionSegment(
                        id=recording_id + "-" + str(seq),
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=0,
                        language="Chinese",
                        speaker=speaker,
                        text=text,
                    )
                )
                seq += 1

    return recordings, segments


def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the RecordingSet and SupervisionSet given a dataset part.

    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param num_jobs: int, the number of worker threads used to parse the
        session directories.
    :return: the RecordingSet and SupervisionSet of the subset.
    """
    corpus_dir = Path(corpus_dir)
    part_path = corpus_dir / subset
    # os.listdir() returns entries in arbitrary order; sort them so that the
    # sessions are processed in the same order on every run and file system.
    sections = sorted(os.listdir(part_path))

    recordings = []
    supervisions = []
    with ThreadPoolExecutor(num_jobs) as ex:
        futures = [
            ex.submit(_parse_utterance, corpus_dir, part_path / section)
            for section in tqdm(sections, desc="Distributing tasks")
        ]

        # Results are collected in submission order, not completion order.
        for future in tqdm(futures, desc="Processing"):
            result = future.result()
            if result is None:
                continue
            section_recordings, section_segments = result
            recordings.extend(section_recordings)
            supervisions.extend(section_segments)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    # Fix manifests
    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
    validate_recordings_and_supervisions(recording_set, supervision_set)

    return recording_set, supervision_set


def prepare_icmcasr(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the Recording and Supervision manifests of the ICMC-ASR corpus.

    :param corpus_dir: Path to the ICMC-ASR dataset.
    :param output_dir: Pathlike, the path where to write the manifests;
        nothing is written when it is None.
    :param num_jobs: int, forwarded to the per-subset preparation.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'. A part for which
        ``manifests_exist`` reports existing manifests is skipped and is
        therefore absent from the returned Dict.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing ICMC-ASR...")

    write_manifests = output_dir is not None
    if write_manifests:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    for part in tqdm(ICMCASR, desc="Dataset parts"):
        logging.info(f"Processing ICMC-ASR subset: {part}")

        already_prepared = manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="icmcasr",
            suffix="jsonl.gz",
        )
        if already_prepared:
            logging.info(f"ICMC-ASR subset: {part} already prepared - skipping.")
            continue

        recordings, supervisions = _prepare_subset(part, corpus_dir, num_jobs)

        if write_manifests:
            supervisions.to_file(output_dir / f"icmcasr_supervisions_{part}.jsonl.gz")
            recordings.to_file(output_dir / f"icmcasr_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
2 changes: 1 addition & 1 deletion lhotse/recipes/librilight.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
trainable baseline models, and pretrained models that use these datasets.
It is covered in more detail at https://arxiv.org/abs/1912.07875.
It is covered in more detail at https://arxiv.org/abs/1912.07875
This data is very huge - please download manually at LIBRILIGHT_URL.
"""
Expand Down

0 comments on commit 07f0af0

Please sign in to comment.