Skip to content

Commit

Permalink
remove duplicate definition of manifest read and write func. (#6088)
Browse files Browse the repository at this point in the history
* unify duplicate definitions of manifest read and write func.
* extend support of both Path and str.
* update ones in tutorials.
---------
Signed-off-by: Xuesong Yang <[email protected]>
  • Loading branch information
XuesongYang committed Mar 11, 2023
1 parent c83970c commit 0ea115b
Show file tree
Hide file tree
Showing 14 changed files with 36 additions and 174 deletions.
18 changes: 3 additions & 15 deletions docs/source/asr/examples/kinyarwanda_asr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ It will write the resampled .wav-files to the specified directory and save a new
Data Preprocessing
******************

Before we start training the model on the above manifest files, we need to preprocess the text data. Data pre-processing is done to reduce ambiguity in transcrits. This is an essential step, and often requires moderate expertise in the language.
Before we start training the model on the above manifest files, we need to preprocess the text data. Data pre-processing is done to reduce ambiguity in transcripts. This is an essential step, and often requires moderate expertise in the language.

We used the following script
**prepare_dataset_kinyarwanda.py**:
Expand All @@ -200,28 +200,16 @@ We used the following script
import os
import re
from collections import defaultdict
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
from tqdm.auto import tqdm
def read_manifest(path):
    """Read a JSON-lines manifest file into a list of entries.

    Args:
        path: Path to the manifest file; each line holds one JSON document.

    Returns:
        list: The decoded JSON value of every line, in file order.
    """
    # Decode explicitly as UTF-8: without it the platform's default encoding
    # is used, which can mis-decode non-ASCII transcripts.
    with open(path, 'r', encoding='utf-8') as f:
        # json.loads tolerates the trailing newline, so each line is parsed as-is.
        return [json.loads(line) for line in tqdm(f, desc="Reading manifest data")]
def write_processed_manifest(data, original_path):
original_manifest_name = os.path.basename(original_path)
new_manifest_name = original_manifest_name.replace(".json", "_processed.json")
manifest_dir = os.path.split(original_path)[0]
filepath = os.path.join(manifest_dir, new_manifest_name)
with open(filepath, 'w') as f:
for datum in tqdm(data, desc="Writing manifest data"):
datum = json.dumps(datum)
f.write(f"{datum}\n")
write_manifest(filepath, data)
print(f"Finished writing manifest: {filepath}")
return filepath
Expand Down
12 changes: 1 addition & 11 deletions examples/asr/experimental/sclite/speech_to_text_sclite.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import torch

from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
from nemo.utils import logging

try:
Expand Down Expand Up @@ -74,17 +75,6 @@ def score_with_sctk(sctk_dir, ref_fname, hyp_fname, out_dir, glm=""):
_ = subprocess.check_output(f"{sclite_path} -h {hypglm} -r {refglm} -i wsj -o all", shell=True)


def read_manifest(manifest_path):
    """Load every entry of a JSON-lines manifest.

    Args:
        manifest_path: Path to a UTF-8 encoded manifest with one JSON
            document per line.

    Returns:
        list: Parsed entries in file order.
    """
    with open(manifest_path, 'r', encoding='utf-8') as manifest_file:
        entries = [json.loads(raw_line) for raw_line in manifest_file]

    logging.info('Loaded manifest data')
    return entries


can_gpu = torch.cuda.is_available()


Expand Down
18 changes: 10 additions & 8 deletions nemo/collections/asr/parts/utils/manifest_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import os
from collections import Counter
from collections import OrderedDict as od
from typing import Dict, List
from pathlib import Path
from typing import Dict, List, Union

import librosa
import numpy as np
Expand Down Expand Up @@ -362,16 +363,16 @@ def create_manifest(
write_file(manifest_filepath, lines, range(len(lines)))


def read_manifest(manifest: str) -> List[dict]:
def read_manifest(manifest: Union[Path, str]) -> List[dict]:
"""
Read manifest file
Args:
manifest (str): Path to manifest file
manifest (str or Path): Path to manifest file
Returns:
data (list): List of JSON items
"""
manifest = DataStoreObject(manifest)
manifest = DataStoreObject(str(manifest))

data = []
try:
Expand All @@ -385,17 +386,18 @@ def read_manifest(manifest: str) -> List[dict]:
return data


def write_manifest(output_path: str, target_manifest: List[dict]):
def write_manifest(output_path: Union[Path, str], target_manifest: List[dict], ensure_ascii: bool = True):
"""
Write to manifest file
Args:
output_path (str): Path to output manifest file
output_path (str or Path): Path to output manifest file
target_manifest (list): List of manifest file entries
ensure_ascii (bool): default is True, meaning the output is guaranteed to have all incoming non-ASCII characters escaped. If ensure_ascii is false, these characters will be output as-is.
"""
with open(output_path, "w") as outfile:
with open(output_path, "w", encoding="utf-8") as outfile:
for tgt in target_manifest:
json.dump(tgt, outfile)
json.dump(tgt, outfile, ensure_ascii=ensure_ascii)
outfile.write('\n')


Expand Down
18 changes: 0 additions & 18 deletions nemo/collections/tts/parts/utils/tts_dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
# limitations under the License.

import functools
import json
import os
from pathlib import Path
from typing import List

import numpy as np
import torch
Expand All @@ -25,22 +23,6 @@
from torch.special import gammaln


def read_manifest(manifest_path: Path) -> List[dict]:
    """Parse the manifest at the given path into a list of dictionary entries.

    Each line of the UTF-8 encoded file is decoded as one JSON document.
    """
    entries = []
    with open(manifest_path, "r", encoding="utf-8") as manifest_f:
        for line in manifest_f:
            entries.append(json.loads(line))
    return entries


def write_manifest(manifest_path: Path, entries: List[dict]) -> None:
    """Write the given entries as a manifest file, one JSON document per line.

    Non-ASCII characters are emitted as-is (not escaped) and the file is
    written as UTF-8.
    """
    # Serialize everything before opening the file, so the target is not
    # touched if an entry cannot be converted to JSON.
    output_lines = []
    for entry in entries:
        output_lines.append(json.dumps(entry, ensure_ascii=False) + "\n")

    with open(manifest_path, "w", encoding="utf-8") as output_f:
        output_f.write("".join(output_lines))


def get_sup_data_file_path(entry: dict, base_audio_path: Path, sup_data_path: Path) -> Path:
audio_path = Path(entry["audio_filepath"])
rel_audio_path = audio_path.relative_to(base_audio_path).with_suffix("")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,9 @@
from joblib import Parallel, delayed
from tqdm import tqdm

from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
from nemo.collections.tts.parts.preprocessing.audio_trimming import AudioTrimmer
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
get_base_dir,
normalize_volume,
read_manifest,
write_manifest,
)
from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, normalize_volume
from nemo.core.config import hydra_runner
from nemo.utils import logging

Expand Down Expand Up @@ -119,8 +115,8 @@ def main(cfg):
config = instantiate(cfg.config)
logging.info(f"Running audio preprocessing with config: {config}")

input_manifest_path = Path(config.input_manifest)
output_manifest_path = Path(config.output_manifest)
input_manifest_path = config.input_manifest
output_manifest_path = config.output_manifest
output_dir = Path(config.output_dir)
num_workers = config.num_workers
max_entries = config.max_entries
Expand Down Expand Up @@ -173,9 +169,9 @@ def main(cfg):
output_durations += output_duration
output_entries.append(output_entry)

write_manifest(manifest_path=output_manifest_path, entries=output_entries)
write_manifest(output_path=output_manifest_path, target_manifest=output_entries, ensure_ascii=False)
if filter_file:
write_manifest(manifest_path=filter_file, entries=filtered_entries)
write_manifest(output_path=str(filter_file), target_manifest=filtered_entries, ensure_ascii=False)

logging.info(f"Duration of original audio: {original_durations / 3600} hours")
logging.info(f"Duration of processed audio: {output_durations / 3600} hours")
Expand Down
3 changes: 2 additions & 1 deletion scripts/dataset_processing/tts/compute_speaker_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
import torch
from tqdm import tqdm

from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, get_sup_data_file_path, read_manifest
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, get_sup_data_file_path
from nemo.collections.tts.torch.tts_data_types import Pitch
from nemo.utils import logging

Expand Down
4 changes: 2 additions & 2 deletions scripts/dataset_processing/tts/resynthesize_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@
from omegaconf import DictConfig, OmegaConf
from tqdm import tqdm

from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models.base import SpectrogramGenerator
from nemo.collections.tts.parts.utils.helpers import process_batch, to_device_recursive
from nemo.collections.tts.parts.utils.tts_dataset_utils import read_manifest, write_manifest


def chunks(iterable: Iterable, size: int) -> Iterator[List]:
Expand Down Expand Up @@ -198,7 +198,7 @@ def prepare_paired_mel_spectrograms(
}
output_manifest.append(new_manifest_entry)

write_manifest(output_json_manifest, output_manifest)
write_manifest(output_json_manifest, output_manifest, ensure_ascii=False)


def argument_parser() -> argparse.ArgumentParser:
Expand Down
10 changes: 1 addition & 9 deletions scripts/speaker_tasks/filelist_to_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import sox
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.contrib.concurrent import process_map
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest

random.seed(42)

Expand Down Expand Up @@ -145,15 +146,6 @@ def read_file(filelist, id=-1):
return json_lines


def read_manifest(manifest):
    """Return the entries of a JSON-lines manifest as a list.

    Args:
        manifest: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: One decoded item per line, in file order.
    """
    with open(manifest, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]


def get_duration(json_line):
dur = json_line['duration']
if dur is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from joblib import Parallel, delayed
from scipy.io import wavfile
from tqdm import tqdm
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest

parser = argparse.ArgumentParser(description='Create synthetic code-switching data audio data from monolingual data')
parser.add_argument("--manifest_path", default=None, type=str, help='Path to CS indermediate manifest', required=True)
Expand Down Expand Up @@ -64,23 +65,6 @@
args = parser.parse_args()


def read_manifest(manifest_path: str):
    """
    Read a manifest file with one JSON document per line.

    Args:
        manifest_path: absolute path of the manifest file

    Returns:
        List with manifest entries as elements
    """
    # The context manager closes the handle even if a line fails to parse;
    # explicit UTF-8 avoids locale-dependent decoding of non-ASCII text.
    with open(manifest_path, 'r', encoding='utf-8') as manifest_file:
        return [json.loads(line) for line in manifest_file]


def split_list(input_list: list, num_splits: int):
"""
Args:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
# limitations under the License.

import argparse
import json
import logging
import os
import random
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest

# Checks -
# (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
Expand All @@ -43,39 +43,6 @@
args = parser.parse_args()


def read_manifest(manifest_path: str):
    """
    Load a manifest file in which every line is a single JSON document.

    Args:
        manifest_path: absolute path of the manifest file

    Returns:
        List with manifest entries as elements
    """
    data = []

    # Open via `with` so the file is always closed, and read as UTF-8 so
    # the result does not depend on the platform's default encoding.
    with open(manifest_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            data.append(json.loads(line))

    return data


def write_manifest(manifest_path: str, data: list):
    """
    Save manifest entries to disk, one JSON document per line.

    Args:
        manifest_path: absolute path for where to save the manifest file
        data: list consisting of entries for the manifest
    """
    with open(manifest_path, 'w') as outfile:
        # Entries are serialized lazily, one at a time, as they are written.
        outfile.writelines(json.dumps(entry) + '\n' for entry in data)


def create_cs_manifest(
data_lang_0: list,
data_lang_1: list,
Expand Down
17 changes: 3 additions & 14 deletions tutorials/asr/ASR_CTC_Language_Finetuning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -268,28 +268,17 @@
"source": [
"# Manifest Utils\n",
"from tqdm.auto import tqdm\n",
"from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest\n",
"import json\n",
"\n",
"def read_manifest(path):\n",
" manifest = []\n",
" with open(path, 'r') as f:\n",
" for line in tqdm(f, desc=\"Reading manifest data\"):\n",
" line = line.replace(\"\\n\", \"\")\n",
" data = json.loads(line)\n",
" manifest.append(data)\n",
" return manifest\n",
"\n",
"\n",
"def write_processed_manifest(data, original_path):\n",
" original_manifest_name = os.path.basename(original_path)\n",
" new_manifest_name = original_manifest_name.replace(\".json\", \"_processed.json\")\n",
"\n",
" manifest_dir = os.path.split(original_path)[0]\n",
" filepath = os.path.join(manifest_dir, new_manifest_name)\n",
" with open(filepath, 'w') as f:\n",
" for datum in tqdm(data, desc=\"Writing manifest data\"):\n",
" datum = json.dumps(datum)\n",
" f.write(f\"{datum}\\n\")\n",
" write_manifest(filepath, data)\n",
" print(f\"Finished writing manifest: {filepath}\")\n",
" return filepath"
],
Expand Down Expand Up @@ -2164,4 +2153,4 @@
]
}
]
}
}
Loading

0 comments on commit 0ea115b

Please sign in to comment.