Remove duplicate definitions of the manifest read and write functions. (#6088)

* Unify the duplicate definitions of the manifest read and write functions.
* Extend them to support both Path and str arguments.
* Update the copies used in the tutorials.
---------
Signed-off-by: Xuesong Yang <[email protected]>
NVIDIA · Mar 11, 2023 · 0ea115b · 0ea115b
1 parent c83970c
commit 0ea115b
Show file tree

Hide file tree

Showing 14 changed files with 36 additions and 174 deletions.
diff --git a/docs/source/asr/examples/kinyarwanda_asr.rst b/docs/source/asr/examples/kinyarwanda_asr.rst
@@ -189,7 +189,7 @@ It will write the resampled .wav-files to the specified directory and save a new
 Data Preprocessing
 ******************
 
-Before we start training the model on the above manifest files, we need to preprocess the text data. Data pre-processing is done to reduce ambiguity in transcrits. This is an essential step, and often requires moderate expertise in the language.
+Before we start training the model on the above manifest files, we need to preprocess the text data. Data pre-processing is done to reduce ambiguity in transcripts. This is an essential step, and often requires moderate expertise in the language.
 
 We used the following script
 **prepare_dataset_kinyarwanda.py**:
@@ -200,28 +200,16 @@ We used the following script
     import os
     import re
     from collections import defaultdict
+    from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
     from tqdm.auto import tqdm
 
-    def read_manifest(path):
-        manifest = []
-        with open(path, 'r') as f:
-            for line in tqdm(f, desc="Reading manifest data"):
-                line = line.replace("\n", "")
-                data = json.loads(line)
-                manifest.append(data)
-        return manifest
-
-
     def write_processed_manifest(data, original_path):
         original_manifest_name = os.path.basename(original_path)
         new_manifest_name = original_manifest_name.replace(".json", "_processed.json")
 
         manifest_dir = os.path.split(original_path)[0]
         filepath = os.path.join(manifest_dir, new_manifest_name)
-        with open(filepath, 'w') as f:
-            for datum in tqdm(data, desc="Writing manifest data"):
-                datum = json.dumps(datum)
-                f.write(f"{datum}\n")
+        write_manifest(filepath, data)
         print(f"Finished writing manifest: {filepath}")
         return filepath
 

diff --git a/examples/asr/experimental/sclite/speech_to_text_sclite.py b/examples/asr/experimental/sclite/speech_to_text_sclite.py
@@ -39,6 +39,7 @@
 import torch
 
 from nemo.collections.asr.models import ASRModel
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
 from nemo.utils import logging
 
 try:
@@ -74,17 +75,6 @@ def score_with_sctk(sctk_dir, ref_fname, hyp_fname, out_dir, glm=""):
     _ = subprocess.check_output(f"{sclite_path} -h {hypglm}  -r {refglm} -i wsj -o all", shell=True)
 
 
-def read_manifest(manifest_path):
-    manifest_data = []
-    with open(manifest_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            data = json.loads(line)
-            manifest_data.append(data)
-
-    logging.info('Loaded manifest data')
-    return manifest_data
-
-
 can_gpu = torch.cuda.is_available()
 
 

diff --git a/nemo/collections/asr/parts/utils/manifest_utils.py b/nemo/collections/asr/parts/utils/manifest_utils.py
@@ -16,7 +16,8 @@
 import os
 from collections import Counter
 from collections import OrderedDict as od
-from typing import Dict, List
+from pathlib import Path
+from typing import Dict, List, Union
 
 import librosa
 import numpy as np
@@ -362,16 +363,16 @@ def create_manifest(
     write_file(manifest_filepath, lines, range(len(lines)))
 
 
-def read_manifest(manifest: str) -> List[dict]:
+def read_manifest(manifest: Union[Path, str]) -> List[dict]:
     """
     Read manifest file
 
     Args:
-        manifest (str): Path to manifest file
+        manifest (str or Path): Path to manifest file
     Returns:
         data (list): List of JSON items
     """
-    manifest = DataStoreObject(manifest)
+    manifest = DataStoreObject(str(manifest))
 
     data = []
     try:
@@ -385,17 +386,18 @@ def read_manifest(manifest: str) -> List[dict]:
     return data
 
 
-def write_manifest(output_path: str, target_manifest: List[dict]):
+def write_manifest(output_path: Union[Path, str], target_manifest: List[dict], ensure_ascii: bool = True):
     """
     Write to manifest file
 
     Args:
-        output_path (str): Path to output manifest file
+        output_path (str or Path): Path to output manifest file
         target_manifest (list): List of manifest file entries
+        ensure_ascii (bool): default is True, meaning the output is guaranteed to have all incoming non-ASCII characters escaped. If ensure_ascii is false, these characters will be output as-is.
     """
-    with open(output_path, "w") as outfile:
+    with open(output_path, "w", encoding="utf-8") as outfile:
         for tgt in target_manifest:
-            json.dump(tgt, outfile)
+            json.dump(tgt, outfile, ensure_ascii=ensure_ascii)
             outfile.write('\n')
 
 

diff --git a/nemo/collections/tts/parts/utils/tts_dataset_utils.py b/nemo/collections/tts/parts/utils/tts_dataset_utils.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import functools
-import json
 import os
 from pathlib import Path
-from typing import List
 
 import numpy as np
 import torch
@@ -25,22 +23,6 @@
 from torch.special import gammaln
 
 
-def read_manifest(manifest_path: Path) -> List[dict]:
-    """Read manifest file at the given path and convert it to a list of dictionary entries.
-    """
-    with open(manifest_path, "r", encoding="utf-8") as manifest_f:
-        entries = [json.loads(line) for line in manifest_f]
-    return entries
-
-
-def write_manifest(manifest_path: Path, entries: List[dict]) -> None:
-    """Convert input entries to JSON format and write them as a manifest at the given path.
-    """
-    output_lines = [f"{json.dumps(entry, ensure_ascii=False)}\n" for entry in entries]
-    with open(manifest_path, "w", encoding="utf-8") as output_f:
-        output_f.writelines(output_lines)
-
-
 def get_sup_data_file_path(entry: dict, base_audio_path: Path, sup_data_path: Path) -> Path:
     audio_path = Path(entry["audio_filepath"])
     rel_audio_path = audio_path.relative_to(base_audio_path).with_suffix("")

diff --git a/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py b/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py
@@ -42,13 +42,9 @@
 from joblib import Parallel, delayed
 from tqdm import tqdm
 
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
 from nemo.collections.tts.parts.preprocessing.audio_trimming import AudioTrimmer
-from nemo.collections.tts.parts.utils.tts_dataset_utils import (
-    get_base_dir,
-    normalize_volume,
-    read_manifest,
-    write_manifest,
-)
+from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, normalize_volume
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
 
@@ -119,8 +115,8 @@ def main(cfg):
     config = instantiate(cfg.config)
     logging.info(f"Running audio preprocessing with config: {config}")
 
-    input_manifest_path = Path(config.input_manifest)
-    output_manifest_path = Path(config.output_manifest)
+    input_manifest_path = config.input_manifest
+    output_manifest_path = config.output_manifest
     output_dir = Path(config.output_dir)
     num_workers = config.num_workers
     max_entries = config.max_entries
@@ -173,9 +169,9 @@ def main(cfg):
         output_durations += output_duration
         output_entries.append(output_entry)
 
-    write_manifest(manifest_path=output_manifest_path, entries=output_entries)
+    write_manifest(output_path=output_manifest_path, target_manifest=output_entries, ensure_ascii=False)
     if filter_file:
-        write_manifest(manifest_path=filter_file, entries=filtered_entries)
+        write_manifest(output_path=str(filter_file), target_manifest=filtered_entries, ensure_ascii=False)
 
     logging.info(f"Duration of original audio: {original_durations / 3600} hours")
     logging.info(f"Duration of processed audio: {output_durations / 3600} hours")

diff --git a/scripts/dataset_processing/tts/compute_speaker_stats.py b/scripts/dataset_processing/tts/compute_speaker_stats.py
@@ -34,7 +34,8 @@
 import torch
 from tqdm import tqdm
 
-from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, get_sup_data_file_path, read_manifest
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
+from nemo.collections.tts.parts.utils.tts_dataset_utils import get_base_dir, get_sup_data_file_path
 from nemo.collections.tts.torch.tts_data_types import Pitch
 from nemo.utils import logging
 

diff --git a/scripts/dataset_processing/tts/resynthesize_dataset.py b/scripts/dataset_processing/tts/resynthesize_dataset.py
@@ -63,10 +63,10 @@
 from omegaconf import DictConfig, OmegaConf
 from tqdm import tqdm
 
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
 from nemo.collections.tts.models import FastPitchModel
 from nemo.collections.tts.models.base import SpectrogramGenerator
 from nemo.collections.tts.parts.utils.helpers import process_batch, to_device_recursive
-from nemo.collections.tts.parts.utils.tts_dataset_utils import read_manifest, write_manifest
 
 
 def chunks(iterable: Iterable, size: int) -> Iterator[List]:
@@ -198,7 +198,7 @@ def prepare_paired_mel_spectrograms(
             }
             output_manifest.append(new_manifest_entry)
 
-    write_manifest(output_json_manifest, output_manifest)
+    write_manifest(output_json_manifest, output_manifest, ensure_ascii=False)
 
 
 def argument_parser() -> argparse.ArgumentParser:

diff --git a/scripts/speaker_tasks/filelist_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py
@@ -43,6 +43,7 @@
 import sox
 from sklearn.model_selection import StratifiedShuffleSplit
 from tqdm.contrib.concurrent import process_map
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
 
 random.seed(42)
 
@@ -145,15 +146,6 @@ def read_file(filelist, id=-1):
     return json_lines
 
 
-def read_manifest(manifest):
-    data = []
-    with open(manifest, 'r', encoding='utf-8') as f:
-        for line in f:
-            item = json.loads(line)
-            data.append(item)
-    return data
-
-
 def get_duration(json_line):
     dur = json_line['duration']
     if dur is None:

diff --git a/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py b/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py
@@ -22,6 +22,7 @@
 from joblib import Parallel, delayed
 from scipy.io import wavfile
 from tqdm import tqdm
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
 
 parser = argparse.ArgumentParser(description='Create synthetic code-switching data audio data from monolingual data')
 parser.add_argument("--manifest_path", default=None, type=str, help='Path to CS indermediate manifest', required=True)
@@ -64,23 +65,6 @@
 args = parser.parse_args()
 
 
-def read_manifest(manifest_path: str):
-    """
-    Args:
-        manifest_path: absolute path of the manifest file
-
-    Returns:
-        List with manifest entires as elements
-
-    """
-    data = []
-
-    for line in open(manifest_path, 'r'):
-        data.append(json.loads(line))
-
-    return data
-
-
 def split_list(input_list: list, num_splits: int):
     """
     Args:

diff --git a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 import argparse
-import json
 import logging
 import os
 import random
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
 
 # Checks -
 # (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
@@ -43,39 +43,6 @@
 args = parser.parse_args()
 
 
-def read_manifest(manifest_path: str):
-    """
-    Args:
-        manifest_path: absolute path of the manifest file
-
-    Returns:
-        List with manifest entires as elements
-
-    """
-    data = []
-
-    for line in open(manifest_path, 'r'):
-        data.append(json.loads(line))
-
-    return data
-
-
-def write_manifest(manifest_path: str, data: list):
-    """
-    Args:
-        manifest_path: absolute path for where to save the manifest file
-        data: list consisting of entries for the manifest
-
-    Returns:
-
-    """
-
-    with open(manifest_path, 'w') as outfile:
-        for elem in data:
-            s = json.dumps(elem)
-            outfile.write(s + '\n')
-
-
 def create_cs_manifest(
     data_lang_0: list,
     data_lang_1: list,

diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb
@@ -268,28 +268,17 @@
       "source": [
         "# Manifest Utils\n",
         "from tqdm.auto import tqdm\n",
+        "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest\n",
         "import json\n",
         "\n",
-        "def read_manifest(path):\n",
-        "    manifest = []\n",
-        "    with open(path, 'r') as f:\n",
-        "        for line in tqdm(f, desc=\"Reading manifest data\"):\n",
-        "            line = line.replace(\"\\n\", \"\")\n",
-        "            data = json.loads(line)\n",
-        "            manifest.append(data)\n",
-        "    return manifest\n",
-        "\n",
         "\n",
         "def write_processed_manifest(data, original_path):\n",
         "    original_manifest_name = os.path.basename(original_path)\n",
         "    new_manifest_name = original_manifest_name.replace(\".json\", \"_processed.json\")\n",
         "\n",
         "    manifest_dir = os.path.split(original_path)[0]\n",
         "    filepath = os.path.join(manifest_dir, new_manifest_name)\n",
-        "    with open(filepath, 'w') as f:\n",
-        "        for datum in tqdm(data, desc=\"Writing manifest data\"):\n",
-        "            datum = json.dumps(datum)\n",
-        "            f.write(f\"{datum}\\n\")\n",
+        "    write_manifest(filepath, data)\n",
         "    print(f\"Finished writing manifest: {filepath}\")\n",
         "    return filepath"
       ],
@@ -2164,4 +2153,4 @@
       ]
     }
   ]
-}
+}