Sharded manifests for tarred datasets #6395

Merged Apr 18, 2023 · 30 commits (changes shown from 29 commits)

Commits:
37b43f5
testing sharded manifests
bmwshop Apr 6, 2023
955bb0d
compatibility
bmwshop Apr 6, 2023
7be7860
proper fixes
bmwshop Apr 6, 2023
ce027f5
adding flag to convert_to_tarred_audio_dataset
bmwshop Apr 6, 2023
a4fd990
shard_manifests conf param
bmwshop Apr 7, 2023
eac0324
propagating the shard_manifests param
bmwshop Apr 7, 2023
fc4ccba
propagating the shard_manifests param
bmwshop Apr 7, 2023
d5f4898
distributed checks
bmwshop Apr 7, 2023
cc762e7
typo
bmwshop Apr 7, 2023
1f78a49
typo
bmwshop Apr 7, 2023
483901d
fixes
bmwshop Apr 7, 2023
ac3f5ad
fixes
bmwshop Apr 7, 2023
9dacbdd
fixes
bmwshop Apr 7, 2023
dc81d26
fixes
bmwshop Apr 7, 2023
18e8b99
fixes
bmwshop Apr 7, 2023
bd5cc3b
fixes
bmwshop Apr 7, 2023
0f572b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 7, 2023
f3cd8ff
fixes based on PR comments and tests
bmwshop Apr 8, 2023
b1aac87
fixes based on PR comments and tests
bmwshop Apr 8, 2023
ded5462
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 8, 2023
a37f794
fixes to convert_to_tarred_audio_dataset.py
bmwshop Apr 13, 2023
f788c30
reversing manifest shards flag
bmwshop Apr 13, 2023
e2ac42a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 13, 2023
4fba9fc
tests
bmwshop Apr 13, 2023
c27c57a
Merge branch 'shrdmnf' of https://github.com/nvidia/nemo into shrdmnf
bmwshop Apr 13, 2023
15a5d5a
excluding manifests from webdataset url expansion
bmwshop Apr 14, 2023
559581c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 14, 2023
b712738
expand manifest paths before attempting to cache from datastore
bmwshop Apr 14, 2023
55a3ace
Merge branch 'shrdmnf' of https://github.com/nvidia/nemo into shrdmnf
bmwshop Apr 14, 2023
e66b216
explicit use of UTF-8 for manifest i/o
bmwshop Apr 18, 2023
6 changes: 3 additions & 3 deletions nemo/collections/asr/data/audio_to_label.py
@@ -19,7 +19,7 @@
import torch
import webdataset as wd

- from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_audio_filepaths
+ from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths
from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats
from nemo.collections.common.parts.preprocessing import collections
from nemo.core.classes import Dataset, IterableDataset
@@ -560,8 +560,8 @@ def __init__(
for idx in range(len(self.labels[:5])):
logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx]))

- audio_tar_filepaths = expand_audio_filepaths(
- audio_tar_filepaths=audio_tar_filepaths,
+ audio_tar_filepaths = expand_sharded_filepaths(
+ sharded_filepaths=audio_tar_filepaths,
shard_strategy=shard_strategy,
world_size=world_size,
global_rank=global_rank,
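For reference — the call above only changes with the helper's rename. A minimal sketch of the new signature in use (not part of the diff; the shard count and rank values are illustrative):

from nemo.collections.asr.data.audio_to_text import expand_sharded_filepaths

# With 16 shards scattered across 4 workers, rank 0 keeps shards [0, 4).
tar_paths = expand_sharded_filepaths(
    sharded_filepaths="audio_{0..15}.tar",
    shard_strategy="scatter",
    world_size=4,
    global_rank=0,
)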
110 changes: 91 additions & 19 deletions nemo/collections/asr/data/audio_to_text.py
@@ -171,47 +171,48 @@ def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -
return t, tl


- def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size: int, global_rank: int):
+ def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: int, global_rank: int):
valid_shard_strategies = ['scatter', 'replicate']
if shard_strategy not in valid_shard_strategies:
raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}")

- if isinstance(audio_tar_filepaths, str):
+ if isinstance(sharded_filepaths, str):
# Replace '(' and '[' with '{'
brace_keys_open = ['(', '[', '<', '_OP_']
for bkey in brace_keys_open:
- if bkey in audio_tar_filepaths:
- audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "{")
+ if bkey in sharded_filepaths:
+ sharded_filepaths = sharded_filepaths.replace(bkey, "{")

# Replace ')' and ']' with '}'
brace_keys_close = [')', ']', '>', '_CL_']
for bkey in brace_keys_close:
- if bkey in audio_tar_filepaths:
- audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "}")
+ if bkey in sharded_filepaths:
+ sharded_filepaths = sharded_filepaths.replace(bkey, "}")

- if isinstance(audio_tar_filepaths, str):
+ if isinstance(sharded_filepaths, str):
# Brace expand
- audio_tar_filepaths = list(braceexpand.braceexpand(audio_tar_filepaths))
+ sharded_filepaths = list(braceexpand.braceexpand(sharded_filepaths))

# Expand store paths into WebDataset URLs
- audio_tar_filepaths = [
- datastore_path_to_webdataset_url(p) if is_datastore_path(p) else p for p in audio_tar_filepaths
+ sharded_filepaths = [
+ datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p
+ for p in sharded_filepaths
]

# Check for distributed and partition shards accordingly
if world_size > 1:
if shard_strategy == 'scatter':
logging.info("All tarred dataset shards will be scattered evenly across all nodes.")

- if len(audio_tar_filepaths) % world_size != 0:
+ if len(sharded_filepaths) % world_size != 0:
logging.warning(
f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible "
f"Number of shards in tarred dataset ({len(sharded_filepaths)}) is not divisible "
f"by number of distributed workers ({world_size})."
)

- begin_idx = (len(audio_tar_filepaths) // world_size) * global_rank
- end_idx = begin_idx + len(audio_tar_filepaths) // world_size
- audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx]
+ begin_idx = (len(sharded_filepaths) // world_size) * global_rank
+ end_idx = begin_idx + len(sharded_filepaths) // world_size
+ sharded_filepaths = sharded_filepaths[begin_idx:end_idx]
logging.info(
"Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx
)
@@ -221,7 +222,7 @@ def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size:
else:
raise ValueError(f"Invalid shard strategy ! Allowed values are : {valid_shard_strategies}")

- return audio_tar_filepaths
+ return sharded_filepaths
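To make the path normalization above concrete — a minimal sketch of a call and its result (not part of the diff; the expansion assumes standard braceexpand semantics):

# _OP_/_CL_ (and (, [, <) are rewritten to { and } before brace expansion.
paths = expand_sharded_filepaths(
    sharded_filepaths="audio__OP_0..3_CL_.tar",
    shard_strategy="replicate",
    world_size=2,
    global_rank=1,
)
# Every rank gets the full list: ['audio_0.tar', ..., 'audio_3.tar'].
# An 'ais://' entry becomes a WebDataset URL only if it ends in '.tar'
# (the new is_tarred_path guard), so manifest paths pass through untouched.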


def cache_datastore_manifests(
@@ -345,6 +346,47 @@ def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers):
)


"""Optionally expand / shard the list of manifests
This uses the same notation as the sharded audio files

Args:
manifest_filepaths: list of manifest files (the sharded notation)
shard_strategy: scatter or replicate (scatter by default)
shard_manifests: bool, if False, no sharding / manifest filepath expansion will be attempted
global_rank: int, the rank of this worker
world_size: int, total number of workers
"""


def shard_manifests_if_needed(
manifest_filepaths: Union[str, List[str]],
shard_strategy: str,
shard_manifests: bool,
global_rank: int,
world_size: int,
):
if shard_manifests:
if not torch.distributed.is_available():
logging.warning("Not running in torch.distributed mode. Manifest sharding not available")
return manifest_filepaths

if not torch.distributed.is_initialized():
logging.warning(
'Manifest sharding was requested but torch.distributed is not initialized. '
'Did you intend to set the defer_setup flag?'
)
return manifest_filepaths

manifest_filepaths = expand_sharded_filepaths(
sharded_filepaths=manifest_filepaths,
shard_strategy=shard_strategy,
world_size=world_size,
global_rank=global_rank,
)

return manifest_filepaths
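A hedged usage sketch of the helper above, with per-shard manifests named the way the conversion script in this PR writes them; under the scatter strategy each rank keeps only its slice:

# Assumes an initialized torch.distributed job with 4 ranks; otherwise
# the function warns and returns the paths unsharded.
manifests = shard_manifests_if_needed(
    manifest_filepaths="sharded_manifests/manifest_{0..15}.json",
    shard_strategy="scatter",
    shard_manifests=True,
    global_rank=2,
    world_size=4,
)
# 16 manifests over 4 ranks -> rank 2 gets manifest_8.json .. manifest_11.json.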


class _AudioTextDataset(Dataset):
"""
Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds).
@@ -748,6 +790,7 @@ class _TarredAudioToTextDataset(IterableDataset):
occasions (when the number of shards is not divisible with ``world_size``), will not sample
the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
or test datasets.
shard_manifests (bool): Whether or not to shard manifests. Defaults to False.
global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
return_sample_id (bool): whether to return the sample_id as a part of each sample
@@ -769,10 +812,22 @@ def __init__(
eos_id: Optional[int] = None,
pad_id: int = 0,
shard_strategy: str = "scatter",
shard_manifests: bool = False,
global_rank: int = 0,
world_size: int = 0,
return_sample_id: bool = False,
):
self.shard_manifests = shard_manifests

# Shard the manifests if requested and possible, then expand the paths
manifest_filepath = shard_manifests_if_needed(
shard_manifests=shard_manifests,
shard_strategy=shard_strategy,
manifest_filepaths=manifest_filepath,
world_size=world_size,
global_rank=global_rank,
)

# If necessary, cache manifests from object store
cache_datastore_manifests(manifest_filepaths=manifest_filepath)

@@ -788,15 +843,17 @@ def __init__(
index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID
)

self.len = self._compute_len()

self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
self.trim = trim
self.eos_id = eos_id
self.bos_id = bos_id
self.pad_id = pad_id
self.return_sample_id = return_sample_id

- audio_tar_filepaths = expand_audio_filepaths(
- audio_tar_filepaths=audio_tar_filepaths,
+ audio_tar_filepaths = expand_sharded_filepaths(
+ sharded_filepaths=audio_tar_filepaths,
shard_strategy=shard_strategy,
world_size=world_size,
global_rank=global_rank,
@@ -928,8 +985,19 @@ def get_manifest_sample(self, sample_id):
def __iter__(self):
return self._dataset.__iter__()

def _compute_len(self):
if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized():
my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda()
torch.distributed.all_reduce(my_len)
my_len = my_len.int()
logging.info(f'Sharded manifests: Total length: {my_len}')
else:
my_len = len(self.manifest_processor.collection)

return my_len
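Each rank now loads only its manifest shards, so the local collection undercounts the dataset; the all_reduce above (default op: SUM) restores the global count. A sketch of the semantics (not part of the diff; assumes an initialized process group and a CUDA device, since the tensor is moved with .cuda()):

local_len = torch.tensor(2500, dtype=torch.int32).cuda()
torch.distributed.all_reduce(local_len)  # default ReduceOp.SUM
# With world_size == 4 and 2,500 entries per rank, every rank now
# holds 10,000 -- the true global length returned by __len__.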

def __len__(self):
- return len(self.manifest_processor.collection)
+ return self.len


class TarredAudioToCharDataset(_TarredAudioToTextDataset):
@@ -1042,6 +1110,7 @@ def __init__(
parser: Optional[str] = 'en',
pad_id: int = 0,
shard_strategy: str = "scatter",
shard_manifests: bool = False,
global_rank: int = 0,
world_size: int = 0,
return_sample_id: bool = False,
@@ -1067,6 +1136,7 @@ def __init__(
eos_id=eos_id,
pad_id=pad_id,
shard_strategy=shard_strategy,
shard_manifests=shard_manifests,
global_rank=global_rank,
world_size=world_size,
return_sample_id=return_sample_id,
@@ -1167,6 +1237,7 @@ def __init__(
trim: bool = False,
use_start_end_token: bool = True,
shard_strategy: str = "scatter",
shard_manifests: bool = False,
global_rank: int = 0,
world_size: int = 0,
return_sample_id: bool = False,
@@ -1219,6 +1290,7 @@ def __call__(self, *args):
eos_id=eos_id,
pad_id=pad_id,
shard_strategy=shard_strategy,
shard_manifests=shard_manifests,
global_rank=global_rank,
world_size=world_size,
return_sample_id=return_sample_id,
6 changes: 3 additions & 3 deletions nemo/collections/asr/data/audio_to_text_dali.py
@@ -22,7 +22,7 @@
import torch
from omegaconf import DictConfig

- from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_audio_filepaths
+ from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_sharded_filepaths
from nemo.collections.common.parts.preprocessing import parsers
from nemo.utils import logging, model_utils

@@ -345,10 +345,10 @@ def __init__(
self.is_tarred_dataset = False

elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None:
- audio_tar_filepaths = expand_audio_filepaths(
+ audio_tar_filepaths = expand_sharded_filepaths(
audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank
)
- audio_tar_index_filepaths = expand_audio_filepaths(
+ audio_tar_index_filepaths = expand_sharded_filepaths(
audio_tar_index_filepaths,
shard_strategy=shard_strategy,
world_size=world_size,
5 changes: 5 additions & 0 deletions nemo/collections/asr/data/audio_to_text_dataset.py
@@ -346,6 +346,9 @@ def get_tarred_dataset(
):
if len(tarred_audio_filepath) == 1:
tarred_audio_filepath = tarred_audio_filepath[0]
if len(manifest_filepath) == 1:
manifest_filepath = manifest_filepath[0]

if tokenizer is None:
dataset = audio_to_text.TarredAudioToCharDataset(
audio_tar_filepaths=tarred_audio_filepath,
@@ -363,6 +366,7 @@
trim=config.get('trim_silence', False),
parser=config.get('parser', 'en'),
shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
shard_manifests=config.get('shard_manifests', False),
global_rank=global_rank,
world_size=world_size,
return_sample_id=config.get('return_sample_id', False),
@@ -381,6 +385,7 @@
trim=config.get('trim_silence', False),
use_start_end_token=config.get('use_start_end_token', True),
shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
shard_manifests=config.get('shard_manifests', False),
global_rank=global_rank,
world_size=world_size,
return_sample_id=config.get('return_sample_id', False),
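Downstream, the new flag arrives through the dataset config. A hedged sketch of a training-dataset config that enables it (shard_manifests and tarred_shard_strategy are read exactly as in this diff; the remaining fields and values are illustrative):

from omegaconf import OmegaConf

train_ds = OmegaConf.create(
    {
        "manifest_filepath": "tarred/sharded_manifests/manifest__OP_0..511_CL_.json",
        "tarred_audio_filepaths": "tarred/audio__OP_0..511_CL_.tar",
        "is_tarred": True,
        "shard_manifests": True,  # picked up via config.get('shard_manifests', False)
        "tarred_shard_strategy": "scatter",
        "batch_size": 32,
    }
)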
1 change: 1 addition & 0 deletions nemo/collections/asr/models/configs/asr_models_config.py
@@ -38,6 +38,7 @@ class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig):
is_tarred: bool = False
tarred_audio_filepaths: Optional[Any] = None
tarred_shard_strategy: str = "scatter"
shard_manifests: bool = False
shuffle_n: int = 0

# Optional
6 changes: 6 additions & 0 deletions nemo/utils/data_utils.py
@@ -49,6 +49,12 @@ def is_datastore_path(path) -> bool:
return path.startswith('ais://')


def is_tarred_path(path) -> bool:
"""Check if a path is for a tarred file.
"""
return path.endswith('.tar')


def is_datastore_cache_shared() -> bool:
"""Check if store cache is shared.
"""
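The new predicate is deliberately narrow, so datastore manifests are cached locally rather than treated as WebDataset shards. A small sketch of its effect on the expansion step:

# Only '.tar' entries become WebDataset URLs during expansion:
assert is_tarred_path("ais://bucket/audio_0.tar")
assert not is_tarred_path("ais://bucket/manifest_0.json")  # cached from the datastore instead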
24 changes: 24 additions & 0 deletions scripts/speech_recognition/convert_to_tarred_audio_dataset.py
@@ -174,6 +174,11 @@
"and it must be filled out by the user."
),
)
parser.add_argument(
"--no_shard_manifests",
action='store_true',
help="Do not write sharded manifests along with the aggregated manifest.",
)
parser.add_argument('--workers', type=int, default=1, help='Number of worker processes')
args = parser.parse_args()

Expand All @@ -186,6 +191,7 @@ class ASRTarredDatasetConfig:
min_duration: Optional[float] = None
shuffle_seed: Optional[int] = None
sort_in_shards: bool = True
shard_manifests: bool = True
keep_files_together: bool = False


@@ -322,6 +328,19 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/",
for i, (start_idx, end_idx) in enumerate(zip(start_indices, end_indices))
)

if config.shard_manifests:
sharded_manifests_dir = target_dir + '/sharded_manifests'
if not os.path.exists(sharded_manifests_dir):
os.makedirs(sharded_manifests_dir)

for manifest in new_entries_list:
shard_id = manifest[0]['shard_id']
new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json')
with open(new_manifest_shard_path, 'w') as m2:
Collaborator review comment: "Add encoding of utf-8 for any file open (read or write)" — addressed by the final commit e66b216, which is not included in this 29-commit diff.
for entry in manifest:
json.dump(entry, m2)
m2.write('\n')

# Flatten the list of list of entries to a list of entries
new_entries = [sample for manifest in new_entries_list for sample in manifest]
del new_entries_list
@@ -626,6 +645,8 @@ def main():
def create_tar_datasets(min_duration: float, max_duration: float, target_dir: str):
builder = ASRTarredDatasetBuilder()

shard_manifests = False if args.no_shard_manifests else True

if args.write_metadata:
metadata = ASRTarredDatasetMetadata()
dataset_cfg = ASRTarredDatasetConfig(
@@ -635,6 +656,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st
min_duration=min_duration,
shuffle_seed=args.shuffle_seed,
sort_in_shards=args.sort_in_shards,
shard_manifests=shard_manifests,
keep_files_together=args.keep_files_together,
)
metadata.dataset_config = dataset_cfg
@@ -655,6 +677,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st
min_duration=min_duration,
shuffle_seed=args.shuffle_seed,
sort_in_shards=args.sort_in_shards,
shard_manifests=shard_manifests,
keep_files_together=args.keep_files_together,
)
builder.configure(config)
@@ -682,6 +705,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st
metadata.dataset_config.shuffle = args.shuffle
metadata.dataset_config.shuffle_seed = args.shuffle_seed
metadata.dataset_config.sort_in_shards = args.sort_in_shards
metadata.dataset_config.shard_manifests = shard_manifests

builder.configure(metadata.dataset_config)

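With the flag inverted by commit f788c30, sharded manifests are now written by default; a hedged invocation sketch (every flag except --no_shard_manifests predates this PR, and the paths and shard count are illustrative):

# Default run: also writes tarred/sharded_manifests/manifest_<shard_id>.json
python scripts/speech_recognition/convert_to_tarred_audio_dataset.py \
    --manifest_path=train_manifest.json \
    --target_dir=./tarred \
    --num_shards=512 \
    --workers=8

# Opt out of the per-shard manifests:
python scripts/speech_recognition/convert_to_tarred_audio_dataset.py \
    --manifest_path=train_manifest.json \
    --target_dir=./tarred \
    --num_shards=512 \
    --no_shard_manifests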
1 change: 1 addition & 0 deletions tests/collections/asr/test_asr_ctc_encoder_model_bpe.py
@@ -291,6 +291,7 @@ def test_ASRDatasetConfig_for_AudioToBPEDataset(self):
'pin_memory',
'drop_last',
'tarred_shard_strategy',
'shard_manifests',
'shuffle_n',
'parser',
'normalize',