NVIDIA · arendu · Jul 20, 2023 · Jul 18, 2023 · Jul 18, 2023 · Jul 18, 2023
diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml
@@ -116,6 +116,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: True
       num_workers: 0
+      memmap_workers: null
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1
@@ -143,6 +144,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: False
       num_workers: 0
+      memmap_workers: ${model.data.train_ds.memmap_workers}
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1
@@ -170,6 +172,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: False
       num_workers: 4
+      memmap_workers: ${model.data.train_ds.memmap_workers}
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1

diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml
@@ -82,6 +82,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: True
       num_workers: 4
+      memmap_workers: null
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1
@@ -109,6 +110,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: True
       num_workers: 4
+      memmap_workers: ${model.data.train_ds.memmap_workers}
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1
@@ -137,6 +139,7 @@ model:
       micro_batch_size: ${model.micro_batch_size}
       shuffle: True
       num_workers: 4
+      memmap_workers: ${model.data.train_ds.memmap_workers}
       pin_memory: True
       max_seq_length: 2048
       min_seq_length: 1

diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 import numpy as np
 import torch
 
@@ -40,12 +42,13 @@ def __init__(
         label_key: str = "answer",
         separate_prompt_and_response_with_newline: bool = False,
         answer_only_loss: bool = True,
-        truncation_field: str = "answer",
+        truncation_field: str = "context",
         pad_to_max_length: bool = False,  # (@adithyare) allows for much faster training especially in PEFT settings.
         index_mapping_dir: str = None,
         prompt_template: str = None,
         virtual_tokens: int = 0,
         tokens_to_generate: int = 0,
+        memmap_workers: Optional[int] = None,
     ):
         """
         file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
@@ -94,7 +97,11 @@ def __init__(
         assert self.truncation_field in ["answer", "context"]
 
         self.indexed_dataset = JSONLMemMapDataset(
-            dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir
+            dataset_paths=[file_path],
+            tokenizer=None,
+            header_lines=0,
+            index_mapping_dir=index_mapping_dir,
+            workers=memmap_workers,
         )
 
         # Will be None after this call if `max_num_samples` is None

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py
@@ -266,6 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True):
                 tokens_to_generate=data_cfg.get(
                     'tokens_to_generate', 0
                 ),  # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure.
+                memmap_workers=data_cfg.get(
+                    'memmap_workers', None
+                ),  # number of workers used to create the memmap index files
             )
             datasets.append(dataset)