From c40c80a6f516637d53900a4d4fe647a6d78aacd9 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 11:38:36 -0700 Subject: [PATCH 1/6] memmap worker arg Signed-off-by: arendu --- .../tuning/conf/megatron_gpt_peft_tuning_config.yaml | 3 +++ .../nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml | 3 +++ .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 3 ++- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml index 799d105aae7c..4c54cc90ca10 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -116,6 +116,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 0 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -143,6 +144,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 0 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -170,6 +172,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index 0e3f0d712dd6..ffaeb29e9911 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -82,6 +82,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -109,6 +110,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -137,6 +139,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 756494f2f315..b2fd02238fb8 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -46,6 +46,7 @@ def __init__( prompt_template: str = None, virtual_tokens: int = 0, tokens_to_generate: int = 0, + memmap_workers: Optional[int] = None, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -94,7 +95,7 @@ def __init__( assert self.truncation_field in ["answer", "context"] self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir + dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir, workers=memmap_workers ) # Will be None after this call if `max_num_samples` is None diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index c390a8c440bf..a5bc8004229e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -266,6 +266,7 @@ def _build_dataset(self, data_cfg, is_train=True): tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. + memmap_workers=data_cfg.get('memmap_workers', None) # used to set num. of workers to create the memmap index files ) datasets.append(dataset) From c9d3d7fb99578631c35f830bde34df96d599bba0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:40:13 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 6 +++++- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index b2fd02238fb8..08055cccf878 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -95,7 +95,11 @@ def __init__( assert self.truncation_field in ["answer", "context"] self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir, workers=memmap_workers + dataset_paths=[file_path], + tokenizer=None, + header_lines=0, + index_mapping_dir=index_mapping_dir, + workers=memmap_workers, ) # Will be None after this call if `max_num_samples` is None diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index a5bc8004229e..95108e90f087 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -266,7 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True): tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. - memmap_workers=data_cfg.get('memmap_workers', None) # used to set num. of workers to create the memmap index files + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. 
of workers to create the memmap index files ) datasets.append(dataset) From 4d5158b3a5fcded61892e3b27d48db94d722ca42 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 16:26:21 -0700 Subject: [PATCH 3/6] update Signed-off-by: arendu --- .../nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index ffaeb29e9911..f15138c99264 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -110,7 +110,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -139,7 +139,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index b2fd02238fb8..835dc702057a 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -14,6 +14,7 @@ import numpy as np import torch +from typing import Optional from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping From 72d0bb6104faee3d71c59ef7e69ac30685623d70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 23:27:34 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 33f3c73ef06a..e9c7be8ce65c 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + import numpy as np import torch -from typing import Optional from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping From d73374e9b12168cc0828a244d8c9a882b8fda80a Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 16:27:58 -0700 Subject: [PATCH 5/6] update Signed-off-by: arendu --- .../tuning/conf/megatron_gpt_peft_tuning_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml index 4c54cc90ca10..d26dd2922088 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -144,7 +144,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 0 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -172,7 +172,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 From 3d625bba708104640412f33f2ee2974a1c1d251d Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 19 Jul 2023 10:04:06 -0700 Subject: [PATCH 6/6] update Signed-off-by: arendu --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index e9c7be8ce65c..da3d03199c2e 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -42,7 +42,7 @@ def __init__( label_key: str = "answer", separate_prompt_and_response_with_newline: bool = False, answer_only_loss: bool = True, - truncation_field: str = "answer", + truncation_field: str = "context", pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. index_mapping_dir: str = None, prompt_template: str = None,
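
Taken together, the series exposes one new knob, memmap_workers, on every data split and threads it through GPTSFTDataset into JSONLMemMapDataset(workers=...). Below is a minimal sketch of how the option might be set in the tuning YAML once these patches are applied. It is an assumption-laden illustration, not part of the series: the value 2 is purely illustrative (the shipped default stays null, which lets JSONLMemMapDataset fall back to its internal default), the validation_ds/test_ds split names are assumed from the usual layout of these configs, and the interpolation lines only restate what patches 3 and 5 add for the validation and test splits.

    model:
      data:
        train_ds:
          memmap_workers: 2   # illustrative value: processes used to build the JSONL memmap index files
        validation_ds:
          memmap_workers: ${model.data.train_ds.memmap_workers}
        test_ds:
          memmap_workers: ${model.data.train_ds.memmap_workers}

Because the validation and test entries interpolate the training value, setting memmap_workers once on train_ds controls index building for all three datasets; leaving it at null keeps the previous behaviour.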