From c40c80a6f516637d53900a4d4fe647a6d78aacd9 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 11:38:36 -0700 Subject: [PATCH 1/6] memmap worker arg Signed-off-by: arendu --- .../tuning/conf/megatron_gpt_peft_tuning_config.yaml | 3 +++ .../nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml | 3 +++ .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 3 ++- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml index 799d105aae7c..4c54cc90ca10 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -116,6 +116,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 0 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -143,6 +144,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 0 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -170,6 +172,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index 0e3f0d712dd6..ffaeb29e9911 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -82,6 +82,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -109,6 +110,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -137,6 +139,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 + memmap_workers: null pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 756494f2f315..b2fd02238fb8 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -46,6 +46,7 @@ def __init__( prompt_template: str = None, virtual_tokens: int = 0, tokens_to_generate: int = 0, + memmap_workers: Optional[int] = None, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -94,7 +95,7 @@ def __init__( assert self.truncation_field in ["answer", "context"] self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir + dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir, workers=memmap_workers ) # Will be None after this call if `max_num_samples` is None diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index c390a8c440bf..a5bc8004229e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -266,6 +266,7 @@ def _build_dataset(self, data_cfg, is_train=True): tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. + memmap_workers=data_cfg.get('memmap_workers', None) # used to set num. of workers to create the memmap index files ) datasets.append(dataset) From c9d3d7fb99578631c35f830bde34df96d599bba0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:40:13 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 6 +++++- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index b2fd02238fb8..08055cccf878 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -95,7 +95,11 @@ def __init__( assert self.truncation_field in ["answer", "context"] self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir, workers=memmap_workers + dataset_paths=[file_path], + tokenizer=None, + header_lines=0, + index_mapping_dir=index_mapping_dir, + workers=memmap_workers, ) # Will be None after this call if `max_num_samples` is None diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index a5bc8004229e..95108e90f087 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -266,7 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True): tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. - memmap_workers=data_cfg.get('memmap_workers', None) # used to set num. of workers to create the memmap index files + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. 
of workers to create the memmap index files ) datasets.append(dataset) From 4d5158b3a5fcded61892e3b27d48db94d722ca42 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 16:26:21 -0700 Subject: [PATCH 3/6] update Signed-off-by: arendu --- .../nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml index ffaeb29e9911..f15138c99264 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml @@ -110,7 +110,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -139,7 +139,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: True num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index b2fd02238fb8..835dc702057a 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -14,6 +14,7 @@ import numpy as np import torch +from typing import Optional from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping From 72d0bb6104faee3d71c59ef7e69ac30685623d70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 23:27:34 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 33f3c73ef06a..e9c7be8ce65c 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + import numpy as np import torch -from typing import Optional from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping From d73374e9b12168cc0828a244d8c9a882b8fda80a Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 18 Jul 2023 16:27:58 -0700 Subject: [PATCH 5/6] update Signed-off-by: arendu --- .../tuning/conf/megatron_gpt_peft_tuning_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml index 4c54cc90ca10..d26dd2922088 100755 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -144,7 +144,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 0 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -172,7 +172,7 @@ model: micro_batch_size: ${model.micro_batch_size} shuffle: False num_workers: 4 - memmap_workers: null + memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True max_seq_length: 2048 min_seq_length: 1 From 3d625bba708104640412f33f2ee2974a1c1d251d Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 19 Jul 2023 10:04:06 -0700 Subject: [PATCH 6/6] update Signed-off-by: arendu --- .../nlp/data/language_modeling/megatron/gpt_sft_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index e9c7be8ce65c..da3d03199c2e 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -42,7 +42,7 @@ def __init__( label_key: str = "answer", separate_prompt_and_response_with_newline: bool = False, answer_only_loss: bool = True, - truncation_field: str = "answer", + truncation_field: str = "context", pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. index_mapping_dir: str = None, prompt_template: str = None,
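
Taken together, the series exposes one new knob, memmap_workers, on every data split and threads it through GPTSFTDataset into JSONLMemMapDataset(workers=...). Below is a minimal sketch of how the option might be set in the tuning YAML once these patches are applied. It is an assumption-laden illustration, not part of the series: the value 2 is purely illustrative (the shipped default stays null, which lets JSONLMemMapDataset fall back to its internal default), the validation_ds/test_ds split names are assumed from the usual layout of these configs, and the interpolation lines only restate what patches 3 and 5 add for the validation and test splits.

    model:
      data:
        train_ds:
          memmap_workers: 2   # illustrative value: processes used to build the JSONL memmap index files
        validation_ds:
          memmap_workers: ${model.data.train_ds.memmap_workers}
        test_ds:
          memmap_workers: ${model.data.train_ds.memmap_workers}

Because the validation and test entries interpolate the training value, setting memmap_workers once on train_ds controls index building for all three datasets; leaving it at null keeps the previous behaviour.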