Include the scripts for preprocessing OAST and unit tests for chat sft datasets (#7112)

* scripts for sft

Signed-off-by: Yi Dong <[email protected]>

* fix style

Signed-off-by: Yi Dong <[email protected]>

* added special token only for huggingface model

Signed-off-by: Yi Dong <[email protected]>

* change default name

Signed-off-by: Yi Dong <[email protected]>

* print out error datapoint content

Signed-off-by: Yi Dong <[email protected]>

* show error id

Signed-off-by: Yi Dong <[email protected]>

* annotation script working

Signed-off-by: Yi Dong <[email protected]>

* try to be compatible with huggingface tokenizer

Signed-off-by: Yi Dong <[email protected]>

* added examples

Signed-off-by: Yi Dong <[email protected]>

* added lang

Signed-off-by: Yi Dong <[email protected]>

* added lang

Signed-off-by: Yi Dong <[email protected]>

* text to value special case

Signed-off-by: Yi Dong <[email protected]>

* configure the slider

Signed-off-by: Yi Dong <[email protected]>

* annotation handles lang

Signed-off-by: Yi Dong <[email protected]>

* added the unit test for chat sft dataset

Signed-off-by: Yi Dong <[email protected]>

* used the file in the test dir

Signed-off-by: Yi Dong <[email protected]>

* fix json error

Signed-off-by: Yi Dong <[email protected]>

* load local tokenizer

Signed-off-by: Yi Dong <[email protected]>

* remove mask count check

Signed-off-by: Yi Dong <[email protected]>

* added HF dataset backend

Signed-off-by: Yi Dong <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Yi Dong <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: jubick1337 <[email protected]>
2 people authored and jubick1337 committed Aug 8, 2023
1 parent aa852c3 commit 13c4542
Showing 13 changed files with 1,064 additions and 69 deletions.
49 changes: 49 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml
@@ -38,6 +38,55 @@ web_port: 9889 # the port number of the web server
chat: False # use the chat interface
chatbot_config:
  value: False # whether to inject the value attributes
  attributes:
    - name: Quality
      min: 0
      max: 4
      key: quality
      type: int
      default: 4
    - name: Toxicity
      min: 0
      max: 4
      key: toxicity
      type: int
      default: 0
    - name: Humor
      min: 0
      max: 4
      key: humor
      type: int
      default: 0
    - name: Creativity
      min: 0
      max: 4
      key: creativity
      type: int
      default: 0
    - name: Violence
      min: 0
      max: 4
      key: violence
      type: int
      default: 0
    - name: Helpfulness
      min: 0
      max: 4
      key: helpfulness
      type: int
      default: 4
    - name: Not_Appropriate
      min: 0
      max: 4
      key: not_appropriate
      type: int
      default: 0
    - name: Language
      choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh']
      key: lang
      type: list
      default: en

  user: User
  assistant: Assistant
  system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
7 changes: 6 additions & 1 deletion examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -314,7 +314,12 @@ def main(cfg) -> None:
                'assistant': cfg.chatbot_config.assistant,
                'system': cfg.chatbot_config.system,
            }
            web_ui = partial(
                get_chatbot_demo,
                defaults=defaults,
                value=cfg.chatbot_config.value,
                attributes=cfg.chatbot_config.attributes,
            )
        else:
            web_ui = get_demo
        loop = asyncio.new_event_loop()
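For readers unfamiliar with the pattern: functools.partial pre-binds the config-derived keyword arguments so the web-server launcher can later call the factory with only its own arguments. A minimal sketch with hypothetical stand-in names (not the real signature):

from functools import partial

# Stand-in for get_chatbot_demo; the real signature appears in the web-server diff below.
def make_demo(share, username, password, *, defaults=None, value=False, attributes=None):
    return {"value": value, "n_attributes": len(attributes or [])}

# Pre-bind the config values, as megatron_gpt_eval.py does above.
web_ui = partial(make_demo, defaults={"user": "User"}, value=True, attributes=[])
print(web_ui(True, "me", "secret"))  # {'value': True, 'n_attributes': 0}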
@@ -102,6 +102,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
@@ -126,6 +127,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
@@ -155,6 +157,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
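A minimal sketch of the new toggle (hypothetical config fragment, using OmegaConf as NeMo does): setting hf_dataset: True makes the SFT dataloader read the file with HuggingFace datasets instead of JSONLMemMapDataset, as wired up later in this diff.

from omegaconf import OmegaConf

# Hypothetical data-config fragment; only hf_dataset is new in this commit.
data_cfg = OmegaConf.create(
    """
    train_ds:
      file_names: [/path/to/train.jsonl]
      hf_dataset: True
    """
)
print(data_cfg.train_ds.hf_dataset)  # True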
@@ -90,12 +90,18 @@ def _mask_targets(
        # target[cur_idx + 1:cur_idx + tokenized_len] skip the turn token
        if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[1:]):
            logging.warning("a sentence mismatches the corresponding piece " "in the conversation")
        if i == 0:
        if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None):
            # mask the first turn completely to provide at least one turn as context
            target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role:
        elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE':
            # leave the first human tag unmasked
            target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role and (i > 1):
            # leave the speaker tag unmasked and mask the rest of the turn
            target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role and (i <= 1):
            # mask out everything in the second turn
            target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
        else:
            # mask up to the name end, need to remove one as skip name has an extra artifact empty token
            target[cur_idx : cur_idx + skip_name_len] = IGNORE_INDEX
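In words: for VALUE_TO_TEXT (or unspecified) generation the first turn is masked entirely as context; for TEXT_TO_VALUE the early turns are masked so only the value string contributes to the loss; later turns spoken by mask_role are masked except their speaker tag. A condensed restatement — an illustrative sketch only, with IGNORE_INDEX assumed to be the usual -100 loss-mask sentinel:

import torch

IGNORE_INDEX = -100  # assumption: standard loss-masking sentinel

def mask_turn(target, cur_idx, tokenized_len, i, speaker, mask_role, gtype):
    # Condensed restatement of the branch logic above (illustrative only).
    if i == 0 and gtype in ('VALUE_TO_TEXT', None):
        # first turn is pure context: mask it completely
        target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE':
        # keep the turn token visible, mask the rest
        target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role and i > 1:
        # later masked-role turns: keep the speaker tag, mask the rest
        target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role:
        # masked-role turn within the first two: mask everything
        target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX

t = torch.zeros(10, dtype=torch.long)
mask_turn(t, 0, 5, 0, 'User', 'User', 'VALUE_TO_TEXT')
print(t)  # first five positions set to -100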
@@ -109,6 +115,8 @@ def cannonical_form_formater(cannoical_form):
def response_value_formater(label):
    if isinstance(label, str):
        return '<extra_id_2>' + label + '\n'
    elif label is None:
        return ''
    else:
        raise ValueError(f'Unknown label type {type(label)}, only str type is supported')

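Illustrative behavior of the formatter above (the <extra_id_2> sentinel comes from the diff; the label string is a made-up example):

# Assumes response_value_formater from the diff above is in scope.
print(response_value_formater("quality:4,lang:en"))  # -> '<extra_id_2>quality:4,lang:en\n'
print(response_value_formater(None))                 # -> ''  (labels are now optional)
# any other type still raises ValueError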
@@ -16,11 +16,13 @@

import numpy as np
import torch
from datasets import load_dataset

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping
from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset
from nemo.core.classes import Dataset
from nemo.utils import logging

__all__ = ['GPTSFTDataset']

@@ -49,6 +51,7 @@ def __init__(
        virtual_tokens: int = 0,
        tokens_to_generate: int = 0,
        memmap_workers: Optional[int] = None,
        hf_dataset: bool = False,
    ):
        """
        file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
@@ -70,6 +73,7 @@ def __init__(
        pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch.
        index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset.
        prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output}
        hf_dataset: Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.
        """
        self.tokenizer = tokenizer
        self.file_path = file_path
@@ -96,13 +100,18 @@ def __init__(
            self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape')
        assert self.truncation_field in ["answer", "context"]

        self.indexed_dataset = JSONLMemMapDataset(
            dataset_paths=[file_path],
            tokenizer=None,
            header_lines=0,
            index_mapping_dir=index_mapping_dir,
            workers=memmap_workers,
        )
        if hf_dataset:
            self.indexed_dataset = load_dataset(
                'json', data_files=file_path, cache_dir=index_mapping_dir, num_proc=memmap_workers, split='train'
            )
        else:
            self.indexed_dataset = JSONLMemMapDataset(
                dataset_paths=[file_path],
                tokenizer=None,
                header_lines=0,
                index_mapping_dir=index_mapping_dir,
                workers=memmap_workers,
            )

        # Will be None after this call if `max_num_samples` is None
        self._build_samples_mapping()
@@ -141,7 +150,11 @@ def __getitem__(self, idx):
            idx = idx.item()

        assert idx < len(self.indexed_dataset)
        example = self.indexed_dataset[idx]
        try:
            example = self.indexed_dataset[idx]
        except Exception as e:
            logging.error(f"Error while loading example {idx} from dataset {self.file_path}")
            raise e
        return self._process_example(example)

    def _process_example(self, example):
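When hf_dataset=True, the indexed dataset is simply the output of datasets.load_dataset as shown above. A standalone sketch of that call (the file path is a placeholder):

from datasets import load_dataset

# Same call the hf_dataset branch makes; '/path/to/train.jsonl' is hypothetical.
ds = load_dataset("json", data_files="/path/to/train.jsonl", split="train")
print(len(ds))  # number of examples
print(ds[0])    # a dict per line, e.g. {'input': ..., 'output': ...}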
@@ -371,7 +371,13 @@ def __init__(

    def _build_data_from_text(self, text):
        """Return a dictionary of data based on a single JSON line."""
        return json.loads(text)
        try:
            record = json.loads(text)
        except Exception as e:
            logging.error(f"Exception: {e}")
            logging.error(f"datapoint: {text}")
            raise e
        return record


def _index_file_exists(idx_fn):
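The change surfaces the offending line before re-raising, which makes malformed JSONL records easy to locate. The same pattern as a standalone sketch:

import json
import logging

def build_data_from_text(text):
    # Mirror of the pattern above: log the bad datapoint, then re-raise.
    try:
        return json.loads(text)
    except Exception as e:
        logging.error(f"Exception: {e}")
        logging.error(f"datapoint: {text}")
        raise e

build_data_from_text('{"input": "hi", "output": "hello"}')  # parses fine
# build_data_from_text('{not json')  # would log the line content, then raise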
@@ -277,6 +277,9 @@ def _build_dataset(self, data_cfg, is_train=True):
                memmap_workers=data_cfg.get(
                    'memmap_workers', None
                ),  # used to set num. of workers to create the memmap index files
                hf_dataset=data_cfg.get(
                    'hf_dataset', False
                ),  # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.
            )
            datasets.append(dataset)

76 changes: 19 additions & 57 deletions nemo/collections/nlp/modules/common/megatron_web_server.py
@@ -190,7 +190,7 @@ def clear_fun():


def get_chatbot_demo(
    share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None
    share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None, attributes=None,
):
    check_gradio_import()
    from nemo.collections.nlp.modules.common.chatbot_component import Chatbot
@@ -222,28 +222,20 @@ def get_chatbot_demo(
        )

        with gr.Accordion("Value Parameters", open=True, visible=value):
            keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
            quality_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=9, label='Quality', interactive=True, visible=True
            )
            toxicity_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Toxicity', interactive=True, visible=True
            )
            humor_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Humor', interactive=True, visible=True
            )
            creativity_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Creativity', interactive=True, visible=True
            )
            violence_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Violence', interactive=True, visible=True
            )
            helpfulness_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=9, label='Helpfulness', interactive=True, visible=True
            )
            not_appropriate_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Not Appropriate', interactive=True, visible=True
            )
            keys = [k.key for k in attributes]
            # keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
            widgets = []
            for item in attributes:
                if item.type == 'int':
                    slider = gr.Slider(
                        minimum=item.min, maximum=item.max, step=1, value=item.default, label=item.name
                    )
                    widgets.append(slider)
                elif item.type == 'list':
                    dropdown = gr.Dropdown(
                        item.choices, label=item.name, default=item.default, value=item.default
                    )
                    widgets.append(dropdown)
            used_value = gr.CheckboxGroup(keys, value=keys)

            def change_visibility(x):
@@ -256,17 +248,7 @@ def change_visibility(x):
                return values

            used_value.change(
                change_visibility,
                inputs=[used_value],
                outputs=[
                    quality_value,
                    toxicity_value,
                    humor_value,
                    creativity_value,
                    violence_value,
                    helpfulness_value,
                    not_appropriate_value,
                ],
                change_visibility, inputs=[used_value], outputs=widgets,
            )

            def set_sampling(x):
@@ -328,25 +310,11 @@ def bot(
            assistant_name,
            session_state,
            prompts_presets,
            quality_value,
            toxicity_value,
            humor_value,
            creativity_value,
            violence_value,
            helpfulness_value,
            not_appropriate_value,
            used_value,
            *values,
        ):

            values_array = [
                quality_value,
                toxicity_value,
                humor_value,
                creativity_value,
                violence_value,
                helpfulness_value,
                not_appropriate_value,
            ]
            values_array = values
            if value:
                value_str = get_value_str(values_array, used_value)
            else:
@@ -400,14 +368,8 @@ def bot(
                assistant_name,
                session_state,
                prompt_presets,
                quality_value,
                toxicity_value,
                humor_value,
                creativity_value,
                violence_value,
                helpfulness_value,
                not_appropriate_value,
                used_value,
                *widgets,
            ],
            [chatbot],
        )
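The refactor replaces seven hard-coded sliders with widgets generated from chatbot_config.attributes, so the UI follows the YAML. A minimal standalone sketch of the same pattern — assuming Gradio 3.x and plain dicts in place of the OmegaConf attribute nodes:

import gradio as gr

# Hypothetical attribute list mirroring the YAML earlier in this commit.
attributes = [
    {"name": "Quality", "type": "int", "min": 0, "max": 4, "default": 4},
    {"name": "Language", "type": "list", "choices": ["en", "de", "fr"], "default": "en"},
]

with gr.Blocks() as demo:
    widgets = []
    for item in attributes:
        if item["type"] == "int":
            # integer attributes render as sliders
            widgets.append(
                gr.Slider(minimum=item["min"], maximum=item["max"], step=1,
                          value=item["default"], label=item["name"])
            )
        elif item["type"] == "list":
            # list attributes render as dropdowns
            widgets.append(gr.Dropdown(item["choices"], value=item["default"], label=item["name"]))

demo.launch()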
1 change: 1 addition & 0 deletions requirements/requirements_nlp.txt
@@ -1,4 +1,5 @@
boto3
datasets
einops
faiss-cpu
fasttext