Include the scripts for preprocessing OAST and unit tests for chat sft datasets (#7112)

* scripts for sft

Signed-off-by: Yi Dong <[email protected]>

* fix style

Signed-off-by: Yi Dong <[email protected]>

* added special token only for huggingface model

Signed-off-by: Yi Dong <[email protected]>

* change default name

Signed-off-by: Yi Dong <[email protected]>

* print out error datapoint content

Signed-off-by: Yi Dong <[email protected]>

* show error id

Signed-off-by: Yi Dong <[email protected]>

* annotation script working

Signed-off-by: Yi Dong <[email protected]>

* try to be compatible with huggingface tokenizer

Signed-off-by: Yi Dong <[email protected]>

* added examples

Signed-off-by: Yi Dong <[email protected]>

* added lang

Signed-off-by: Yi Dong <[email protected]>

* added lang

Signed-off-by: Yi Dong <[email protected]>

* text to value special case

Signed-off-by: Yi Dong <[email protected]>

* configure the slider

Signed-off-by: Yi Dong <[email protected]>

* annotation handles lang

Signed-off-by: Yi Dong <[email protected]>

* added the unit test for chat sft dataset

Signed-off-by: Yi Dong <[email protected]>

* used the file in the test dir

Signed-off-by: Yi Dong <[email protected]>

* fix json error

Signed-off-by: Yi Dong <[email protected]>

* load local tokenizer

Signed-off-by: Yi Dong <[email protected]>

* remove mask count check

Signed-off-by: Yi Dong <[email protected]>

* added HF dataset backend

Signed-off-by: Yi Dong <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Yi Dong <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: jubick1337 <[email protected]>
2 people authored and jubick1337 committed Aug 8, 2023
1 parent aa852c3 commit 13c4542
Showing 13 changed files with 1,064 additions and 69 deletions.
49 changes: 49 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml
@@ -38,6 +38,55 @@ web_port: 9889 # the port number of the web server
chat: False # use the chat interface
chatbot_config:
  value: False # whether to inject the value attributes
  attributes:
    - name: Quality
      min: 0
      max: 4
      key: quality
      type: int
      default: 4
    - name: Toxicity
      min: 0
      max: 4
      key: toxicity
      type: int
      default: 0
    - name: Humor
      min: 0
      max: 4
      key: humor
      type: int
      default: 0
    - name: Creativity
      min: 0
      max: 4
      key: creativity
      type: int
      default: 0
    - name: Violence
      min: 0
      max: 4
      key: violence
      type: int
      default: 0
    - name: Helpfulness
      min: 0
      max: 4
      key: helpfulness
      type: int
      default: 4
    - name: Not_Appropriate
      min: 0
      max: 4
      key: not_appropriate
      type: int
      default: 0
    - name: Language
      choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh']
      key: lang
      type: list
      default: en

  user: User
  assistant: Assistant
  system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
7 changes: 6 additions & 1 deletion examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -314,7 +314,12 @@ def main(cfg) -> None:
                'assistant': cfg.chatbot_config.assistant,
                'system': cfg.chatbot_config.system,
            }
            web_ui = partial(
                get_chatbot_demo,
                defaults=defaults,
                value=cfg.chatbot_config.value,
                attributes=cfg.chatbot_config.attributes,
            )
        else:
            web_ui = get_demo
        loop = asyncio.new_event_loop()
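For readers unfamiliar with the pattern: functools.partial pre-binds the config-derived keyword arguments so the web-server launcher can later call the factory with only its own arguments. A minimal sketch with hypothetical stand-in names (not the real signature):

from functools import partial

# Stand-in for get_chatbot_demo; the real signature appears in the web-server diff below.
def make_demo(share, username, password, *, defaults=None, value=False, attributes=None):
    return {"value": value, "n_attributes": len(attributes or [])}

# Pre-bind the config values, as megatron_gpt_eval.py does above.
web_ui = partial(make_demo, defaults={"user": "User"}, value=True, attributes=[])
print(web_ui(True, "me", "secret"))  # {'value': True, 'n_attributes': 0}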
@@ -102,6 +102,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: null # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
@@ -126,6 +127,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
@@ -155,6 +157,7 @@ model:
      truncation_field: "context" # Options: ['context', 'answer']
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.

      metric:
        name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
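A minimal sketch of the new toggle (hypothetical config fragment, using OmegaConf as NeMo does): setting hf_dataset: True makes the SFT dataloader read the file with HuggingFace datasets instead of JSONLMemMapDataset, as wired up later in this diff.

from omegaconf import OmegaConf

# Hypothetical data-config fragment; only hf_dataset is new in this commit.
data_cfg = OmegaConf.create(
    """
    train_ds:
      file_names: [/path/to/train.jsonl]
      hf_dataset: True
    """
)
print(data_cfg.train_ds.hf_dataset)  # True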
@@ -90,12 +90,18 @@ def _mask_targets(
        # target[cur_idx + 1:cur_idx + tokenized_len] skip the turn token
        if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[1:]):
            logging.warning("a sentence mismatches the corresponding piece " "in the conversation")
        if i == 0:
        if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None):
            # mask the first turn completely to provide at least one turn as context
            target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role:
        elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE':
            # leave the first human tag unmasked
            target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role and (i > 1):
            # leave the speaker tag unmasked and mask the rest of the turn
            target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
        elif speaker == mask_role and (i <= 1):
            # mask out everything in the second turn
            target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
        else:
            # mask up to the name end, need to remove one as skip name has an extra artifact empty token
            target[cur_idx : cur_idx + skip_name_len] = IGNORE_INDEX
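In words: for VALUE_TO_TEXT (or unspecified) generation the first turn is masked entirely as context; for TEXT_TO_VALUE the early turns are masked so only the value string contributes to the loss; later turns spoken by mask_role are masked except their speaker tag. A condensed restatement — an illustrative sketch only, with IGNORE_INDEX assumed to be the usual -100 loss-mask sentinel:

import torch

IGNORE_INDEX = -100  # assumption: standard loss-masking sentinel

def mask_turn(target, cur_idx, tokenized_len, i, speaker, mask_role, gtype):
    # Condensed restatement of the branch logic above (illustrative only).
    if i == 0 and gtype in ('VALUE_TO_TEXT', None):
        # first turn is pure context: mask it completely
        target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE':
        # keep the turn token visible, mask the rest
        target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role and i > 1:
        # later masked-role turns: keep the speaker tag, mask the rest
        target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
    elif speaker == mask_role:
        # masked-role turn within the first two: mask everything
        target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX

t = torch.zeros(10, dtype=torch.long)
mask_turn(t, 0, 5, 0, 'User', 'User', 'VALUE_TO_TEXT')
print(t)  # first five positions set to -100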
@@ -109,6 +115,8 @@ def cannonical_form_formater(cannoical_form):
def response_value_formater(label):
    if isinstance(label, str):
        return '<extra_id_2>' + label + '\n'
    elif label is None:
        return ''
    else:
        raise ValueError(f'Unknown label type {type(label)}, only str type is supported')

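Illustrative behavior of the formatter above (the <extra_id_2> sentinel comes from the diff; the label string is a made-up example):

# Assumes response_value_formater from the diff above is in scope.
print(response_value_formater("quality:4,lang:en"))  # -> '<extra_id_2>quality:4,lang:en\n'
print(response_value_formater(None))                 # -> ''  (labels are now optional)
# any other type still raises ValueError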
@@ -16,11 +16,13 @@

import numpy as np
import torch
from datasets import load_dataset

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping
from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset
from nemo.core.classes import Dataset
from nemo.utils import logging

__all__ = ['GPTSFTDataset']

@@ -49,6 +51,7 @@ def __init__(
        virtual_tokens: int = 0,
        tokens_to_generate: int = 0,
        memmap_workers: Optional[int] = None,
        hf_dataset: bool = False,
    ):
        """
        file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
@@ -70,6 +73,7 @@ def __init__(
        pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch.
        index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset.
        prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output}
        hf_dataset: Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.
        """
        self.tokenizer = tokenizer
        self.file_path = file_path
@@ -96,13 +100,18 @@ def __init__(
            self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape')
        assert self.truncation_field in ["answer", "context"]

        self.indexed_dataset = JSONLMemMapDataset(
            dataset_paths=[file_path],
            tokenizer=None,
            header_lines=0,
            index_mapping_dir=index_mapping_dir,
            workers=memmap_workers,
        )
        if hf_dataset:
            self.indexed_dataset = load_dataset(
                'json', data_files=file_path, cache_dir=index_mapping_dir, num_proc=memmap_workers, split='train'
            )
        else:
            self.indexed_dataset = JSONLMemMapDataset(
                dataset_paths=[file_path],
                tokenizer=None,
                header_lines=0,
                index_mapping_dir=index_mapping_dir,
                workers=memmap_workers,
            )

        # Will be None after this call if `max_num_samples` is None
        self._build_samples_mapping()
@@ -141,7 +150,11 @@ def __getitem__(self, idx):
            idx = idx.item()

        assert idx < len(self.indexed_dataset)
        example = self.indexed_dataset[idx]
        try:
            example = self.indexed_dataset[idx]
        except Exception as e:
            logging.error(f"Error while loading example {idx} from dataset {self.file_path}")
            raise e
        return self._process_example(example)

    def _process_example(self, example):
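When hf_dataset=True, the indexed dataset is simply the output of datasets.load_dataset as shown above. A standalone sketch of that call (the file path is a placeholder):

from datasets import load_dataset

# Same call the hf_dataset branch makes; '/path/to/train.jsonl' is hypothetical.
ds = load_dataset("json", data_files="/path/to/train.jsonl", split="train")
print(len(ds))  # number of examples
print(ds[0])    # a dict per line, e.g. {'input': ..., 'output': ...}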
@@ -371,7 +371,13 @@ def __init__(

    def _build_data_from_text(self, text):
        """Return a dictionary of data based on a single JSON line."""
        return json.loads(text)
        try:
            record = json.loads(text)
        except Exception as e:
            logging.error(f"Exception: {e}")
            logging.error(f"datapoint: {text}")
            raise e
        return record


def _index_file_exists(idx_fn):
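The change surfaces the offending line before re-raising, which makes malformed JSONL records easy to locate. The same pattern as a standalone sketch:

import json
import logging

def build_data_from_text(text):
    # Mirror of the pattern above: log the bad datapoint, then re-raise.
    try:
        return json.loads(text)
    except Exception as e:
        logging.error(f"Exception: {e}")
        logging.error(f"datapoint: {text}")
        raise e

build_data_from_text('{"input": "hi", "output": "hello"}')  # parses fine
# build_data_from_text('{not json')  # would log the line content, then raise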
@@ -277,6 +277,9 @@ def _build_dataset(self, data_cfg, is_train=True):
                memmap_workers=data_cfg.get(
                    'memmap_workers', None
                ),  # used to set num. of workers to create the memmap index files
                hf_dataset=data_cfg.get(
                    'hf_dataset', False
                ),  # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with the JSONLMemMapDataset.
            )
            datasets.append(dataset)

76 changes: 19 additions & 57 deletions nemo/collections/nlp/modules/common/megatron_web_server.py
@@ -190,7 +190,7 @@ def clear_fun():


def get_chatbot_demo(
    share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None
    share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None, attributes=None,
):
    check_gradio_import()
    from nemo.collections.nlp.modules.common.chatbot_component import Chatbot
@@ -222,28 +222,20 @@ def get_chatbot_demo(
        )

        with gr.Accordion("Value Parameters", open=True, visible=value):
            keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
            quality_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=9, label='Quality', interactive=True, visible=True
            )
            toxicity_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Toxicity', interactive=True, visible=True
            )
            humor_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Humor', interactive=True, visible=True
            )
            creativity_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Creativity', interactive=True, visible=True
            )
            violence_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Violence', interactive=True, visible=True
            )
            helpfulness_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=9, label='Helpfulness', interactive=True, visible=True
            )
            not_appropriate_value = gr.Slider(
                minimum=0, maximum=9, step=1, value=0, label='Not Appropriate', interactive=True, visible=True
            )
            keys = [k.key for k in attributes]
            # keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
            widgets = []
            for item in attributes:
                if item.type == 'int':
                    slider = gr.Slider(
                        minimum=item.min, maximum=item.max, step=1, value=item.default, label=item.name
                    )
                    widgets.append(slider)
                elif item.type == 'list':
                    dropdown = gr.Dropdown(
                        item.choices, label=item.name, default=item.default, value=item.default
                    )
                    widgets.append(dropdown)
            used_value = gr.CheckboxGroup(keys, value=keys)

            def change_visibility(x):
@@ -256,17 +248,7 @@ def change_visibility(x):
                return values

            used_value.change(
                change_visibility,
                inputs=[used_value],
                outputs=[
                    quality_value,
                    toxicity_value,
                    humor_value,
                    creativity_value,
                    violence_value,
                    helpfulness_value,
                    not_appropriate_value,
                ],
                change_visibility, inputs=[used_value], outputs=widgets,
            )

            def set_sampling(x):
@@ -328,25 +310,11 @@ def bot(
            assistant_name,
            session_state,
            prompts_presets,
            quality_value,
            toxicity_value,
            humor_value,
            creativity_value,
            violence_value,
            helpfulness_value,
            not_appropriate_value,
            used_value,
            *values,
        ):

            values_array = [
                quality_value,
                toxicity_value,
                humor_value,
                creativity_value,
                violence_value,
                helpfulness_value,
                not_appropriate_value,
            ]
            values_array = values
            if value:
                value_str = get_value_str(values_array, used_value)
            else:
@@ -400,14 +368,8 @@ def bot(
                assistant_name,
                session_state,
                prompt_presets,
                quality_value,
                toxicity_value,
                humor_value,
                creativity_value,
                violence_value,
                helpfulness_value,
                not_appropriate_value,
                used_value,
                *widgets,
            ],
            [chatbot],
        )
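The refactor replaces seven hard-coded sliders with widgets generated from chatbot_config.attributes, so the UI follows the YAML. A minimal standalone sketch of the same pattern — assuming Gradio 3.x and plain dicts in place of the OmegaConf attribute nodes:

import gradio as gr

# Hypothetical attribute list mirroring the YAML earlier in this commit.
attributes = [
    {"name": "Quality", "type": "int", "min": 0, "max": 4, "default": 4},
    {"name": "Language", "type": "list", "choices": ["en", "de", "fr"], "default": "en"},
]

with gr.Blocks() as demo:
    widgets = []
    for item in attributes:
        if item["type"] == "int":
            # integer attributes render as sliders
            widgets.append(
                gr.Slider(minimum=item["min"], maximum=item["max"], step=1,
                          value=item["default"], label=item["name"])
            )
        elif item["type"] == "list":
            # list attributes render as dropdowns
            widgets.append(gr.Dropdown(item["choices"], value=item["default"], label=item["name"]))

demo.launch()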
1 change: 1 addition & 0 deletions requirements/requirements_nlp.txt
@@ -1,4 +1,5 @@
boto3
datasets
einops
faiss-cpu
fasttext