diff --git a/.gitignore b/.gitignore index 0e75bdf69..faf4e2aff 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ build/ develop-eggs/ dist/ downloads/ +applications/DeepSpeed-Chat/data eggs/ .eggs/ lib/ diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md index c44ed45e0..d2152958d 100644 --- a/applications/DeepSpeed-Chat/README.md +++ b/applications/DeepSpeed-Chat/README.md @@ -240,9 +240,9 @@ If you have downloaded huggingface datasets manually, you can add your local pat One thing to note that some datasets may only have one response instead of two responses. For those datasets, you can only use them in step 1. And in such case, you should add the dataset_name as part of the "--sft_only_data_path" arg instead of the "--data_path" arg. One thing to note is that: If you plan to only do step 1 SFT, adding more single-response datasets is definitely beneficial. However, if you do plan to do steps 2 and 3, then adding too many single-response datasets during SFT could backfire: these data could be different from the data used for steps 2/3, generating different distributions which could cause training instability/worse model quality during step 2/3. That is part of the reason why we focused on trying the datasets with two responses and the preference, and always split a dataset into all 3 steps. If you have your own dataset in local files, you can also use it by following these rules: -* Use "local/jsonfile" as the dataset name. +* Pass "local/jsonfile" as the dataset name to the "--data_path" argument. * Put your train data and evaluation data in applications/DeepSpeed-Chat/data/ with name train.json and eval.json. -* The json data in file should be a single list with each item like ***{"prompt":"\n\nHuman:aaa.\n\nAssistant:\n\n","chosen":"bbb","rejected":"ccc"}***. 
+* The JSON data in each file should be a single list, with each item like ***{"prompt":"Human:I have a question.Assistant:","chosen":"Good answer.","rejected":"Bad answer."}***. What is more, when you use your own dataset files and modified some data in them, pay attention to the parameter "reload" of ***create_prompt_dataset*** function. You should pass a True value to it or the cache files will not refresh. diff --git a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py index 15c283898..10740ccad 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/training/utils/data/data_utils.py @@ -66,9 +66,9 @@ def get_raw_dataset(dataset_name, output_path, seed, local_rank): dataset_name) elif "local/jsonfile" in dataset_name: chat_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)) - if not os.path.isfile(chat_path + '/data/train.json'): + if not (os.path.isfile(chat_path + '/data/train.json') or os.path.isfile(chat_path + '/data/eval.json')): raise RuntimeError( - f"Please check both the train.json and eval.json files in the applications/DeepSpeed-Chat/data directory" + f"Please check both the train.json and eval.json files in your applications/DeepSpeed-Chat/data directory" ) return raw_datasets.LocalJsonFileDataset(output_path, seed, local_rank, dataset_name, chat_path) @@ -92,7 +92,7 @@ def get_raw_dataset_split_index(local_rank, output_path, dataset_name, seed, split_name, data_split, split_index, data_size): index_file_name = f"{output_path}/{dataset_name}_seed{seed}_{split_name}_{data_split}_{split_index}.npy" - # 
reindex each time since json file is more likely get modified + # reindex each time when using local jsonfile since it's more likely to get modified if (not os.path.isfile(index_file_name)) or (dataset_name == 'jsonfile'): splits = [float(s) for s in data_split.split(',')] splits_sum = sum(splits) diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py index 0f953df53..435c77b76 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py +++ b/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py @@ -341,7 +341,6 @@ def get_prompt_and_rejected(self, sample): ) return None -# Chinese dataset class LocalJsonFileDataset(PromptRawDataset): def __init__(self, output_path, seed, local_rank, dataset_name, chat_path): super().__init__(output_path, seed, local_rank, dataset_name)