Fix custom dataset (modelscope#736)

Jintao-Huang · web-flow · commit 94349d3e1caf · 2024-04-18T17:42:42.000+08:00
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,7 +2,7 @@
 - [ ] Bug Fix
 - [ ] New Feature
 - [ ] Document Updates
-- [ ] More Model or Dataset Support
+- [ ] More Models or Datasets Support
 
 # PR information
 
diff --git a/docs/source/LLM/命令行参数.md b/docs/source/LLM/命令行参数.md
@@ -186,8 +186,8 @@ dpo参数继承了sft参数, 除此之外增加了以下参数:
 - `--max_length`: 默认值为`-1`. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
 - `--truncation_strategy`: 默认是`'delete'`. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
 - `--check_dataset_strategy`: 默认值为`'none'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
-- `--custom_train_dataset_path`: 默认值为`[]`. 具体的含义参考README.md中的`自定义数据集`模块.
-- `--custom_val_dataset_path`: 默认值为`[]`. 具体的含义参考README.md中的`自定义数据集`模块.
+- `--custom_train_dataset_path`: 默认值为`[]`. 具体的含义参考[自定义与拓展](自定义与拓展.md).
+- `--custom_val_dataset_path`: 默认值为`[]`. 具体的含义参考[自定义与拓展](自定义与拓展.md).
 - `--quantization_bit`: 默认值为0. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
 - `--bnb_4bit_comp_dtype`: 默认值为`'AUTO'`.  具体的参数介绍可以在`sft.sh命令行参数`中查看. 若`quantization_bit`设置为0, 则该参数失效.
 - `--bnb_4bit_quant_type`: 默认值为`'nf4'`.  具体的参数介绍可以在`sft.sh命令行参数`中查看. 若`quantization_bit`设置为0, 则该参数失效.
diff --git a/docs/source_en/LLM/Command-line-parameters.md b/docs/source_en/LLM/Command-line-parameters.md
@@ -186,8 +186,8 @@ dpo parameters inherit from sft parameters, with the following added parameters:
 - `--max_length`: Default is `-1`. See `sft.sh command line arguments` for parameter details.
 - `--truncation_strategy`: Default is `'delete'`. See `sft.sh command line arguments` for parameter details.
 - `--check_dataset_strategy`: Default is `'none'`, see `sft.sh command line arguments` for parameter details.
-- `--custom_train_dataset_path`: Default is `[]`. See README.md `Custom Datasets` module for details.
-- `--custom_val_dataset_path`: Default is `[]`. See README.md `Custom Datasets` module for details.
+- `--custom_train_dataset_path`: Default is `[]`. See [Customization](Customization.md) for details.
+- `--custom_val_dataset_path`: Default is `[]`. See [Customization](Customization.md) for details.
 - `--quantization_bit`: Default is 0. See `sft.sh command line arguments` for parameter details.
 - `--bnb_4bit_comp_dtype`: Default is `'AUTO'`.  See `sft.sh command line arguments` for parameter details. If `quantization_bit` is set to 0, this parameter has no effect.
 - `--bnb_4bit_quant_type`: Default is `'nf4'`.  See `sft.sh command line arguments` for parameter details. If `quantization_bit` is set to 0, this parameter has no effect.
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
@@ -1445,7 +1445,6 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
     tags=['chat', 'medical', '🔥'],
     hf_dataset_id='Flmc/DISC-Med-SFT')
 
-# hf_dataset_id='ShengbinYue/DISC-Law-SFT'
 register_dataset(
     DatasetName.disc_law_sft_zh,
     'AI-ModelScope/DISC-Law-SFT', ['train'],
@@ -1455,7 +1454,8 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
         'output': 'response'
     }),
     get_dataset_from_repo,
-    tags=['chat', 'law', '🔥'])
+    tags=['chat', 'law', '🔥'],
+    hf_dataset_id='ShengbinYue/DISC-Law-SFT')
 
 register_dataset(
     DatasetName.pileval,
@@ -1666,12 +1666,12 @@ def load_dataset_from_local(
     return concatenate_datasets(dataset_list)
 
 
-def get_custom_dataset(_: str, train_dataset_path_list: Union[str, List[str]],
-                       val_dataset_path_list: Optional[Union[str, List[str]]],
+def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]],
+                       val_subset_split_list: Optional[Union[str, List[str]]],
                        preprocess_func: PreprocessFunc,
                        **kwargs) -> Tuple[HfDataset, Optional[HfDataset]]:
-    train_dataset = load_dataset_from_local(train_dataset_path_list,
+    train_dataset = load_dataset_from_local(train_subset_split_list,
                                             preprocess_func)
-    val_dataset = load_dataset_from_local(val_dataset_path_list,
+    val_dataset = load_dataset_from_local(val_subset_split_list,
                                           preprocess_func)
     return train_dataset, val_dataset