Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

memmap worker arg #7062

Merged
merged 9 commits into from
Jul 20, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 0
memmap_workers: null
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -143,6 +144,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: False
num_workers: 0
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -170,6 +172,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: False
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: null
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -109,6 +110,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -137,6 +139,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import numpy as np
import torch

Expand Down Expand Up @@ -40,12 +42,13 @@ def __init__(
label_key: str = "answer",
separate_prompt_and_response_with_newline: bool = False,
answer_only_loss: bool = True,
truncation_field: str = "answer",
truncation_field: str = "context",
pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings.
index_mapping_dir: str = None,
prompt_template: str = None,
virtual_tokens: int = 0,
tokens_to_generate: int = 0,
memmap_workers: Optional[int] = None,
):
"""
file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
Expand Down Expand Up @@ -94,7 +97,11 @@ def __init__(
assert self.truncation_field in ["answer", "context"]

self.indexed_dataset = JSONLMemMapDataset(
dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir
dataset_paths=[file_path],
tokenizer=None,
header_lines=0,
index_mapping_dir=index_mapping_dir,
workers=memmap_workers,
)

# Will be None after this call if `max_num_samples` is None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True):
tokens_to_generate=data_cfg.get(
'tokens_to_generate', 0
), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure.
memmap_workers=data_cfg.get(
'memmap_workers', None
), # used to set num. of workers to create the memmap index files
)
datasets.append(dataset)

Expand Down