Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed May 19, 2023
1 parent 063c16e commit 28eddfd
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,8 @@ def __init__(
assert self.truncation_field in ["answer", "context"]

self.indexed_dataset = JSONLMemMapDataset(
dataset_paths=[file_path],
tokenizer=None,
header_lines=0,
index_mapping_dir=index_mapping_dir)
dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir
)

# Will be None after this call if `max_num_samples` is None
self._build_samples_mapping()
Expand Down
19 changes: 10 additions & 9 deletions nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import os
import pickle
import time
from typing import Callable, List, Optional, Type
from functools import partial
from typing import Callable, List, Optional, Type

import numpy as np
import torch
Expand All @@ -33,7 +33,7 @@
__idx_suffix__ = 'idx' # index file suffix


def _create_short_hash(string: str ) -> str:
def _create_short_hash(string: str) -> str:
"""Utility function to compute hash of a string.
This is used to create unique filenames from a full path.
Expand Down Expand Up @@ -162,7 +162,8 @@ def __init__(
newline_int,
workers=self._worker,
build_index_fn=build_index_fn,
index_mapping_dir=index_mapping_dir)
index_mapping_dir=index_mapping_dir,
)

if is_ditributed:
torch.distributed.barrier()
Expand Down Expand Up @@ -423,11 +424,8 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir


def build_index_files(
dataset_paths,
newline_int,
workers=None,
build_index_fn=_build_index_from_memdata,
index_mapping_dir: str = None):
dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None
):
"""Auxiliary method to build multiple index files"""
if len(dataset_paths) < 1:
raise ValueError("files_list must contain at leat one file name")
Expand All @@ -439,7 +437,10 @@ def build_index_files(
# load all files into memmap
start_time = time.time()
with mp.Pool(workers) as p:
build_status = p.map(partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir), dataset_paths)
build_status = p.map(
partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir),
dataset_paths,
)

logging.info(
f'Time building {sum(build_status)} / {len(build_status)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}'
Expand Down
16 changes: 6 additions & 10 deletions tests/collections/nlp/test_jsonl_mem_map_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os

import pytest
import json

from nemo.collections.nlp.data.language_modeling import text_memmap_dataset


@pytest.fixture
def jsonl_file(tmp_path):
# Create a temporary file path
file_path = tmp_path / "data.jsonl"

# Generate data to write to the JSONL file
data = [
{"name": "John", "age": 30},
{"name": "Jane", "age": 25},
{"name": "Bob", "age": 35}
]
data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}, {"name": "Bob", "age": 35}]

# Write data to the JSONL file
with open(file_path, mode="w") as file:
Expand Down Expand Up @@ -61,10 +59,8 @@ def test_jsonl_mem_map_dataset_index_mapping_dir(tmp_path, jsonl_file, use_alter
index_mapping_dir = tmp_path / "subdir"
index_mapping_dir.mkdir()
text_memmap_dataset.JSONLMemMapDataset(
dataset_paths=[jsonl_file],
header_lines=0,
index_mapping_dir=str(index_mapping_dir)
)
dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)
)
# Index files should not be created in default location.
assert not os.path.isfile(f"{jsonl_file}.idx.npy")
assert not os.path.isfile(f"{jsonl_file}.idx.info")
Expand Down

0 comments on commit 28eddfd

Please sign in to comment.