[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
NVIDIA · May 19, 2023 · 28eddfd
1 parent 063c16e
commit 28eddfd
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 23 deletions.
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
@@ -94,10 +94,8 @@ def __init__(
         assert self.truncation_field in ["answer", "context"]
 
         self.indexed_dataset = JSONLMemMapDataset(
-            dataset_paths=[file_path],
-            tokenizer=None,
-            header_lines=0,
-            index_mapping_dir=index_mapping_dir)
+            dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir
+        )
 
         # Will be None after this call if `max_num_samples` is None
         self._build_samples_mapping()

diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py
@@ -19,8 +19,8 @@
 import os
 import pickle
 import time
-from typing import Callable, List, Optional, Type
 from functools import partial
+from typing import Callable, List, Optional, Type
 
 import numpy as np
 import torch
@@ -33,7 +33,7 @@
 __idx_suffix__ = 'idx'  # index file suffix
 
 
-def _create_short_hash(string: str ) -> str:
+def _create_short_hash(string: str) -> str:
     """Utility function to compute hash of a string.
 
     This is used to create unique filenames from a full path.
@@ -162,7 +162,8 @@ def __init__(
                 newline_int,
                 workers=self._worker,
                 build_index_fn=build_index_fn,
-                index_mapping_dir=index_mapping_dir)
+                index_mapping_dir=index_mapping_dir,
+            )
 
         if is_ditributed:
             torch.distributed.barrier()
@@ -423,11 +424,8 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir
 
 
 def build_index_files(
-        dataset_paths,
-        newline_int,
-        workers=None,
-        build_index_fn=_build_index_from_memdata,
-        index_mapping_dir: str = None):
+    dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None
+):
     """Auxiliary method to build multiple index files"""
     if len(dataset_paths) < 1:
         raise ValueError("files_list must contain at leat one file name")
@@ -439,7 +437,10 @@ def build_index_files(
     # load all files into memmap
     start_time = time.time()
     with mp.Pool(workers) as p:
-        build_status = p.map(partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir), dataset_paths)
+        build_status = p.map(
+            partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir),
+            dataset_paths,
+        )
 
     logging.info(
         f'Time building {sum(build_status)} / {len(build_status)} mem-mapped files: {datetime.timedelta(seconds=time.time() - start_time)}'

diff --git a/tests/collections/nlp/test_jsonl_mem_map_dataset.py b/tests/collections/nlp/test_jsonl_mem_map_dataset.py
@@ -12,23 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import os
+
 import pytest
-import json
 
 from nemo.collections.nlp.data.language_modeling import text_memmap_dataset
 
+
 @pytest.fixture
 def jsonl_file(tmp_path):
     # Create a temporary file path
     file_path = tmp_path / "data.jsonl"
 
     # Generate data to write to the JSONL file
-    data = [
-        {"name": "John", "age": 30},
-        {"name": "Jane", "age": 25},
-        {"name": "Bob", "age": 35}
-    ]
+    data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}, {"name": "Bob", "age": 35}]
 
     # Write data to the JSONL file
     with open(file_path, mode="w") as file:
@@ -61,10 +59,8 @@ def test_jsonl_mem_map_dataset_index_mapping_dir(tmp_path, jsonl_file, use_alter
         index_mapping_dir = tmp_path / "subdir"
         index_mapping_dir.mkdir()
         text_memmap_dataset.JSONLMemMapDataset(
-            dataset_paths=[jsonl_file],
-            header_lines=0,
-            index_mapping_dir=str(index_mapping_dir)
-            )
+            dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)
+        )
         # Index files should not be created in default location.
         assert not os.path.isfile(f"{jsonl_file}.idx.npy")
         assert not os.path.isfile(f"{jsonl_file}.idx.info")