From 44b0c15c510242d7f412d71438cd076430ed9f08 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Thu, 23 Feb 2023 09:45:52 -0800 Subject: [PATCH 1/4] 1. Improved testing of text memmap generated index files, and improved error message when files are missing. Signed-off-by: Micha Livne --- .../data/language_modeling/text_memmap_dataset.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index f6931d9c17a9..2bb96125cfdd 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -195,7 +195,7 @@ def load_file(self, fn): # create data map mdata = np.memmap(fn, dtype=np.uint8, mode='r') - if os.path.exists(idx_fn + ".npy"): + if _index_file_exists(idx_fn): # load index file into memory map midx = np.load(idx_fn + ".npy", allow_pickle=True, mmap_mode='r') # test for header @@ -219,7 +219,7 @@ def load_file(self, fn): f"Version mismatch: Please delete existing '.{__idx_suffix__}' files. Expected version = {__idx_version__}, but file version = {idx_version}. File path = {idx_fn}" ) else: - raise ValueError(f'Memory Map for {fn} is not found') + raise ValueError(f'Memory Map for {fn} is not found, missing one or more of files: {idx_fn}.{{.npy,.info}}') return (mdata, midx) @@ -280,13 +280,19 @@ def _build_data_from_text(self, text): """Return a dictionary of data based on a single JSON line.""" return json.loads(text) +def _index_file_exists(idx_fn): + """Helper function to test if index file exists""" + if os.path.exists(idx_fn + ".npy") and os.path.exists(idx_fn + ".info"): + return True + else: + return False def _build_memmap_index_files(newline_int, build_index_fn, fn): """Helper function to build an index file""" idx_fn = f"{fn}.{__idx_suffix__}" # create data map - if os.path.exists(idx_fn + ".npy"): + if _index_file_exists(idx_fn): return False else: logging.info(f"Building indexing for fn = {fn}") From e9f7c933c52bb7d1357eb18d08a74ef85e6180dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Feb 2023 17:48:57 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/text_memmap_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index 2bb96125cfdd..f24faf0b26de 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -219,7 +219,9 @@ def load_file(self, fn): f"Version mismatch: Please delete existing '.{__idx_suffix__}' files. Expected version = {__idx_version__}, but file version = {idx_version}. File path = {idx_fn}" ) else: - raise ValueError(f'Memory Map for {fn} is not found, missing one or more of files: {idx_fn}.{{.npy,.info}}') + raise ValueError( + f'Memory Map for {fn} is not found, missing one or more of files: {idx_fn}.{{.npy,.info}}' + ) return (mdata, midx) @@ -280,6 +282,7 @@ def _build_data_from_text(self, text): """Return a dictionary of data based on a single JSON line.""" return json.loads(text) + def _index_file_exists(idx_fn): """Helper function to test if index file exists""" if os.path.exists(idx_fn + ".npy") and os.path.exists(idx_fn + ".info"): @@ -287,6 +290,7 @@ def _index_file_exists(idx_fn): else: return False + def _build_memmap_index_files(newline_int, build_index_fn, fn): """Helper function to build an index file""" idx_fn = f"{fn}.{__idx_suffix__}" From 908c7b210df64c3dc8f95d3cadf62dde6edf34e3 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Mon, 6 Mar 2023 14:08:09 -0800 Subject: [PATCH 3/4] 1. Improved error messages to help debugging failure cases of text memmap. Signed-off-by: Micha Livne --- .../data/language_modeling/text_memmap_dataset.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index f24faf0b26de..bbd604ad5b61 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -157,9 +157,20 @@ def __getitem__(self, idx): # fetch sample from memmap - sample = self._fetch_sample_from_memmap(mdata, i, j) + try: + sample = self._fetch_sample_from_memmap(mdata, i, j) + except Exception as e: + logging.error(f"Error while fetching sample from memmap: {e}") + logging.error(f"file_id: {file_id}, file_idx: {file_idx}, i: {i}, j: {j}") + raise e + # parse raw text (e.g., tokenize) - data = self._build_data_from_text(sample) + try: + data = self._build_data_from_text(sample) + except Exception as e: + logging.error(f"Error while building data from text, possible issue with sample expected format (see offending sample below): {e}") + logging.error(f"sample: {sample}, file_id: {file_id}, file_idx: {file_idx}, i: {i}, j: {j}") + raise e return data From a771f1e8bc642c4f40620ace098daa2dc095f307 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Mar 2023 22:09:20 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/data/language_modeling/text_memmap_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py index bbd604ad5b61..b26f213282bb 100644 --- a/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py @@ -168,7 +168,9 @@ def __getitem__(self, idx): try: data = self._build_data_from_text(sample) except Exception as e: - logging.error(f"Error while building data from text, possible issue with sample expected format (see offending sample below): {e}") + logging.error( + f"Error while building data from text, possible issue with sample expected format (see offending sample below): {e}" + ) logging.error(f"sample: {sample}, file_id: {file_id}, file_idx: {file_idx}, i: {i}, j: {j}") raise e