Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependencies #8156

Merged
merged 3 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/nlp/nemo_megatron/retro/retro_model.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Following is the retro memory map index data format:
- chunk id address in byte (int64 array)
-

:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float, 7: np.double, 8: np.uint16
:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float64, 7: np.double, 8: np.uint16

:sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer.
The number of tokens for each sentence includes the padded token ids. For retrieval data, there is an extra ``chunk_size`` padding at
Expand Down
2 changes: 1 addition & 1 deletion nemo/collections/asr/parts/utils/numba_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def phase_vocoder(D: np.ndarray, rate: float, phi_advance: np.ndarray, scale_buf
Returns:
Complex64 ndarray of shape [d, t / rate, complex=2]
"""
time_steps = np.arange(0, D.shape[1], rate, dtype=np.float)
time_steps = np.arange(0, D.shape[1], rate, dtype=np.float64)

# Create an empty output array
d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_loss_mask[idx]),
np.array(self.all_subtokens_mask[idx]),
self.all_intents[idx],
Expand Down Expand Up @@ -326,7 +326,7 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_loss_mask[idx]),
np.array(self.all_subtokens_mask[idx]),
)
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def __getitem__(self, idx: int):
np.array(ex.example_id_num[-1]), # service_id
np.array(ex.utterance_ids),
np.array(ex.utterance_segment),
np.array(ex.utterance_mask, dtype=np.long),
np.array(ex.utterance_mask, dtype=np.longlong),
np.array(ex.intent_status, dtype=np.float32),
np.array(ex.requested_slot_status, dtype=np.float32),
np.array(ex.categorical_slot_status),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def __getitem__(self, idx):
return (
np.array(feature.input_ids),
np.array(feature.segment_ids),
np.array(feature.input_mask, dtype=np.long),
np.array(feature.input_mask, dtype=np.longlong),
np.array(feature.label_id),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def construct_input(self, token_ids1, max_seq_length, token_ids2=None):
num_nonpad_tokens = len(bert_input)

input_ids[:num_nonpad_tokens] = bert_input
input_ids = np.array(input_ids, dtype=np.long)
input_ids = np.array(input_ids, dtype=np.longlong)
input_mask = input_ids != self.tokenizer.pad_id
input_type_ids = np.ones_like(input_ids)
input_type_ids[:sentence1_length] = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_loss_mask[idx]),
np.array(self.all_subtokens_mask[idx]),
self.all_intents[idx],
Expand Down Expand Up @@ -291,7 +291,7 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_loss_mask[idx]),
np.array(self.all_subtokens_mask[idx]),
)
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,10 @@ def truncate_seq_pair(a, b, max_num_tokens):

input_ids, output_mask = self.mask_ids(output_ids)

input_mask = np.zeros(self.max_seq_length, dtype=np.long)
input_mask = np.zeros(self.max_seq_length, dtype=np.longlong)
input_mask[: len(input_ids)] = 1

input_type_ids = np.zeros(self.max_seq_length, dtype=np.int)
input_type_ids = np.zeros(self.max_seq_length, dtype=np.int64)
input_type_ids[len(a_document) + 2 : len(output_ids) + 1] = 1

padding_length = max(0, self.max_seq_length - len(input_ids))
Expand All @@ -257,7 +257,7 @@ def truncate_seq_pair(a, b, max_num_tokens):
return (
np.array(input_ids),
input_type_ids,
np.array(input_mask, dtype=np.long),
np.array(input_mask, dtype=np.longlong),
np.array(output_ids),
np.array(output_mask, dtype=np.float32),
is_next,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def pad_batches(self, ids):

batches = []
for batch_elem_len, batch_sent_ids in zip(self.batch_elem_lengths, self.batch_sent_ids):
batch = self.tokenizer.pad_id * np.ones((len(batch_sent_ids), batch_elem_len), dtype=np.int)
batch = self.tokenizer.pad_id * np.ones((len(batch_sent_ids), batch_elem_len), dtype=np.int64)
for i, sentence_idx in enumerate(batch_sent_ids):
batch[i][: len(ids[sentence_idx])] = ids[sentence_idx]
batches.append(batch)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def pad_batches(self, src_ids, tgt_ids, batch_indices):
for batch_idx, b in enumerate(batch_indices):
src_len = max([len(src_ids[i]) for i in b])
tgt_len = max([len(tgt_ids[i]) for i in b])
src_ids_ = self.src_pad_id * np.ones((len(b), src_len), dtype=np.int)
tgt_ids_ = self.tgt_pad_id * np.ones((len(b), tgt_len), dtype=np.int)
src_ids_ = self.src_pad_id * np.ones((len(b), src_len), dtype=np.int64)
tgt_ids_ = self.tgt_pad_id * np.ones((len(b), tgt_len), dtype=np.int64)
for i, sentence_idx in enumerate(b):
src_ids_[i][: len(src_ids[sentence_idx])] = src_ids[sentence_idx]
tgt_ids_[i][: len(tgt_ids[sentence_idx])] = tgt_ids[sentence_idx]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1647,7 +1647,7 @@ def _form_batches(
- ``'input_mask'``: a boolean numpy array;
- ``'loss_mask'``: a boolean numpy array.
If ``waveforms`` is not ``None``, then a batch also contains items
- ``features``: a ``np.float`` numpy array.
- ``features``: a ``np.float64`` numpy array.
- ``features_length`` a ``np.int32`` numpy array.
If ``audio_filepaths`` is not ``None``, then a batch also contains items
- ``audio_filepaths`` a list of strings.
Expand Down Expand Up @@ -1677,7 +1677,7 @@ def _form_batches(
"capit_labels": item[3].astype(np.int64),
}
if self.use_audio and self.preload_audios:
batch['features'] = item[4].astype(np.float)
batch['features'] = item[4].astype(np.float64)
batch['features_length'] = item[5]
elif self.use_audio and not self.preload_audios:
batch['audio_filepaths'] = item[6]
Expand Down Expand Up @@ -1730,7 +1730,7 @@ def _pack_into_batches(
- ``'input_mask'``: a boolean numpy array;
- ``'loss_mask'``: a boolean numpy array.
If ``waveforms`` is not ``None``, then a batch also contains items
- ``features``: a ``np.float`` numpy array.
- ``features``: a ``np.float64`` numpy array.
- ``features_length`` a ``np.int32`` numpy array.
If ``audio_filepaths`` is not ``None``, then a batch also contains items
- ``audio_filepaths`` a list of strings.
Expand Down Expand Up @@ -1785,7 +1785,7 @@ def _pack_into_batches(
if self.use_audio and self.preload_audios:
batch['features'] = pad(
waveforms[start : start + size], max(audio_lengths[start : start + size]), 0.0
).astype(np.float)
).astype(np.float64)
batch['features_length'] = audio_lengths[start : start + size]
elif self.use_audio and not self.preload_audios:
batch['audio_filepaths'] = audio_filepaths[start : start + size]
Expand Down Expand Up @@ -1993,8 +1993,8 @@ def __getitem__(self, idx: int) -> Dict[str, np.ndarray]:
computed for corresponding token. See more in description of constructor parameters
``ignore_start_end``, ``ignore_extra_tokens`` (if ``self.add_masks_and_segment_ids_to_batch`` is
``False``, then these items are missing).
- ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty.
- ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.long` array of number of samples per audio.
- ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float64` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty.
- ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.longlong` array of number of samples per audio.
- ``'audio_filepaths'`` (:obj:`List`) :obj:`str` contains paths of audio files if ``self.preload_audio`` set to ``False``
"""
return self.batches[idx]
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,6 @@ def __getitem__(
self.all_query_ids[idx],
self.all_is_first[idx],
self.all_is_last[idx],
np.array(self.all_audio_queries[idx], dtype=np.float),
np.array(self.all_audio_queries[idx], dtype=np.float64),
self.all_audio_lengths[idx],
)
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_subtokens_mask[idx]),
np.array(self.all_loss_mask[idx]),
np.array(self.all_labels[idx]),
Expand Down Expand Up @@ -348,6 +348,6 @@ def __getitem__(self, idx):
return (
np.array(self.all_input_ids[idx]),
np.array(self.all_segment_ids[idx]),
np.array(self.all_input_mask[idx], dtype=np.long),
np.array(self.all_input_mask[idx], dtype=np.longlong),
np.array(self.all_subtokens_mask[idx]),
)
2 changes: 1 addition & 1 deletion nemo/collections/vision/data/megatron/autoaugment.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def __init__(
"translateY": np.linspace(0, 150 / 331, num_levels),
"rotate": np.linspace(0, 30, num_levels),
"color": np.linspace(0.0, 0.9, num_levels),
"posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(np.int),
"posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(np.int64),
"solarize": np.linspace(256, 0, num_levels), # range [0, 256]
"contrast": np.linspace(0.0, 0.9, num_levels),
"sharpness": np.linspace(0.0, 0.9, num_levels),
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
huggingface_hub
numba
numpy>=1.22,<1.24
numpy>=1.22
onnx>=1.7.0
python-dateutil
ruamel.yaml
Expand Down
1 change: 0 additions & 1 deletion requirements/requirements_common.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
datasets
inflect
pandas
pydantic<2 # remove after inflect supports Pydantic 2.0+
sacremoses>=0.0.43
sentencepiece<1.0.0
youtokentome>=1.0.5
Loading