From 973a7ec4bbd29dc2bb02dce2a75aed7eed0770ae Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar
Date: Fri, 12 Jan 2024 13:40:16 -0800
Subject: [PATCH] Update dependencies (#8156)

* Update dependencies

Signed-off-by: smajumdar

* Update numpy.long to numpy.longlong (np.long removed in NumPy 1.24)

Signed-off-by: smajumdar

* Update all deprecated numpy types

Signed-off-by: smajumdar

---------

Signed-off-by: smajumdar
Signed-off-by: Pablo Garay
---
 docs/source/nlp/nemo_megatron/retro/retro_model.rst   |  2 +-
 nemo/collections/asr/parts/utils/numba_utils.py       |  2 +-
 .../data/dialogue/dataset/dialogue_bert_dataset.py    |  4 ++--
 .../dialogue/dataset/dialogue_sgd_bert_dataset.py     |  2 +-
 .../data/glue_benchmark/glue_benchmark_dataset.py     |  2 +-
 .../information_retrieval_dataset.py                  |  2 +-
 .../intent_slot_classification_dataset.py             |  4 ++--
 .../nlp/data/language_modeling/lm_bert_dataset.py     |  6 +++---
 .../nlp/data/language_modeling/sentence_dataset.py    |  2 +-
 .../machine_translation_dataset.py                    |  4 ++--
 .../punctuation_capitalization_dataset.py             | 12 ++++++------
 .../punctuation_capitalization_infer_dataset.py       |  2 +-
 .../token_classification_dataset.py                   |  4 ++--
 nemo/collections/vision/data/megatron/autoaugment.py  |  2 +-
 requirements/requirements.txt                         |  2 +-
 requirements/requirements_common.txt                  |  1 -
 16 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/docs/source/nlp/nemo_megatron/retro/retro_model.rst b/docs/source/nlp/nemo_megatron/retro/retro_model.rst
index ceff1baf857f..e653e8200c6a 100644
--- a/docs/source/nlp/nemo_megatron/retro/retro_model.rst
+++ b/docs/source/nlp/nemo_megatron/retro/retro_model.rst
@@ -88,7 +88,7 @@ Following is the retro memory map index data format:
    - chunk id address in byte (int64 array)
    -
 
-:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float, 7: np.double, 8: np.uint16
+:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float64, 7: np.double, 8: np.uint16
 
 :sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer. The number of tokens for each sentence includes the padded token ids.
 For retrieval data, there is an extra ``chunk_size`` padding at
diff --git a/nemo/collections/asr/parts/utils/numba_utils.py b/nemo/collections/asr/parts/utils/numba_utils.py
index 8a5048aaf748..867ecf59521b 100644
--- a/nemo/collections/asr/parts/utils/numba_utils.py
+++ b/nemo/collections/asr/parts/utils/numba_utils.py
@@ -28,7 +28,7 @@ def phase_vocoder(D: np.ndarray, rate: float, phi_advance: np.ndarray, scale_buf
     Returns:
         Complex64 ndarray of shape [d, t / rate, complex=2]
     """
-    time_steps = np.arange(0, D.shape[1], rate, dtype=np.float)
+    time_steps = np.arange(0, D.shape[1], rate, dtype=np.float64)
 
     # Create an empty output array
     d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F')
diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py
index edab77a056e1..0931fe383f94 100644
--- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py
+++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py
@@ -111,7 +111,7 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_loss_mask[idx]),
             np.array(self.all_subtokens_mask[idx]),
             self.all_intents[idx],
@@ -326,7 +326,7 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_loss_mask[idx]),
             np.array(self.all_subtokens_mask[idx]),
         )
diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py
index 364998e8e58a..fcab5e91329f 100644
--- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py
+++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py
@@ -155,7 +155,7 @@ def __getitem__(self, idx: int):
             np.array(ex.example_id_num[-1]),  # service_id
             np.array(ex.utterance_ids),
             np.array(ex.utterance_segment),
-            np.array(ex.utterance_mask, dtype=np.long),
+            np.array(ex.utterance_mask, dtype=np.longlong),
             np.array(ex.intent_status, dtype=np.float32),
             np.array(ex.requested_slot_status, dtype=np.float32),
             np.array(ex.categorical_slot_status),
diff --git a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py b/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py
index 2a14aa5afc58..ef7845895a72 100644
--- a/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py
+++ b/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py
@@ -183,7 +183,7 @@ def __getitem__(self, idx):
         return (
             np.array(feature.input_ids),
             np.array(feature.segment_ids),
-            np.array(feature.input_mask, dtype=np.long),
+            np.array(feature.input_mask, dtype=np.longlong),
             np.array(feature.label_id),
         )
 
diff --git a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py b/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py
index 61e0a3cffc1f..349f9e43ef97 100644
--- a/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py
+++ b/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py
@@ -131,7 +131,7 @@ def construct_input(self, token_ids1, max_seq_length, token_ids2=None):
 
         num_nonpad_tokens = len(bert_input)
         input_ids[:num_nonpad_tokens] = bert_input
-        input_ids = np.array(input_ids, dtype=np.long)
+        input_ids = np.array(input_ids, dtype=np.longlong)
         input_mask = input_ids != self.tokenizer.pad_id
         input_type_ids = np.ones_like(input_ids)
         input_type_ids[:sentence1_length] = 0
diff --git a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
index cf0081f7bd83..a73341aa719d 100644
--- a/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
+++ b/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py
@@ -234,7 +234,7 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_loss_mask[idx]),
             np.array(self.all_subtokens_mask[idx]),
             self.all_intents[idx],
@@ -291,7 +291,7 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_loss_mask[idx]),
             np.array(self.all_subtokens_mask[idx]),
         )
diff --git a/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py b/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py
index 4196f9736da1..b02d25016b68 100644
--- a/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py
@@ -241,10 +241,10 @@ def truncate_seq_pair(a, b, max_num_tokens):
 
         input_ids, output_mask = self.mask_ids(output_ids)
 
-        input_mask = np.zeros(self.max_seq_length, dtype=np.long)
+        input_mask = np.zeros(self.max_seq_length, dtype=np.longlong)
         input_mask[: len(input_ids)] = 1
 
-        input_type_ids = np.zeros(self.max_seq_length, dtype=np.int)
+        input_type_ids = np.zeros(self.max_seq_length, dtype=np.int64)
         input_type_ids[len(a_document) + 2 : len(output_ids) + 1] = 1
 
         padding_length = max(0, self.max_seq_length - len(input_ids))
@@ -257,7 +257,7 @@ def truncate_seq_pair(a, b, max_num_tokens):
         return (
             np.array(input_ids),
             input_type_ids,
-            np.array(input_mask, dtype=np.long),
+            np.array(input_mask, dtype=np.longlong),
             np.array(output_ids),
             np.array(output_mask, dtype=np.float32),
             is_next,
diff --git a/nemo/collections/nlp/data/language_modeling/sentence_dataset.py b/nemo/collections/nlp/data/language_modeling/sentence_dataset.py
index 26127bc3aa36..c843d2e0f938 100644
--- a/nemo/collections/nlp/data/language_modeling/sentence_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/sentence_dataset.py
@@ -68,7 +68,7 @@ def pad_batches(self, ids):
 
         batches = []
         for batch_elem_len, batch_sent_ids in zip(self.batch_elem_lengths, self.batch_sent_ids):
-            batch = self.tokenizer.pad_id * np.ones((len(batch_sent_ids), batch_elem_len), dtype=np.int)
+            batch = self.tokenizer.pad_id * np.ones((len(batch_sent_ids), batch_elem_len), dtype=np.int64)
             for i, sentence_idx in enumerate(batch_sent_ids):
                 batch[i][: len(ids[sentence_idx])] = ids[sentence_idx]
             batches.append(batch)
diff --git a/nemo/collections/nlp/data/machine_translation/machine_translation_dataset.py b/nemo/collections/nlp/data/machine_translation/machine_translation_dataset.py
index efb0cef86f4e..568ae6a7d684 100644
--- a/nemo/collections/nlp/data/machine_translation/machine_translation_dataset.py
+++ b/nemo/collections/nlp/data/machine_translation/machine_translation_dataset.py
@@ -164,8 +164,8 @@ def pad_batches(self, src_ids, tgt_ids, batch_indices):
         for batch_idx, b in enumerate(batch_indices):
             src_len = max([len(src_ids[i]) for i in b])
             tgt_len = max([len(tgt_ids[i]) for i in b])
-            src_ids_ = self.src_pad_id * np.ones((len(b), src_len), dtype=np.int)
-            tgt_ids_ = self.tgt_pad_id * np.ones((len(b), tgt_len), dtype=np.int)
+            src_ids_ = self.src_pad_id * np.ones((len(b), src_len), dtype=np.int64)
+            tgt_ids_ = self.tgt_pad_id * np.ones((len(b), tgt_len), dtype=np.int64)
             for i, sentence_idx in enumerate(b):
                 src_ids_[i][: len(src_ids[sentence_idx])] = src_ids[sentence_idx]
                 tgt_ids_[i][: len(tgt_ids[sentence_idx])] = tgt_ids[sentence_idx]
diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
index cfc65b090b98..d82ee36a8833 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py
@@ -1647,7 +1647,7 @@ def _form_batches(
           - ``'input_mask'``: a boolean numpy array;
           - ``'loss_mask'``: a boolean numpy array.
         If ``waveforms`` is not ``None``, then a batch also contain items
-          - ``features``: a ``np.float`` numpy array.
+          - ``features``: a ``np.float64`` numpy array.
           - ``features_length`` a ``np.int32`` numpy array.
         If ``audio_filepaths`` is not ``None``, then a natch also contain items
          - ``audio_filepaths`` a list of strings.
@@ -1677,7 +1677,7 @@ def _form_batches(
                 "capit_labels": item[3].astype(np.int64),
             }
             if self.use_audio and self.preload_audios:
-                batch['features'] = item[4].astype(np.float)
+                batch['features'] = item[4].astype(np.float64)
                 batch['features_length'] = item[5]
             elif self.use_audio and not self.preload_audios:
                 batch['audio_filepaths'] = item[6]
@@ -1730,7 +1730,7 @@ def _pack_into_batches(
          - ``'input_mask'``: a boolean numpy array;
          - ``'loss_mask'``: a boolean numpy array.
         If ``waveforms`` is not ``None``, then a batch also contain items
-          - ``features``: a ``np.float`` numpy array.
+          - ``features``: a ``np.float64`` numpy array.
           - ``features_length`` a ``np.int32`` numpy array.
         If ``audio_filepaths`` is not ``None``, then a natch also contain items
          - ``audio_filepaths`` a list of strings.
@@ -1785,7 +1785,7 @@ def _pack_into_batches(
             if self.use_audio and self.preload_audios:
                 batch['features'] = pad(
                     waveforms[start : start + size], max(audio_lengths[start : start + size]), 0.0
-                ).astype(np.float)
+                ).astype(np.float64)
                 batch['features_length'] = audio_lengths[start : start + size]
             elif self.use_audio and not self.preload_audios:
                 batch['audio_filepaths'] = audio_filepaths[start : start + size]
@@ -1993,8 +1993,8 @@ def __getitem__(self, idx: int) -> Dict[str, np.ndarray]:
             computed for corresponding token. See more in description of constructor parameters
             ``ignore_start_end``, ``ignore_extra_tokens`` (if ``self.add_masks_and_segment_ids_to_batch``
             is ``False``, then these items is missing).
-          - ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty.
-          - ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.long` array of number of samples per audio.
+          - ``'features'`` (:obj:`numpy.ndarray`) :obj:`np.float64` array of waveforms of audio if ``self.preload_audio`` is set to ``True`` else empty.
+          - ``'features_length'`` (:obj:`numpy.ndarray`) :obj:`np.longlong` array of number of samples per audio.
           - ``'audio_filepaths'`` (:obj:`List`) :obj:`str` contains paths of audio files if ``self.preload_audio`` set to ``False``
         """
         return self.batches[idx]
diff --git a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py
index 570c672f405e..8439a90e2eae 100644
--- a/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/punctuation_capitalization_infer_dataset.py
@@ -460,6 +460,6 @@ def __getitem__(
             self.all_query_ids[idx],
             self.all_is_first[idx],
             self.all_is_last[idx],
-            np.array(self.all_audio_queries[idx], dtype=np.float),
+            np.array(self.all_audio_queries[idx], dtype=np.float64),
             self.all_audio_lengths[idx],
         )
diff --git a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
index dd6523740191..4f49e34ce24e 100644
--- a/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
+++ b/nemo/collections/nlp/data/token_classification/token_classification_dataset.py
@@ -300,7 +300,7 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_subtokens_mask[idx]),
             np.array(self.all_loss_mask[idx]),
             np.array(self.all_labels[idx]),
@@ -348,6 +348,6 @@ def __getitem__(self, idx):
         return (
             np.array(self.all_input_ids[idx]),
             np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
+            np.array(self.all_input_mask[idx], dtype=np.longlong),
             np.array(self.all_subtokens_mask[idx]),
         )
diff --git a/nemo/collections/vision/data/megatron/autoaugment.py b/nemo/collections/vision/data/megatron/autoaugment.py
index 4048e0d78472..932b37553fea 100644
--- a/nemo/collections/vision/data/megatron/autoaugment.py
+++ b/nemo/collections/vision/data/megatron/autoaugment.py
@@ -160,7 +160,7 @@ def __init__(
             "translateY": np.linspace(0, 150 / 331, num_levels),
             "rotate": np.linspace(0, 30, num_levels),
             "color": np.linspace(0.0, 0.9, num_levels),
-            "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(np.int),
+            "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(np.int64),
             "solarize": np.linspace(256, 0, num_levels),  # range [0, 256]
             "contrast": np.linspace(0.0, 0.9, num_levels),
             "sharpness": np.linspace(0.0, 0.9, num_levels),
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 05b4531ff083..8d3441ff32df 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,6 @@
 huggingface_hub
 numba
-numpy>=1.22,<1.24
+numpy>=1.22
 onnx>=1.7.0
 python-dateutil
 ruamel.yaml
diff --git a/requirements/requirements_common.txt b/requirements/requirements_common.txt
index 8410d7f0c29d..4d4d076c8a92 100644
--- a/requirements/requirements_common.txt
+++ b/requirements/requirements_common.txt
@@ -1,7 +1,6 @@
 datasets
 inflect
 pandas
-pydantic<2 # remove after inflect supports Pydantic 2.0+
 sacremoses>=0.0.43
 sentencepiece<1.0.0
 youtokentome>=1.0.5
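
Every dtype edit in this patch follows one pattern: the old NumPy scalar aliases np.long, np.float, and np.int were deprecated and then removed in NumPy 1.24, which the relaxed numpy>=1.22 pin in requirements.txt now permits, so each use is replaced by an explicit dtype. Below is a minimal before/after sketch of that mapping; it is not taken from the NeMo sources, and the variable names are illustrative.

    # Minimal sketch (not NeMo code) of the alias-to-explicit-dtype
    # migration applied throughout this patch. On NumPy >= 1.24 the
    # aliases np.long, np.float, and np.int raise AttributeError.
    import numpy as np

    # Before (fails on NumPy >= 1.24):
    #   input_mask = np.array([1, 0, 1], dtype=np.long)
    #   time_steps = np.arange(0, 8, 0.5, dtype=np.float)
    #   batch = np.ones((2, 4), dtype=np.int)

    # After, using the replacements adopted by this patch:
    input_mask = np.array([1, 0, 1], dtype=np.longlong)  # C long long, at least 64 bits
    time_steps = np.arange(0, 8, 0.5, dtype=np.float64)  # explicit 64-bit float
    batch = np.ones((2, 4), dtype=np.int64)              # explicit 64-bit integer

    assert input_mask.dtype == np.longlong
    assert time_steps.dtype == np.float64
    assert batch.dtype == np.int64

One behavioral note: np.float was an alias for Python's float (so arrays were already float64), but np.int and np.long aliased Python's int, which NumPy mapped to the platform C long (32-bit on Windows); np.int64 and np.longlong are therefore a slight widening on Windows rather than a strictly identical substitution.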