diff --git a/env/cpu/py2.yml b/env/cpu/py2.yml deleted file mode 100644 index a41131b0e2..0000000000 --- a/env/cpu/py2.yml +++ /dev/null @@ -1,21 +0,0 @@ -channels: - - conda-forge -dependencies: - - python=2.7 - - pip=18.1 - - perl - - pylint=1.9.2 - - flake8 - - sphinx=1.7.7 - - spacy - - nltk - - pytest=4.5.0 - - flaky=3.5.3 - - pytest-cov=2.7.1 - - mock<3 - - pytest-xdist<2 - - regex - - pip: - - pylint-quotes<0.2 - - mxnet-mkl>=1.4.1 - - sentencepiece<0.2 diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 758078eb95..e112cded68 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -4,7 +4,7 @@ dependencies: - python=3.6 - pip=18.1 - perl - - pylint=1.9.2 + - pylint=2.3.1 - flake8 - sphinx=1.7.7 - spacy @@ -16,7 +16,7 @@ dependencies: - pytest-xdist<2 - regex - pip: - - pylint-quotes<0.2 + - pylint-quotes==0.2.1 - mxnet-mkl>=1.5.0b20190407 - sacremoses - sentencepiece<0.2 diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index e3e14cc31b..35701d0afa 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -4,7 +4,7 @@ dependencies: - python=3.6 - pip=18.1 - perl - - pylint=1.9.2 + - pylint=2.3.1 - flake8 - sphinx=1.7.7 - spacy @@ -26,7 +26,7 @@ dependencies: - ipykernel - regex - pip: - - pylint-quotes<0.2 + - pylint-quotes==0.2.1 - mxnet-mkl>=1.4.1 - sacremoses - sentencepiece<0.2 diff --git a/env/gpu/py2.yml b/env/gpu/py2.yml deleted file mode 100644 index 2e8b04bb4a..0000000000 --- a/env/gpu/py2.yml +++ /dev/null @@ -1,21 +0,0 @@ -channels: - - conda-forge -dependencies: - - python=2.7 - - pip=18.1 - - perl - - pylint=1.9.2 - - flake8 - - sphinx=1.7.7 - - spacy - - nltk - - pytest=4.5.0 - - flaky=3.5.3 - - pytest-cov=2.7.1 - - mock<3 - - pytest-xdist<2 - - regex - - pip: - - pylint-quotes<0.2 - - mxnet-cu92mkl>=1.4.1 - - sentencepiece<0.2 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index 84669b83d1..e5e1bfef18 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -4,7 +4,7 @@ dependencies: - python=3.6 - pip=18.1 - perl - - pylint=1.9.2 + - pylint=2.3.1 - flake8 - sphinx=1.7.7 - spacy diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 536d1ec51f..6643387782 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -4,7 +4,7 @@ dependencies: - python=3.6 - pip=18.1 - perl - - pylint=1.9.2 + - pylint=2.3.1 - flake8 - sphinx=1.7.7 - spacy @@ -26,7 +26,7 @@ dependencies: - ipykernel - regex - pip: - - pylint-quotes<0.2 + - pylint-quotes==0.2.1 - mxnet-cu92mkl>=1.4.1 - sacremoses - sentencepiece<0.2 diff --git a/scripts/bert/create_pretraining_data.py b/scripts/bert/create_pretraining_data.py index f1230e4608..6d81cdcbe0 100644 --- a/scripts/bert/create_pretraining_data.py +++ b/scripts/bert/create_pretraining_data.py @@ -33,7 +33,7 @@ from gluonnlp.data import BERTTokenizer -class TrainingInstance(object): +class TrainingInstance: """A single training instance (sentence pair).""" def __init__(self, tokens, segment_ids, masked_lm_positions, diff --git a/scripts/bert/data/classification.py b/scripts/bert/data/classification.py index 3a8443a8b7..5549e06522 100644 --- a/scripts/bert/data/classification.py +++ b/scripts/bert/data/classification.py @@ -30,7 +30,7 @@ from baidu_ernie_data import BaiduErnieXNLI, BaiduErnieLCQMC, BaiduErnieChnSentiCorp -class GlueTask(object): +class GlueTask: """Abstract GLUE task class. Parameters diff --git a/scripts/bert/data/dataloader.py b/scripts/bert/data/dataloader.py index 2bb890ffca..594955908c 100644 --- a/scripts/bert/data/dataloader.py +++ b/scripts/bert/data/dataloader.py @@ -24,7 +24,7 @@ import multiprocessing from gluonnlp.data.stream import _PathDataset -class DatasetFn(object): +class DatasetFn: """Callable object to generate a gluon.data.Dataset given a url. Subclasses should override the __call__ method. @@ -32,7 +32,7 @@ class DatasetFn(object): def __call__(self, dataset_url): raise NotImplementedError -class SamplerFn(object): +class SamplerFn: """Callable object to generate a gluon.data.sampler.Sampler given a dataset. Subclasses should override the __call__ method. @@ -40,7 +40,7 @@ class SamplerFn(object): def __call__(self, dataset): raise NotImplementedError -class DataLoaderFn(object): +class DataLoaderFn: """Callable object to generate a DataLoader object given a dataset and sampler. Subclasses should override the __call__ method. @@ -48,7 +48,7 @@ class DataLoaderFn(object): def __call__(self, dataset, sampler): raise NotImplementedError -class SimpleDataLoaderFn(object): +class SimpleDataLoaderFn: """A simple callable object that geneartes a data loader by applying dataloader_cls(dataset, batch_sampler=sampler, **dataset_params) """ @@ -77,7 +77,7 @@ def _worker_fn(url, dataset_fn, sampler_fn): sampler = sampler_fn(dataset) return (dataset, sampler) -class _MultiWorkerIter(object): +class _MultiWorkerIter: """Internal multi-worker iterator for DataLoader.""" def __init__(self, worker_pool, worker_fn, dataset, file_sampler, dataset_fn, sampler_fn, dataloader_fn, prefetch): @@ -165,7 +165,7 @@ def __iter__(self): return self -class DatasetLoader(object): +class DatasetLoader: """Loads data from a list of datasets and returns mini-batches of data. One dataset is loaded at a time. diff --git a/scripts/bert/data/ner.py b/scripts/bert/data/ner.py index 0de373db04..801d2d85c5 100644 --- a/scripts/bert/data/ner.py +++ b/scripts/bert/data/ner.py @@ -188,7 +188,7 @@ def load_segment(file_path, bert_tokenizer): return subword_sentences -class BERTTaggingDataset(object): +class BERTTaggingDataset: """ Parameters diff --git a/scripts/bert/data/qa.py b/scripts/bert/data/qa.py index 4e335c756e..27ae89397b 100644 --- a/scripts/bert/data/qa.py +++ b/scripts/bert/data/qa.py @@ -23,7 +23,7 @@ __all__ = ['SQuADTransform', 'preprocess_dataset'] -class SquadExample(object): +class SquadExample: """A single training/test example for SQuAD question. For examples without an answer, the start and end position are -1. @@ -86,7 +86,7 @@ def preprocess_dataset(dataset, transform, num_workers=8): return dataset, dataset_len -class SQuADFeature(object): +class SQuADFeature: """Single feature of a single example transform of the SQuAD question. """ @@ -120,7 +120,7 @@ def __init__(self, self.is_impossible = is_impossible -class SQuADTransform(object): +class SQuADTransform: """Dataset Transformation for BERT-style QA. The transformation is processed in the following steps: diff --git a/scripts/bert/data/transform.py b/scripts/bert/data/transform.py index 35966b017b..07acfeaea1 100644 --- a/scripts/bert/data/transform.py +++ b/scripts/bert/data/transform.py @@ -21,7 +21,7 @@ import numpy as np from gluonnlp.data import BERTSentenceTransform -class BERTDatasetTransform(object): +class BERTDatasetTransform: """Dataset transformation for BERT-style sentence classification or regression. Parameters diff --git a/scripts/bert/embedding.py b/scripts/bert/embedding.py index ad7ed41c0a..f680583a3a 100644 --- a/scripts/bert/embedding.py +++ b/scripts/bert/embedding.py @@ -55,7 +55,7 @@ def to_unicode(s): logger = logging.getLogger(__name__) -class BertEmbedding(object): +class BertEmbedding: """ Encoding from BERT model. diff --git a/scripts/bert/fp16_utils.py b/scripts/bert/fp16_utils.py index b7651adc46..ec5a15d121 100644 --- a/scripts/bert/fp16_utils.py +++ b/scripts/bert/fp16_utils.py @@ -107,7 +107,7 @@ def group_by_ctx(arr_list): return total_norm, chosen_scale, is_finite -class FP16Trainer(object): +class FP16Trainer: """ Trainer for mixed precision training. Parameters @@ -182,7 +182,7 @@ def step(self, batch_size, max_norm=None): # update scale based on overflow information self._scaler.update_scale(overflow) -class LossScaler(object): +class LossScaler: """Abstract loss scaler""" def has_overflow(self, params): """ detect inf and nan """ @@ -208,7 +208,6 @@ def __init__(self, init_scale=1): def update_scale(self, overflow): """update loss scale""" - pass class DynamicLossScaler(LossScaler): """Class that manages dynamic loss scaling. diff --git a/scripts/bert/pretraining_utils.py b/scripts/bert/pretraining_utils.py index 6c2ec1c9b2..c5965d684c 100644 --- a/scripts/bert/pretraining_utils.py +++ b/scripts/bert/pretraining_utils.py @@ -24,6 +24,7 @@ import argparse import random import multiprocessing +import functools import numpy as np @@ -258,7 +259,7 @@ def __call__(self, dataset, sampler): num_workers=self._num_ctxes) return dataloader -class BERTLoaderTransform(object): +class BERTLoaderTransform: """Create dataloader for a BERT dataset. """ def __init__(self, use_avg_len, batch_size, shuffle, num_ctxes, num_buckets): @@ -281,7 +282,8 @@ def get_pretrain_data_npz(data, batch_size, num_ctxes, shuffle, use_avg_len, 'Number of training files must be greater than the number of partitions. ' \ 'Only found %d files at %s'%(num_files, data) split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts, part_index=part_idx) - stream = nlp.data.SimpleDatasetStream(nlp.data.NumpyDataset, data, split_sampler) + NumpyDataset = functools.partial(nlp.data.NumpyDataset, allow_pickle=True) + stream = nlp.data.SimpleDatasetStream(NumpyDataset, data, split_sampler) stream = nlp.data.PrefetchingStream(stream, worker_type='process') # create data loader based on the dataset diff --git a/scripts/machine_translation/bleu.py b/scripts/machine_translation/bleu.py index a54afc526a..6312afb80e 100644 --- a/scripts/machine_translation/bleu.py +++ b/scripts/machine_translation/bleu.py @@ -110,7 +110,7 @@ def _tokenize_mteval_13a(segment): return norm -class UnicodeRegex(object): +class UnicodeRegex: """Ad-hoc hack to recognize all punctuation and symbols. """ def __init__(self): diff --git a/scripts/machine_translation/dataprocessor.py b/scripts/machine_translation/dataprocessor.py index 86f047ff8e..44a032fec5 100644 --- a/scripts/machine_translation/dataprocessor.py +++ b/scripts/machine_translation/dataprocessor.py @@ -65,7 +65,7 @@ def _load_cached_dataset(prefix): return None -class TrainValDataTransform(object): +class TrainValDataTransform: """Transform the machine translation dataset. Clip source and the target sentences to the maximum length. For the source sentence, append the @@ -176,7 +176,7 @@ def load_translation_data(dataset, bleu, args): fetch_tgt_sentence = lambda src, tgt: tgt.split() val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence)) test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence)) - elif bleu == '13a' or bleu == 'intl': + elif bleu in ('13a', 'intl'): fetch_tgt_sentence = lambda src, tgt: tgt if dataset == 'WMT2016BPE': val_text = nlp.data.WMT2016('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) @@ -185,7 +185,7 @@ def load_translation_data(dataset, bleu, args): val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang) test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang, full=args.full) - elif dataset == 'IWSLT2015' or dataset == 'TOY': + elif dataset in ('IWSLT2015', 'TOY'): val_text = data_val test_text = data_test else: diff --git a/scripts/machine_translation/translation.py b/scripts/machine_translation/translation.py index 8cb4271cf3..b3d0e97e00 100644 --- a/scripts/machine_translation/translation.py +++ b/scripts/machine_translation/translation.py @@ -25,7 +25,7 @@ import mxnet as mx from gluonnlp.model import BeamSearchScorer, BeamSearchSampler -class BeamSearchTranslator(object): +class BeamSearchTranslator: """Beam Search Translator Parameters diff --git a/scripts/parsing/common/data.py b/scripts/parsing/common/data.py index 2032ebbff5..fb9bbbd3f6 100644 --- a/scripts/parsing/common/data.py +++ b/scripts/parsing/common/data.py @@ -27,7 +27,7 @@ from .savable import Savable -class ConllWord(object): +class ConllWord: """CoNLL format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf Parameters @@ -76,7 +76,7 @@ def __str__(self): return '\t'.join(['_' if v is None else v for v in values]) -class ConllSentence(object): +class ConllSentence: """A list of ConllWord Parameters @@ -365,7 +365,7 @@ def rel_size(self): return len(self._id2rel) -class DataLoader(object): +class DataLoader: """ Load CoNLL data Adopted from https://github.com/jcyk/Dynet-Biaffine-dependency-parser with some modifications diff --git a/scripts/parsing/common/k_means.py b/scripts/parsing/common/k_means.py index 2d8d134ec4..632cfe3481 100755 --- a/scripts/parsing/common/k_means.py +++ b/scripts/parsing/common/k_means.py @@ -21,7 +21,7 @@ import numpy as np -class KMeans(object): +class KMeans: """ Cluster sentences by their lengths @@ -101,7 +101,6 @@ def __init__(self, k, len_cntr): # print('%d) Final splits: %s; Final mass: %d' % (i, self._splits, self.get_mass())) self._reindex() - return def _recenter(self): """ diff --git a/scripts/parsing/common/savable.py b/scripts/parsing/common/savable.py index 29179f1b8c..52bf88814c 100644 --- a/scripts/parsing/common/savable.py +++ b/scripts/parsing/common/savable.py @@ -21,7 +21,7 @@ import pickle -class Savable(object): +class Savable: """ A super class for save/load operations. """ diff --git a/scripts/parsing/common/tarjan.py b/scripts/parsing/common/tarjan.py index 9fe296d715..75a98c6f33 100755 --- a/scripts/parsing/common/tarjan.py +++ b/scripts/parsing/common/tarjan.py @@ -78,7 +78,6 @@ def strongconnect(self, v, index, stack): w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) - return # ====================== @property diff --git a/scripts/parsing/common/utils.py b/scripts/parsing/common/utils.py index 749502ae67..6d94493112 100644 --- a/scripts/parsing/common/utils.py +++ b/scripts/parsing/common/utils.py @@ -33,7 +33,7 @@ from .tarjan import Tarjan -class Progbar(object): +class Progbar: """Progbar class copied from keras (https://github.com/fchollet/keras/) Displays a progress bar. diff --git a/scripts/parsing/parser/dep_parser.py b/scripts/parsing/parser/dep_parser.py index 053aea3c98..73a9b5d3ac 100644 --- a/scripts/parsing/parser/dep_parser.py +++ b/scripts/parsing/parser/dep_parser.py @@ -33,7 +33,7 @@ from scripts.parsing.parser.evaluate import evaluate_official_script -class DepParser(object): +class DepParser: """User interfaces for biaffine dependency parser. It wraps a biaffine model inside, provides training, evaluating and parsing. diff --git a/scripts/question_answering/data_pipeline.py b/scripts/question_answering/data_pipeline.py index b454333569..0cd0b8b6ba 100644 --- a/scripts/question_answering/data_pipeline.py +++ b/scripts/question_answering/data_pipeline.py @@ -38,7 +38,7 @@ from gluonnlp.data import SQuAD -class SQuADDataPipeline(object): +class SQuADDataPipeline: """Main data processing pipeline class, which encapsulate all preprocessing logic. The class process the data in multiprocessing mode using Pool. It can save/load the result of processing, but since it happens in a single thread, it is usually faster to just process data from scratch. @@ -432,7 +432,7 @@ def _partition(mapped_values): return partitioned_data.items() -class SQuADDataTokenizer(object): +class SQuADDataTokenizer: """SQuAD data tokenizer, that encapsulate the splitting logic of each entry of SQuAD dataset""" spacy_tokenizer = nlp.data.SpacyTokenizer() @@ -574,7 +574,7 @@ def _get_token_spans(text, tokens): return spans -class SQuADDataFilter(object): +class SQuADDataFilter: """Filter an example based on the specified conditions""" def __init__(self, para_limit, ques_limit, ans_limit): @@ -612,7 +612,7 @@ def filter(self, example): (example['y2s'][0] - example['y1s'][0]) <= self._ans_limit -class SQuADAsyncVocabMapper(object): +class SQuADAsyncVocabMapper: """A multiprocessing implementation of a Mapper for tokens counting""" def __init__(self, iterate_over_example=False): @@ -663,7 +663,7 @@ def __call__(self, example): return list(counter.items()) -class SQuADAsyncVocabReducer(object): +class SQuADAsyncVocabReducer: """A multiprocessing implementation of a Reducing for tokens counting""" def run_async(self, items, pool): @@ -701,7 +701,7 @@ def __call__(self, item): return token, sum(counts) -class SQuADDataFeaturizer(object): +class SQuADDataFeaturizer: """Class that converts tokenized examples into featurized""" def __init__(self, word_vocab, char_vocab, para_limit, ques_limit, char_limit, @@ -892,7 +892,7 @@ def get_record_by_idx(self, rec_idx): return self._record_idx_to_record[rec_idx]['rec'] -class SQuADDataLoaderTransformer(object): +class SQuADDataLoaderTransformer: """Thin wrapper on SQuADQADataset that removed non-numeric values from the record. The output of that transformer can be provided to a DataLoader""" diff --git a/scripts/sentiment_analysis/process_data.py b/scripts/sentiment_analysis/process_data.py index 962935b073..dd9ec892b7 100644 --- a/scripts/sentiment_analysis/process_data.py +++ b/scripts/sentiment_analysis/process_data.py @@ -61,7 +61,7 @@ def _load_file(data_name): def _clean_str(string, data_name): - if data_name == 'SST-1' or data_name == 'SST-2': + if data_name in ('SST-1', 'SST-2'): string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string) string = re.sub(r'\s{2,}', ' ', string) return string.strip().lower() @@ -122,7 +122,7 @@ def _preprocess_dataset(dataset, vocab, max_len): def load_dataset(data_name): """Load sentiment dataset.""" - if data_name == 'MR' or data_name == 'Subj' or data_name == 'CR' or data_name == 'MPQA': + if data_name in ('MR', 'Subj', 'CR', 'MPQA'): train_dataset, output_size = _load_file(data_name) vocab, max_len = _build_vocab(data_name, train_dataset, [], []) train_dataset, train_data_lengths = _preprocess_dataset(train_dataset, vocab, max_len) diff --git a/scripts/sentiment_analysis/text_cnn.py b/scripts/sentiment_analysis/text_cnn.py index 662ec44f77..96098890ae 100644 --- a/scripts/sentiment_analysis/text_cnn.py +++ b/scripts/sentiment_analysis/text_cnn.py @@ -71,7 +71,7 @@ def init(textCNN, vocab, model_mode, context): textCNN.embedding.weight.set_data(vocab.embedding.idx_to_vec) if model_mode == 'multichannel': textCNN.embedding_extend.weight.set_data(vocab.embedding.idx_to_vec) - if model_mode == 'static' or model_mode == 'multichannel': + if model_mode in ('static', 'multichannel'): # Parameters of textCNN.embedding are not updated during training. textCNN.embedding.collect_params().setattr('grad_req', 'null') trainer = gluon.Trainer(textCNN.collect_params(), 'adadelta', {'rho':0.95, 'clip_gradient':3}) diff --git a/scripts/text_generation/sequence_sampling.py b/scripts/text_generation/sequence_sampling.py index 2039c77d03..2373f20767 100644 --- a/scripts/text_generation/sequence_sampling.py +++ b/scripts/text_generation/sequence_sampling.py @@ -86,7 +86,7 @@ # Define the decoder function, we use log_softmax to map the output scores to log-likelihoods # Also, we transform the layout to NTC -class LMDecoder(object): +class LMDecoder: def __init__(self, net): self.net = net diff --git a/src/gluonnlp/base.py b/src/gluonnlp/base.py index 7196175699..a34c55a24d 100644 --- a/src/gluonnlp/base.py +++ b/src/gluonnlp/base.py @@ -45,9 +45,9 @@ def numba_jitclass(spec): # pylint: disable=unused-argument return identity - class NumbaTypes(object): + class NumbaTypes: """Shim for numba.types""" - class NumbaType(object): + class NumbaType: """Shim for numba.types.type""" def __getitem__(self, x): # pylint: disable=unused-argument diff --git a/src/gluonnlp/data/batchify/batchify.py b/src/gluonnlp/data/batchify/batchify.py index dad5e5e118..c18728224d 100644 --- a/src/gluonnlp/data/batchify/batchify.py +++ b/src/gluonnlp/data/batchify/batchify.py @@ -93,7 +93,7 @@ def _stack_arrs(arrs, use_shared_mem, dtype): return mx.nd.array(out, dtype=dtype) -class Stack(object): +class Stack: r"""Stack the input data samples to construct the batch. The N input samples must have the same shape/length and will be stacked to construct a batch. @@ -157,7 +157,7 @@ def __call__(self, data): return _stack_arrs(data, True, self._dtype) -class Pad(object): +class Pad: """Return a callable that pads and stacks data. Parameters @@ -275,7 +275,7 @@ def __call__(self, data): raise NotImplementedError -class Tuple(object): +class Tuple: """Wrap multiple batchify functions together. The input functions will be applied to the corresponding input fields. @@ -342,7 +342,7 @@ def __call__(self, data): ret.append(ele_fn([ele[i] for ele in data])) return tuple(ret) -class List(object): +class List: """Simply forward the list of input data. This is particularly useful when the Dataset contains textual data diff --git a/src/gluonnlp/data/batchify/embedding.py b/src/gluonnlp/data/batchify/embedding.py index bd6d247616..22128cfdc5 100644 --- a/src/gluonnlp/data/batchify/embedding.py +++ b/src/gluonnlp/data/batchify/embedding.py @@ -39,7 +39,7 @@ def numba_njit(func): return func -class EmbeddingCenterContextBatchify(object): +class EmbeddingCenterContextBatchify: """Helper to create batches of center and contexts words. Batches are created lazily on a optionally shuffled version of the Dataset. diff --git a/src/gluonnlp/data/batchify/language_model.py b/src/gluonnlp/data/batchify/language_model.py index f908b1f6ef..8e3a4286cd 100644 --- a/src/gluonnlp/data/batchify/language_model.py +++ b/src/gluonnlp/data/batchify/language_model.py @@ -32,7 +32,7 @@ from ..utils import slice_sequence, _slice_pad_length from ..stream import DataStream -class CorpusBatchify(object): +class CorpusBatchify: """Transform the dataset into N independent sequences, where N is the batch size. Parameters @@ -70,7 +70,7 @@ def __call__(self, data): self._batch_size, -1).T) -class CorpusBPTTBatchify(object): +class CorpusBPTTBatchify: """Transform the dataset into batches of numericalized samples, in the way that the recurrent states from last batch connects with the current batch for each sample. @@ -154,7 +154,7 @@ def _split_data_label(x): return x[:-1, :], x[1:, :] -class StreamBPTTBatchify(object): +class StreamBPTTBatchify: """Transform a Stream of CorpusDataset to BPTT batches. The corpus is transformed into batches of numericalized samples, in the way that the diff --git a/src/gluonnlp/data/dataloader.py b/src/gluonnlp/data/dataloader.py index d8c83eaad6..edb213af45 100644 --- a/src/gluonnlp/data/dataloader.py +++ b/src/gluonnlp/data/dataloader.py @@ -59,7 +59,7 @@ def _thread_worker_fn(samples, batchify_fn, dataset): batch = batchify_fn([dataset[i] for i in samples]) return batch -class _MultiWorkerIter(object): +class _MultiWorkerIter: """Internal multi-worker iterator for DataLoader.""" def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False, worker_fn=_worker_fn, prefetch=0, dataset=None): @@ -112,7 +112,7 @@ def __iter__(self): return self -class ShardedDataLoader(object): +class ShardedDataLoader: """Loads data from a dataset and returns mini-batches of data. Parameters diff --git a/src/gluonnlp/data/dataset.py b/src/gluonnlp/data/dataset.py index cab6601ea3..6a67557f0d 100644 --- a/src/gluonnlp/data/dataset.py +++ b/src/gluonnlp/data/dataset.py @@ -253,14 +253,16 @@ class NumpyDataset(ArrayDataset): ---------- filename : str Path to the .npy or .npz file. + kwargs + Keyword arguments are passed to np.load. Properties ---------- keys: list of str or None The list of keys loaded from the .npz file. """ - def __init__(self, filename): - arrs = np.load(filename) + def __init__(self, filename, **kwargs): + arrs = np.load(filename, **kwargs) keys = None data = [] if filename.endswith('.npy'): diff --git a/src/gluonnlp/data/glue.py b/src/gluonnlp/data/glue.py index 8cfbc170d8..c7408da050 100644 --- a/src/gluonnlp/data/glue.py +++ b/src/gluonnlp/data/glue.py @@ -657,7 +657,7 @@ def _generate(self, segment): download(self._repo_dir() + raw_name, path=raw_path, sha1_hash=raw_hash) data_path = os.path.join(self._root, data_name) - if segment == 'train' or segment == 'dev': + if segment in ('train', 'dev'): if os.path.isfile(data_path) and check_sha1(data_path, data_hash): return diff --git a/src/gluonnlp/data/sampler.py b/src/gluonnlp/data/sampler.py index 73d008d0ef..6f839c71ec 100644 --- a/src/gluonnlp/data/sampler.py +++ b/src/gluonnlp/data/sampler.py @@ -71,7 +71,7 @@ def _bucket_stats(bucket_sample_ids, seq_lengths): return (bucket_average_lengths, bucket_length_stds) -class BucketScheme(object): +class BucketScheme: r"""Base class for generating bucket keys.""" def __call__(self, max_lengths, min_lengths, num_buckets): """Generate bucket keys based on the lengths of sequences and number of buckets. diff --git a/src/gluonnlp/data/stream.py b/src/gluonnlp/data/stream.py index ffb847d1fb..68150c207e 100644 --- a/src/gluonnlp/data/stream.py +++ b/src/gluonnlp/data/stream.py @@ -47,7 +47,7 @@ 'DataStream', 'SimpleDataStream', 'DatasetStream', 'SimpleDatasetStream', 'PrefetchingStream'] -class DataStream(object): +class DataStream: """Abstract Data Stream Interface. DataStreams are useful to avoid loading big datasets to memory. A @@ -232,7 +232,7 @@ def __iter__(self): yield self._dataset(filename, **self._kwargs) -class _Prefetcher(object): +class _Prefetcher: """Internal shared prefetcher logic.""" _dataq = None # Data queue transmits prefetched elements _controlq = None # Control queue to instruct thread / process shutdown @@ -303,8 +303,7 @@ def __next__(self): self._controlq.put(None) if isinstance(next_error[0], StopIteration): raise StopIteration - else: - return self._reraise(*next_error) + return self._reraise(*next_error) def _reraise(self, e, tb): print('Reraising exception from Prefetcher', file=sys.stderr) diff --git a/src/gluonnlp/data/transforms.py b/src/gluonnlp/data/transforms.py index edacadcbfd..fbf6bec31c 100644 --- a/src/gluonnlp/data/transforms.py +++ b/src/gluonnlp/data/transforms.py @@ -49,7 +49,7 @@ from ..base import get_home_dir -class ClipSequence(object): +class ClipSequence: """Clip the sequence to have length no more than `length`. Parameters @@ -82,7 +82,7 @@ def __call__(self, sample): return sample[:min(len(sample), self._length)] -class PadSequence(object): +class PadSequence: """Pad the sequence. Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set, @@ -157,7 +157,7 @@ def __call__(self, sample): 'mxnet.NDArray, received type=%s' % str(type(sample))) -class NLTKMosesTokenizer(object): +class NLTKMosesTokenizer: """Apply the Moses Tokenizer implemented in NLTK. Users of this class are required to install `NLTK `_ @@ -219,7 +219,7 @@ def __call__(self, sample, return_str=False): return self._tokenizer.tokenize(sample, return_str=return_str) -class SacreMosesTokenizer(object): +class SacreMosesTokenizer: """Apply the Moses Tokenizer implemented in sacremoses. Users of this class are required to install @@ -285,7 +285,7 @@ def __call__(self, sample, return_str=False): return self._tokenizer.tokenize(sample, return_str=return_str) -class SpacyTokenizer(object): +class SpacyTokenizer: """Apply the Spacy Tokenizer. Users of this class are required to install `spaCy `_ @@ -348,7 +348,7 @@ def __call__(self, sample): return [tok.text for tok in self._nlp(sample)] -class NLTKMosesDetokenizer(object): +class NLTKMosesDetokenizer: r"""Apply the Moses Detokenizer implemented in NLTK. Users of this class are required to `install NLTK `_ @@ -410,7 +410,7 @@ def __call__(self, sample, return_str=False): return self._detokenizer.detokenize(sample, return_str=return_str) -class SacreMosesDetokenizer(object): +class SacreMosesDetokenizer: r"""Apply the Moses Detokenizer implemented in sacremoses. Users of this class are required to `install sacremoses @@ -490,7 +490,7 @@ def __call__(self, sample, return_str=None): return self._detokenizer.detokenize(sample, return_str=ret_str) -class JiebaTokenizer(object): +class JiebaTokenizer: r"""Apply the jieba Tokenizer. Users of this class are required to `install jieba `_ @@ -537,11 +537,11 @@ def __call__(self, sample): # we use default cutting mode provided by jieba, i.e., accurate mode return [ tok for tok in self._tokenizer.cut(sample) - if tok != ' ' and tok != '' + if tok not in (' ', '') ] -class NLTKStanfordSegmenter(object): +class NLTKStanfordSegmenter: r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK. Users of this class are required to install Java, NLTK and download Stanford Word Segmenter @@ -651,7 +651,7 @@ def __call__(self, sample): return [tok for tok in self._tokenizer.segment(sample).strip().split()] -class _SentencepieceProcessor(object): +class _SentencepieceProcessor: def __init__(self, path): try: import sentencepiece @@ -774,7 +774,7 @@ def __call__(self, sample): return self._processor.DecodePieces(sample) -class BERTBasicTokenizer(object): +class BERTBasicTokenizer: r"""Runs basic tokenization performs invalid character removal (e.g. control chars) and whitespace. @@ -885,13 +885,10 @@ def _is_chinese_char(self, cp): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) - or (cp >= 0x20000 and cp <= 0x2A6DF) - or (cp >= 0x2A700 and cp <= 0x2B73F) - or (cp >= 0x2B740 and cp <= 0x2B81F) - or (cp >= 0x2B820 and cp <= 0x2CEAF) - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F)): + if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) or (0x20000 <= cp <= 0x2A6DF) + or (0x2A700 <= cp <= 0x2B73F) or (0x2B740 <= cp <= 0x2B81F) + or (0x2B820 <= cp <= 0x2CEAF) or (0xF900 <= cp <= 0xFAFF) + or (0x2F800 <= cp <= 0x2FA1F)): return True return False @@ -934,10 +931,10 @@ def _is_punctuation(self, char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - group0 = cp >= 33 and cp <= 47 - group1 = cp >= 58 and cp <= 64 - group2 = cp >= 91 and cp <= 96 - group3 = cp >= 123 and cp <= 126 + group0 = 33 <= cp <= 47 + group1 = 58 <= cp <= 64 + group2 = 91 <= cp <= 96 + group3 = 123 <= cp <= 126 if (group0 or group1 or group2 or group3): return True cat = unicodedata.category(char) @@ -963,7 +960,7 @@ def _whitespace_tokenize(self, text): return tokens -class BERTTokenizer(object): +class BERTTokenizer: r"""End-to-end tokenization for BERT models. Parameters @@ -1220,7 +1217,7 @@ def is_first_subword(token): return token.startswith(BERTSPTokenizer._special_prefix) -class BERTSentenceTransform(object): +class BERTSentenceTransform: r"""BERT style data transformation. Parameters @@ -1366,7 +1363,7 @@ def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): else: tokens_b.pop() -class _GPT2BPE(object): +class _GPT2BPE: """Base class for GPT-2 BPE tokenizer and detokenizer.""" def __init__(self): codes = list(range(ord(u'!'), ord(u'~') + 1)) +\ diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index d403473bfa..925c2c3991 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -406,7 +406,7 @@ def whitespace_splitter(s): return s.split() -class Splitter(object): +class Splitter: """Split a string based on a separator. Parameters diff --git a/src/gluonnlp/embedding/evaluation.py b/src/gluonnlp/embedding/evaluation.py index cf689b8465..47a47d0786 100644 --- a/src/gluonnlp/embedding/evaluation.py +++ b/src/gluonnlp/embedding/evaluation.py @@ -32,12 +32,10 @@ class _WordEmbeddingEvaluationFunction(HybridBlock): # pylint: disable=abstract-method """Base class for word embedding evaluation functions.""" - pass class WordEmbeddingSimilarityFunction(_WordEmbeddingEvaluationFunction): # pylint: disable=abstract-method """Base class for word embedding similarity functions.""" - pass class WordEmbeddingAnalogyFunction(_WordEmbeddingEvaluationFunction): # pylint: disable=abstract-method @@ -52,7 +50,6 @@ class WordEmbeddingAnalogyFunction(_WordEmbeddingEvaluationFunction): # pylint: eps : float, optional, default=1e-10 A small constant for numerical stability. """ - pass ############################################################################### diff --git a/src/gluonnlp/embedding/token_embedding.py b/src/gluonnlp/embedding/token_embedding.py index 1b2fd9eb95..5adbc697d9 100644 --- a/src/gluonnlp/embedding/token_embedding.py +++ b/src/gluonnlp/embedding/token_embedding.py @@ -142,7 +142,7 @@ def list_sources(embedding_name=None): for embedding_name, embedding_cls in registry.get_registry(TokenEmbedding).items()} -class TokenEmbedding(object): +class TokenEmbedding: """Token embedding base class. To load token embedding from an externally hosted pre-trained token embedding file, such as @@ -724,10 +724,9 @@ def __setitem__(self, tokens, new_embedding): ' unknown token, please explicitly include "{}" as the ' '`unknown_token` in `tokens`. This is to avoid unintended ' 'updates.').format(token, self.unknown_token)) - else: - raise KeyError(('Token "{}" is unknown. Updating the embedding vector for an ' - 'unknown token is not allowed because `unknown_token` is not ' - 'specified.').format(token)) + raise KeyError(('Token "{}" is unknown. Updating the embedding vector for an ' + 'unknown token is not allowed because `unknown_token` is not ' + 'specified.').format(token)) self._idx_to_vec[nd.array(indices)] = new_embedding @@ -743,10 +742,10 @@ def _check_source(cls, source_file_hash, source): """ embedding_name = cls.__name__.lower() if source not in source_file_hash: - raise KeyError('Cannot find pre-trained source {} for token embedding {}. ' - 'Valid pre-trained file names for embedding {}: {}'.format( - source, embedding_name, embedding_name, - ', '.join(source_file_hash.keys()))) + raise KeyError('Cannot find pre-trained source {source} for token embedding {name}. ' + 'Valid pre-trained file names for embedding {name}: {values}'.format( + source=source, name=embedding_name, + values=', '.join(source_file_hash.keys()))) @staticmethod def from_file(file_path, elem_delim=' ', encoding=ENCODING, **kwargs): diff --git a/src/gluonnlp/metric/masked_accuracy.py b/src/gluonnlp/metric/masked_accuracy.py index 65fc446777..31d3fea708 100644 --- a/src/gluonnlp/metric/masked_accuracy.py +++ b/src/gluonnlp/metric/masked_accuracy.py @@ -23,7 +23,7 @@ __all__ = ['EvalMetric', 'MaskedAccuracy'] -class EvalMetric(object): +class EvalMetric: """Base class for all evaluation metrics. .. note:: diff --git a/src/gluonnlp/model/block.py b/src/gluonnlp/model/block.py index ef42c66a83..2b6d83780e 100644 --- a/src/gluonnlp/model/block.py +++ b/src/gluonnlp/model/block.py @@ -39,7 +39,7 @@ class RNNCellLayer(Block): def __init__(self, rnn_cell, layout='TNC', **kwargs): super(RNNCellLayer, self).__init__(**kwargs) self.cell = rnn_cell - assert layout == 'TNC' or layout == 'NTC', \ + assert layout in ('TNC', 'NTC'), \ 'Invalid layout %s; must be one of ["TNC" or "NTC"]'%layout self._layout = layout self._axis = layout.find('T') @@ -109,7 +109,7 @@ def __init__(self, **kwargs): super(GELU, self).__init__(**kwargs) self._support_erf = False try: - self._support_erf = True if ndarray.erf else False + self._support_erf = bool(ndarray.erf) except AttributeError: warnings.warn('`erf` operator support is not found. ' 'Please consider upgrading to mxnet >= 1.4') diff --git a/src/gluonnlp/model/sequence_sampler.py b/src/gluonnlp/model/sequence_sampler.py index a233a4b5db..a60a2be6e6 100644 --- a/src/gluonnlp/model/sequence_sampler.py +++ b/src/gluonnlp/model/sequence_sampler.py @@ -448,7 +448,7 @@ def hybrid_forward(self, F, samples, valid_length, outputs, scores, beam_alive_m chosen_word_ids, beam_alive_mask, new_states -class BeamSearchSampler(object): +class BeamSearchSampler: r"""Draw samples from the decoder by beam search. Parameters @@ -715,7 +715,7 @@ def _else_func(): F.contrib.cond(F.sum(new_beam_alive_mask) == 0, _then_func, _else_func) return new_samples, new_scores, new_new_valid_length -class SequenceSampler(object): +class SequenceSampler: r"""Draw samples from the decoder according to the step-wise distribution. Parameters diff --git a/src/gluonnlp/utils/files.py b/src/gluonnlp/utils/files.py index 3cca21556c..b2eb1af301 100644 --- a/src/gluonnlp/utils/files.py +++ b/src/gluonnlp/utils/files.py @@ -68,7 +68,7 @@ def mkdir(dirname): if e.errno != 17: raise e -class _TempFilePath(object): +class _TempFilePath: """A TempFilePath that provides a path to a temporarily file, and automatically cleans up the temp file at exit. """ diff --git a/src/gluonnlp/utils/parallel.py b/src/gluonnlp/utils/parallel.py index 62799b383b..edcaba7ea2 100644 --- a/src/gluonnlp/utils/parallel.py +++ b/src/gluonnlp/utils/parallel.py @@ -25,7 +25,7 @@ __all__ = ['Parallelizable', 'Parallel'] -class Parallelizable(object): +class Parallelizable: """Base class for parallelizable unit of work, which can be invoked by `Parallel`. The subclass must implement the `forward_backward` method, and be used together with `Parallel`. For example:: @@ -59,7 +59,7 @@ def forward_backward(self, x): """ Forward and backward computation. """ raise NotImplementedError() -class Parallel(object): +class Parallel: """Class for parallel processing with `Parallelizable`s. It invokes a `Parallelizable` with multiple Python threads. For example:: @@ -100,7 +100,7 @@ def forward_backward(self, x): multiple threads may cause unexpected behavior. """ - class _StopSignal(object): + class _StopSignal: """Internal class to signal stop. """ def __init__(self, msg): self._msg = msg diff --git a/src/gluonnlp/utils/parameter.py b/src/gluonnlp/utils/parameter.py index 44abf8f0bf..8dac90f940 100644 --- a/src/gluonnlp/utils/parameter.py +++ b/src/gluonnlp/utils/parameter.py @@ -144,10 +144,9 @@ def load_parameters(model, filename, ctx=None, allow_missing=False, if cast_dtype is not None: if mx.__version__ < '1.5.0': raise NotImplementedError('cast_dtype option requires MXNet 1.5.0') - else: - _s3_compatible_save_load(False, model.load_parameters, filename, ctx=ctx, - allow_missing=allow_missing, ignore_extra=ignore_extra, - cast_dtype=cast_dtype) + _s3_compatible_save_load(False, model.load_parameters, filename, ctx=ctx, + allow_missing=allow_missing, ignore_extra=ignore_extra, + cast_dtype=cast_dtype) else: _s3_compatible_save_load(False, model.load_parameters, filename, ctx=ctx, allow_missing=allow_missing, ignore_extra=ignore_extra) diff --git a/src/gluonnlp/vocab/elmo.py b/src/gluonnlp/vocab/elmo.py index b30af8aae3..c0d29f21d2 100644 --- a/src/gluonnlp/vocab/elmo.py +++ b/src/gluonnlp/vocab/elmo.py @@ -22,7 +22,7 @@ __all__ = ['ELMoCharVocab'] -class ELMoCharVocab(object): +class ELMoCharVocab: r"""ELMo special character vocabulary The vocab aims to map individual tokens to sequences of character ids, compatible with ELMo. diff --git a/src/gluonnlp/vocab/subwords.py b/src/gluonnlp/vocab/subwords.py index 2203a66cb4..b8647a0d06 100644 --- a/src/gluonnlp/vocab/subwords.py +++ b/src/gluonnlp/vocab/subwords.py @@ -53,7 +53,7 @@ def list_subword_functions(): return list(reg.keys()) -class SubwordFunction(object): +class SubwordFunction: """A SubwordFunction maps words to lists of subword indices. This class is abstract and to be subclassed. Use diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py index 61947b5a04..1d6450a964 100644 --- a/src/gluonnlp/vocab/vocab.py +++ b/src/gluonnlp/vocab/vocab.py @@ -39,7 +39,7 @@ UNK_IDX = 0 -class Vocab(object): +class Vocab: """Indexing and embedding attachment for text tokens. Parameters diff --git a/tests/unittest/test_token_embedding.py b/tests/unittest/test_token_embedding.py index eb82c65dcc..baf92b97fe 100644 --- a/tests/unittest/test_token_embedding.py +++ b/tests/unittest/test_token_embedding.py @@ -27,7 +27,7 @@ from gluonnlp.base import _str_types -class NaiveUnknownLookup(object): +class NaiveUnknownLookup: def __init__(self, embsize): self.embsize = embsize diff --git a/tests/unittest/test_vocab_embed.py b/tests/unittest/test_vocab_embed.py index b9dd92badc..e26cd9f04c 100644 --- a/tests/unittest/test_vocab_embed.py +++ b/tests/unittest/test_vocab_embed.py @@ -761,7 +761,7 @@ def test_vocab_set_embedding_with_subword_lookup_only_token_embedding( allow_extend, unknown_token, vocab_unknown_token, initialize): embsize = 5 - class NaiveLookup(object): + class NaiveLookup: def __contains__(self, token): return True @@ -915,7 +915,7 @@ def test_token_embedding_from_S3_fasttext_with_ngrams(load_ngrams): def test_token_embedding_unknown_lookup(setinconstructor, lookup, initializetokenembedding, unknown_token, allow_extend, tmpdir): - class NaiveLookup(object): + class NaiveLookup: dim = 5 # Must match _mk_my_pretrain_file def __contains__(self, token): @@ -927,7 +927,7 @@ def __getitem__(self, tokens): else: return nd.ones((len(tokens), self.dim)) - class IncapableLookup(object): + class IncapableLookup: def __contains__(self, token): return False @@ -1090,7 +1090,7 @@ def test_word_embedding_evaluation_registry(): with pytest.raises(RuntimeError): @nlp.embedding.evaluation.register - class InvalidEvaluationFunction(object): + class InvalidEvaluationFunction: pass with pytest.raises(KeyError):