This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit: [REFACTOR] Improve readability (#975)
* Drop unneeded u'' string unicode qualifiers

Unicode is the default in Python 3, and Python 2 support has been dropped.

* Fix IWSLT2015 support in train_transformer.py

* Simplify BaseTransformerEncoder

* Use contrib.arange_like

* Skip SemEval17Task2 test as website is down

* Disable SemEval17Task2 doctest

* Fix
leezu authored and sxjscience committed Oct 18, 2019
1 parent f512424 commit 02056ea
Showing 11 changed files with 39 additions and 62 deletions.
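As background for the first bullet above: the u'' prefix is a no-op in Python 3 (every str literal is already Unicode), so dropping it changes nothing at runtime. A minimal illustration:

    # Python 3: the u'' prefix is accepted for backwards compatibility but has no effect.
    assert u'gluonnlp' == 'gluonnlp'
    assert type(u'gluonnlp') is str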
6 changes: 3 additions & 3 deletions docs/conf.py
@@ -31,9 +31,9 @@
release = nlp.__version__

# General information about the project.
project = u'gluonnlp'
author = u'%s developers' % project
copyright = u'2018, %s' % author
project = 'gluonnlp'
author = '%s developers' % project
copyright = '2019, %s' % author
github_doc_root = 'http://gluon-nlp.mxnet.io/{}/'.format(str(version))

# add markdown parser
4 changes: 2 additions & 2 deletions scripts/machine_translation/dataprocessor.py
@@ -279,6 +279,6 @@ def write_sentences(sentences, file_path):
with io.open(file_path, 'w', encoding='utf-8') as of:
for sent in sentences:
if isinstance(sent, (list, tuple)):
of.write(u' '.join(sent) + u'\n')
of.write(' '.join(sent) + '\n')
else:
of.write(sent + u'\n')
of.write(sent + '\n')
13 changes: 8 additions & 5 deletions scripts/machine_translation/train_transformer.py
@@ -59,9 +59,11 @@

nlp.utils.check_version('0.7.0')

parser = argparse.ArgumentParser(description='Neural Machine Translation Example.'
'We train the Transformer Model')
parser.add_argument('--dataset', type=str, default='WMT2016BPE', help='Dataset to use.')
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Neural Machine Translation Example with the Transformer Model.')
parser.add_argument('--dataset', type=str.upper, default='WMT2016BPE', help='Dataset to use.',
choices=['IWSLT2015', 'WMT2016BPE', 'WMT2014BPE', 'TOY'])
parser.add_argument('--src_lang', type=str, default='en', help='Source language')
parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
parser.add_argument('--epochs', type=int, default=10, help='upper epoch limit')
@@ -181,8 +183,9 @@
max_tgt_length=max(tgt_max_len, 500),
scaled=args.scaled)
model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
share_embed=args.dataset != 'TOY', embed_size=args.num_units,
tie_weights=args.dataset != 'TOY', embed_initializer=None, prefix='transformer_')
share_embed=args.dataset not in ('TOY', 'IWSLT2015'), embed_size=args.num_units,
tie_weights=args.dataset not in ('TOY', 'IWSLT2015'), embed_initializer=None,
prefix='transformer_')
model.initialize(init=mx.init.Xavier(magnitude=args.magnitude), ctx=ctx)
static_alloc = True
model.hybridize(static_alloc=static_alloc)
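To illustrate the new --dataset handling (a standalone sketch, not code from this commit): argparse runs the type callable before checking choices, so type=str.upper lets a lower-case value match the upper-case choices list.

    import argparse

    parser = argparse.ArgumentParser()
    # The `type` callable is applied first, then the result is validated against `choices`,
    # so `--dataset iwslt2015` is normalised to 'IWSLT2015' and accepted.
    parser.add_argument('--dataset', type=str.upper, default='WMT2016BPE',
                        choices=['IWSLT2015', 'WMT2016BPE', 'WMT2014BPE', 'TOY'])
    args = parser.parse_args(['--dataset', 'iwslt2015'])
    assert args.dataset == 'IWSLT2015'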
4 changes: 2 additions & 2 deletions src/gluonnlp/data/dataset.py
@@ -93,8 +93,8 @@ class TSVDataset(SimpleDataset):
# b\tLaoban\tZha
# discard the first line and select the 0th and 2nd fields
dataset = data.TSVDataset('test.tsv', num_discard_samples=1, field_indices=[0, 2])
assert dataset[0] == [u'a', u'Jiang']
assert dataset[1] == [u'b', u'Zha']
assert dataset[0] == ['a', 'Jiang']
assert dataset[1] == ['b', 'Zha']
Parameters
----------
6 changes: 3 additions & 3 deletions src/gluonnlp/data/glue.py
@@ -683,7 +683,7 @@ def _generate(self, segment):
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
example = u'%s\t%s\t%s\t%s\t%s\n'%(label, id1, id2, s1, s2)
example = '%s\t%s\t%s\t%s\t%s\n'%(label, id1, id2, s1, s2)
if [id1, id2] in dev_ids:
dev_fh.write(example)
else:
@@ -695,7 +695,7 @@ def _generate(self, segment):
with io.open(raw_path, encoding='utf8') as data_fh:
with io.open(data_path, 'w', encoding='utf8') as test_fh:
header = data_fh.readline()
test_fh.write(u'index\t#1 ID\t#2 ID\t#1 String\t#2 String\n')
test_fh.write('index\t#1 ID\t#2 ID\t#1 String\t#2 String\n')
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write(u'%d\t%s\t%s\t%s\t%s\n'%(idx, id1, id2, s1, s2))
test_fh.write('%d\t%s\t%s\t%s\t%s\n'%(idx, id1, id2, s1, s2))
10 changes: 5 additions & 5 deletions src/gluonnlp/data/transforms.py
@@ -808,7 +808,7 @@ class BERTTokenizer:
"""

_special_prefix = u'##'
_special_prefix = '##'

def __init__(self, vocab, lower=True, max_input_chars_per_word=200):
self.vocab = vocab
@@ -967,7 +967,7 @@ class BERTSPTokenizer(BERTTokenizer):
['▁better', '▁is', '▁to', '▁b', 'ow', '▁than', '▁brea', 'k', '▁', '.']
"""

_special_prefix = u'▁'
_special_prefix = '▁'

def __init__(self,
path,
@@ -1206,9 +1206,9 @@ def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
class _GPT2BPE:
"""Base class for GPT-2 BPE tokenizer and detokenizer."""
def __init__(self):
codes = list(range(ord(u'!'), ord(u'~') + 1)) +\
list(range(ord(u'¡'), ord(u'¬') + 1)) +\
list(range(ord(u'®'), ord(u'ÿ') + 1))
codes = list(range(ord('!'), ord('~') + 1)) +\
list(range(ord('¡'), ord('¬') + 1)) +\
list(range(ord('®'), ord('ÿ') + 1))
chr_fn = chr
try:
chr_fn(256)
7 changes: 4 additions & 3 deletions src/gluonnlp/data/word_embedding_evaluation.py
@@ -540,13 +540,14 @@ class SemEval17Task2(WordSimilarityEvaluationDataset):
Examples
--------
>>> semeval17task2 = gluonnlp.data.SemEval17Task2(root='./datasets/semeval17task2')
>>> semeval17task2 = gluonnlp.data.SemEval17Task2() # doctest: +SKIP
-etc-
>>> len(semeval17task2)
>>> len(semeval17task2) # doctest: +SKIP
18
>>> semeval17task2[0]
>>> semeval17task2[0] # doctest: +SKIP
['sunset', 'string', 0.05]
"""
# TODO: reenable doctest once semeval17task2 is available again
_url = 'http://alt.qcri.org/semeval2017/task2/data/uploads/semeval2017-task2.zip'
_archive_file = ('semeval2017-task2.zip',
'b29860553f98b057303815817dfb60b9fe79cfba')
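For reference, the # doctest: +SKIP directive makes the doctest runner display an example without executing it, which is why the examples above no longer trigger a download while the dataset website is down. A minimal sketch with a hypothetical docstring:

    import doctest

    def sample():
        """
        >>> 1 / 0  # doctest: +SKIP
        """

    # The skipped example is never executed, so no failure is reported.
    doctest.testmod()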
14 changes: 1 addition & 13 deletions src/gluonnlp/model/bert.py
@@ -479,18 +479,6 @@ def _apply_pooling(self, sequence):
outputs = outputs.reshape(shape=(-1, self._units))
return self.pooler(outputs)

def _arange_like(self, F, inputs):
"""Helper function to generate int32 indices of a range"""
inputs = inputs.reshape(-1)
if F == mx.ndarray:
seq_len = inputs.shape[0]
arange = F.arange(seq_len, dtype=inputs.dtype, ctx=inputs.context)
else:
zeros = F.zeros_like(inputs)
arange = F.arange(start=0, repeat=1, step=1, infer_range=True, dtype='int32')
arange = F.elemwise_add(arange, zeros)
return arange

def _decode(self, F, sequence, masked_positions):
"""Generate unnormalized prediction for the masked language model task.
@@ -510,7 +498,7 @@ def _decode(self, F, sequence, masked_positions):
masked_positions = masked_positions.astype('int32')
mask_shape = masked_positions.shape_array()
num_masked_positions = mask_shape.slice(begin=(1,), end=(2,)).astype('int32')
idx_arange = self._arange_like(F, masked_positions)
idx_arange = F.contrib.arange_like(masked_positions.reshape((-1, )), axis=0)
batch_idx = F.broadcast_div(idx_arange, num_masked_positions)
# batch_idx_1d = [0,0,0,1,1,1,2,2,2...]
# masked_positions_1d = [1,2,4,0,3,4,2,3,5...]
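As a rough illustration of the replacement operator (assuming an MXNet build that ships contrib.arange_like, which the new code above relies on): it produces the same index range as the deleted _arange_like helper, but works in both imperative and hybridized mode without a Python-side shape lookup.

    import mxnet as mx

    x = mx.nd.zeros((2, 3))
    # One index per position along the requested axis of `x`.
    steps = mx.nd.contrib.arange_like(x, axis=1)                # values 0, 1, 2
    flat = mx.nd.contrib.arange_like(x.reshape((-1,)), axis=0)  # values 0 .. 5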
34 changes: 9 additions & 25 deletions src/gluonnlp/model/transformer.py
@@ -439,20 +439,6 @@ def __call__(self, inputs, states=None, valid_length=None):
"""
return super(BaseTransformerEncoder, self).__call__(inputs, states, valid_length)

def _arange_like(self, F, inputs, axis):
"""Helper function to generate indices of a range"""
if F == mx.ndarray:
seq_len = inputs.shape[axis]
arange = F.arange(seq_len, dtype=inputs.dtype, ctx=inputs.context)
else:
input_axis = inputs.slice(begin=(0, 0, 0), end=(1, None, 1)).reshape((-1))
zeros = F.zeros_like(input_axis)
arange = F.arange(start=0, repeat=1, step=1,
infer_range=True, dtype=self._dtype)
arange = F.elemwise_add(arange, zeros)
return arange


def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
# pylint: disable=arguments-differ
"""Encode the inputs given the states and valid sequence length.
@@ -489,7 +475,7 @@ def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
(batch_size, num_heads, length, length)
"""
steps = self._arange_like(F, inputs, axis=1)
steps = F.contrib.arange_like(inputs, axis=1)
if valid_length is not None:
ones = F.ones_like(steps)
mask = F.broadcast_lesser(F.reshape(steps, shape=(1, -1)),
@@ -500,6 +486,8 @@ def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
states = [mask]
else:
states.append(mask)
else:
mask = None

if self._scale_embed:
# XXX: input.shape[-1] and self._units are expected to be the same
@@ -510,11 +498,10 @@ def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
else:
states.append(steps)

if states is not None:
steps = states[-1]
# positional encoding
positional_embed = F.Embedding(steps, position_weight, self._max_length, self._units)
inputs = F.broadcast_add(inputs, F.expand_dims(positional_embed, axis=0))
# positional encoding
positional_embed = F.Embedding(steps, position_weight, self._max_length, self._units)
inputs = F.broadcast_add(inputs, F.expand_dims(positional_embed, axis=0))

if self._dropout:
if self._use_layer_norm_before_dropout:
inputs = self.layer_norm(inputs)
@@ -525,10 +512,6 @@ def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
else:
inputs = self.layer_norm(inputs)
outputs = inputs
if valid_length is not None:
mask = states[-2]
else:
mask = None

all_encodings_outputs = []
additional_outputs = []
@@ -544,7 +527,8 @@ def hybrid_forward(self, F, inputs, states=None, valid_length=None, position_weight=None):
if self._output_attention:
additional_outputs.append(attention_weights)

if valid_length is not None:
if valid_length is not None and not self._output_all_encodings:
# if self._output_all_encodings, SequenceMask is already applied above
outputs = F.SequenceMask(outputs, sequence_length=valid_length,
use_sequence_length=True, axis=1)

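The last hunk above avoids applying SequenceMask a second time when all encodings are returned. As a reminder of what that mask does (a standalone sketch with illustrative shapes), it zeroes every position beyond a sequence's valid length:

    import mxnet as mx

    outputs = mx.nd.ones((2, 4, 8))     # (batch_size, length, units)
    valid_length = mx.nd.array([2, 3])
    # With axis=1, positions >= valid_length[i] in sample i are set to 0.
    masked = mx.nd.SequenceMask(outputs, sequence_length=valid_length,
                                use_sequence_length=True, axis=1)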
2 changes: 1 addition & 1 deletion src/gluonnlp/vocab/subwords.py
@@ -228,7 +228,7 @@ def fasttext_hash_asbytes(ngram, encoding='utf-8'):

def _word_to_hashes(self, word):
if word not in self.special_tokens:
word_enc = bytearray((u'<' + word + u'>').encode('utf-8'))
word_enc = bytearray(('<' + word + '>').encode('utf-8'))
hashes = _fasttext_ngram_hashes(
memoryview(word_enc), ns=self._ngrams,
bucket_size=self.num_subwords)
1 change: 1 addition & 0 deletions tests/unittest/test_datasets.py
@@ -279,6 +279,7 @@ def test_simverb3500():
@flaky(max_runs=2, min_passes=1)
@pytest.mark.serial
@pytest.mark.remote_required
@pytest.mark.skipif(datetime.date.today() < datetime.date(2019, 10, 21), reason='website down')
def test_semeval17task2():
for segment, length in [("trial", 18), ("test", 500)]:
data = nlp.data.SemEval17Task2(segment=segment)
