From b43e8a253028c9c4a514bb2c3c147600f84e700a Mon Sep 17 00:00:00 2001
From: Leonard Lausen <lausen@amazon.com>
Date: Mon, 3 Jun 2019 12:19:53 +0000
Subject: [PATCH 1/2] Add failing test

---
 tests/unittest/test_vocab_embed.py | 45 ++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/unittest/test_vocab_embed.py b/tests/unittest/test_vocab_embed.py
index 9aff7e5f65..8d9e573a15 100644
--- a/tests/unittest/test_vocab_embed.py
+++ b/tests/unittest/test_vocab_embed.py
@@ -1223,3 +1223,48 @@ def test_bert_vocab_from_sentencepiece():
     for i in range(num_tokens):
         token = _convert_to_unicode(spm.IdToPiece(i))
         assert bert_vocab[token] == i
+
+
+@pytest.mark.parametrize('unknown_token', ['<unk>', None])
+@pytest.mark.parametrize('padding_token', ['<pad>', '<eos>', None])  # padding_token == eos_token
+@pytest.mark.parametrize('eos_token', ['<eos>', None])
+@pytest.mark.parametrize('reserved_tokens', [['<reserved>'], []])
+def test_vocab_duplicate_special_tokens(unknown_token, padding_token,
+                                        eos_token, reserved_tokens):
+    """Different special tokens are allowed to map to the same representations.
+
+    Special tokens are a subset of the reserved tokens. In general reserved
+    tokens must not contain duplicates; however, it is allowed that multiple
+    special tokens use the same reserved token.
+
+    """
+    counter = nlp.data.utils.Counter(
+        ['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
+
+    Vocab = functools.partial(nlp.Vocab,
+                              counter,
+                              max_size=None,
+                              min_freq=1,
+                              unknown_token=unknown_token,
+                              padding_token=padding_token,
+                              bos_token=None,
+                              eos_token=eos_token)
+
+    v = Vocab(reserved_tokens=reserved_tokens)
+
+    # Duplicate special tokens must not corrupt the index
+    # (Broken before GluonNLP 0.7)
+    if eos_token is not None and padding_token == eos_token:
+        # padding_token == eos_token; there should only be a single index for
+        # <eos>
+        # Before GluonNLP 0.7, idx_to_token looked like
+        # ['<unk>', '<eos>', '<eos>', 'c', 'b', 'a']
+        # But it should look like
+        # ['<unk>', '<eos>', 'c', 'b', 'a']
+        assert len(v.idx_to_token) == len(v.token_to_idx)
+        assert len(v.idx_to_token) == len(set(v.idx_to_token))
+
+    # Specifying a special tokens as reserved tokens is counted as duplicate
+    if eos_token is not None:
+        with pytest.raises(AssertionError):
+            Vocab(reserved_tokens=reserved_tokens + [eos_token])

From 5ff15cf939d3fdae7799452e3dc17fd439ee9d4e Mon Sep 17 00:00:00 2001
From: Leonard Lausen <lausen@amazon.com>
Date: Mon, 3 Jun 2019 12:23:39 +0000
Subject: [PATCH 2/2] Fix handling of duplicate special tokens in Vocabulary

Example use case in scripts/tests/test_dataprocessor.py where
eos_token == padding_token. Prior to this fix, eos_token == padding_token
lead to a corrupted vocabulary index.

Example of the corrupted index:

> v = nlp.Vocab(...)
> v.idx_to_token
['<unk>', '<eos>', '<eos>', 'c', 'b', 'a']
> v.token_to_idx
{'<unk>': 0, '<eos>': 2, 'c': 3, 'b': 4, 'a': 5}
---
 src/gluonnlp/vocab/vocab.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py
index a650841e8b..0483f234da 100644
--- a/src/gluonnlp/vocab/vocab.py
+++ b/src/gluonnlp/vocab/vocab.py
@@ -138,13 +138,13 @@ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token=C.UNK_
         self._unknown_token = unknown_token
         special_tokens = []
         self._padding_token = padding_token
-        if padding_token:
+        if padding_token and padding_token not in special_tokens:
             special_tokens.append(padding_token)
         self._bos_token = bos_token
-        if bos_token:
+        if bos_token and bos_token not in special_tokens:
             special_tokens.append(bos_token)
         self._eos_token = eos_token
-        if eos_token:
+        if eos_token and eos_token not in special_tokens:
             special_tokens.append(eos_token)
         if reserved_tokens:
             special_tokens.extend(reserved_tokens)
@@ -170,6 +170,7 @@ def _index_special_tokens(self, unknown_token, special_tokens):
             self._reserved_tokens = None
         else:
             self._reserved_tokens = special_tokens[:]
+        assert len(special_tokens) == len(set(special_tokens))  # sanity check
         self._idx_to_token.extend(special_tokens)

         if unknown_token: