diff --git a/scripts/tests/test_scripts.py b/scripts/tests/test_scripts.py
index e82a14700c..2256da6ccb 100644
--- a/scripts/tests/test_scripts.py
+++ b/scripts/tests/test_scripts.py
@@ -69,7 +69,8 @@ def test_glove():
 @pytest.mark.gpu
 @pytest.mark.integration
 @pytest.mark.parametrize('fasttextloadngrams', [True, False])
-def test_embedding_evaluate_pretrained(fasttextloadngrams):
+@pytest.mark.parametrize('maxvocabsize', [None, 50000])
+def test_embedding_evaluate_pretrained(fasttextloadngrams, maxvocabsize):
     cmd = [
         sys.executable, './scripts/word_embeddings/evaluate_pretrained.py',
         '--embedding-name', 'fasttext', '--embedding-source', 'wiki.simple',
@@ -79,6 +80,8 @@ def test_embedding_evaluate_pretrained(fasttextloadngrams):
     cmd += ['--analogy-datasets', 'GoogleAnalogyTestSet']
     if fasttextloadngrams:
         cmd.append('--fasttext-load-ngrams')
+    if maxvocabsize:
+        cmd += ['--analogy-max-vocab-size', str(maxvocabsize)]
 
     subprocess.check_call(cmd)
     time.sleep(5)
@@ -98,11 +101,11 @@ def test_embedding_evaluate_from_path(evaluateanalogies, maxvocabsize):
         sys.executable, './scripts/word_embeddings/evaluate_pretrained.py',
         '--embedding-path', path, '--gpu', '0']
     if evaluateanalogies:
-        cmd += ['--similarity-datasets=']
+        cmd += ['--similarity-datasets']
         cmd += ['--analogy-datasets', 'GoogleAnalogyTestSet']
     else:
         cmd += ['--similarity-datasets', 'WordSim353']
-        cmd += ['--analogy-datasets=']
+        cmd += ['--analogy-datasets']
     if maxvocabsize is not None:
         cmd += ['--analogy-max-vocab-size', str(maxvocabsize)]
     subprocess.check_call(cmd)
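The dropped trailing '=' on --similarity-datasets / --analogy-datasets in the test commands above is not cosmetic: with argparse, a bare flag and a flag with an explicit empty value parse differently. A minimal sketch of that behaviour, assuming the evaluation script declares these dataset options with nargs='*' (the parser definition itself is not part of this patch):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--similarity-datasets', nargs='*')

    # Bare flag: an empty list, which 'if args.similarity_datasets:' treats as
    # "skip similarity evaluation".
    print(parser.parse_args(['--similarity-datasets']).similarity_datasets)   # []
    # Trailing '=': a list with one empty string, which is truthy and would be
    # treated as a dataset name.
    print(parser.parse_args(['--similarity-datasets=']).similarity_datasets)  # ['']
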
diff --git a/scripts/word_embeddings/evaluate_pretrained.py b/scripts/word_embeddings/evaluate_pretrained.py
index b430511445..44cc5fb5ab 100644
--- a/scripts/word_embeddings/evaluate_pretrained.py
+++ b/scripts/word_embeddings/evaluate_pretrained.py
@@ -140,23 +140,25 @@ def load_embedding_from_path(args):
             args.embedding_path)
         idx_to_token = sorted(model._token_to_idx, key=model._token_to_idx.get)
 
-        embedding = nlp.embedding.TokenEmbedding(
-            unknown_token=None, unknown_lookup=model, allow_extend=True)
-
         # Analogy task is open-vocabulary, so must keep all known words.
         # But if not evaluating analogy, no need to precompute now as all
         # words for closed vocabulary task can be obtained via the unknown
         # lookup
         if not args.analogy_datasets:
-            idx_to_token = []
-        elif args.analogy_datasets and args.analogy_max_vocab_size:
-            idx_to_token = idx_to_token[:args.analogy_max_vocab_size]
-
-        embedding[''] = mx.nd.zeros(model.weight.shape[1])
-        if idx_to_token:
+            # TODO(leezu): use shape (0, model.weight.shape[1]) once np shape
+            # is supported by TokenEmbedding
+            idx_to_token = ['']
+            idx_to_vec = mx.nd.zeros((1, model.weight.shape[1]))
+        else:
+            if args.analogy_max_vocab_size:
+                idx_to_token = idx_to_token[:args.analogy_max_vocab_size]
             with utils.print_time('compute vectors for {} known '
                                   'words.'.format(len(idx_to_token))):
-                embedding[idx_to_token] = model[idx_to_token]
+                idx_to_vec = model[idx_to_token]
+
+        embedding = nlp.embedding.TokenEmbedding(
+            unknown_token=None, idx_to_token=idx_to_token,
+            idx_to_vec=idx_to_vec, unknown_lookup=model)
     else:
         embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
 
@@ -180,12 +182,12 @@ def enforce_max_size(token_embedding, size):
     if size and len(token_embedding.idx_to_token) > size:
         assert size > 0
         size = size + 1 if token_embedding.unknown_token is not None else size
-        token_embedding._idx_to_token = token_embedding._idx_to_token[:size]
-        token_embedding._idx_to_vec = token_embedding._idx_to_vec[:size]
-        token_embedding._token_to_idx = {
-            token: idx
-            for idx, token in enumerate(token_embedding._idx_to_token)
-        }
+        token_embedding = nlp.embedding.TokenEmbedding(
+            unknown_token=token_embedding.unknown_token,
+            idx_to_token=token_embedding._idx_to_token[:size],
+            idx_to_vec=token_embedding._idx_to_vec[:size],
+            unknown_lookup=token_embedding.unknown_lookup)
+    return token_embedding
 
 
 if __name__ == '__main__':
@@ -205,13 +207,18 @@ def enforce_max_size(token_embedding, size):
     token_embedding_ = load_embedding_from_path(args_)
     name = ''
 
-    enforce_max_size(token_embedding_, args_.analogy_max_vocab_size)
+    token_embedding_ = enforce_max_size(
+        token_embedding_, args_.analogy_max_vocab_size)
+    if args_.fasttext_load_ngrams:
+        assert token_embedding_.unknown_lookup is not None
     known_tokens = set(token_embedding_.idx_to_token)
 
     if args_.similarity_datasets:
         with utils.print_time('find relevant tokens for similarity'):
             tokens = evaluation.get_similarity_task_tokens(args_)
-        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
+        vocab = nlp.Vocab(nlp.data.count_tokens(tokens),
+                          unknown_token=token_embedding_.unknown_token,
+                          padding_token=None, bos_token=None, eos_token=None)
         with utils.print_time('set {} embeddings'.format(len(tokens))):
             vocab.set_embedding(token_embedding_)
         evaluation.evaluate_similarity(
@@ -225,7 +232,9 @@ def enforce_max_size(token_embedding, size):
             tokens.update(token_embedding_.idx_to_token[1:])
         else:
             tokens.update(token_embedding_.idx_to_token)
-        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
+        vocab = nlp.Vocab(nlp.data.count_tokens(tokens),
+                          unknown_token=token_embedding_.unknown_token,
+                          padding_token=None, bos_token=None, eos_token=None)
         with utils.print_time('set {} embeddings'.format(len(tokens))):
             vocab.set_embedding(token_embedding_)
         evaluation.evaluate_analogy(
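The evaluate_pretrained.py changes above replace in-place mutation of TokenEmbedding internals (_idx_to_token, _idx_to_vec, _token_to_idx) with constructing a fresh TokenEmbedding from explicit idx_to_token / idx_to_vec, and they build Vocab objects whose special tokens match the embedding. A condensed, self-contained sketch of that pattern; the three tokens and the random vectors are stand-ins for the vocabulary and vectors the script takes from the loaded fastText model:

    import mxnet as mx
    import gluonnlp as nlp

    # Stand-ins for the (possibly truncated) known vocabulary and its vectors.
    idx_to_token = ['the', 'of', 'and']
    idx_to_vec = mx.nd.random.uniform(shape=(len(idx_to_token), 5))

    # Constructor-based creation instead of mutating private attributes.
    embedding = nlp.embedding.TokenEmbedding(
        unknown_token=None, idx_to_token=idx_to_token, idx_to_vec=idx_to_vec)

    # Vocab special tokens must line up with the embedding: same (absent)
    # unknown token, and no padding/bos/eos entries that have no vectors.
    vocab = nlp.Vocab(nlp.data.count_tokens(idx_to_token),
                      unknown_token=embedding.unknown_token,
                      padding_token=None, bos_token=None, eos_token=None)
    vocab.set_embedding(embedding)
    print(vocab.embedding['the'].shape)  # (5,)
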
diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py
index 78e53bbe93..fd387ef549 100644
--- a/src/gluonnlp/vocab/vocab.py
+++ b/src/gluonnlp/vocab/vocab.py
@@ -400,6 +400,8 @@ def set_embedding(self, *embeddings):
             'unknown_token set.'
 
         new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings)
+        # TODO(leezu): Remove once np shape is used by default
+        assert len(self), 'Empty vocab not yet supported'
         new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len))
 
         col_start = 0
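The new assertion in Vocab.set_embedding makes the unsupported case explicit: a vocabulary with zero tokens would otherwise reach nd.zeros(shape=(0, new_vec_len)), which only works once numpy shape semantics are the default (hence the TODO). A minimal sketch of the edge case it guards, with a one-token embedding as a stand-in:

    import mxnet as mx
    import gluonnlp as nlp

    emb = nlp.embedding.TokenEmbedding(
        unknown_token=None, idx_to_token=['a'], idx_to_vec=mx.nd.ones((1, 5)))

    # Empty counter and every special token disabled: len(vocab) == 0.
    empty_vocab = nlp.Vocab(nlp.data.count_tokens([]), unknown_token=None,
                            padding_token=None, bos_token=None, eos_token=None)
    try:
        empty_vocab.set_embedding(emb)
    except AssertionError as err:
        print(err)  # Empty vocab not yet supported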