From 67cfbc863d9d05baa2362a82cb78e0acb569d6ba Mon Sep 17 00:00:00 2001 From: Eric Junyuan Xie Date: Fri, 12 Jan 2018 12:28:34 -0800 Subject: [PATCH] Revert "Add mxnet.text APIs (#8763)" (#9401) This reverts commit 6c1f4f7023104ba936322fa617d8653e6ed4fbfb. --- docs/api/python/index.md | 9 - docs/api/python/text/text.md | 443 ----------------- python/mxnet/registry.py | 17 - python/mxnet/text/__init__.py | 25 - python/mxnet/text/constants.py | 24 - python/mxnet/text/embedding.py | 681 -------------------------- python/mxnet/text/glossary.py | 142 ------ python/mxnet/text/indexer.py | 231 --------- python/mxnet/text/utils.py | 79 --- tests/python/unittest/test_text.py | 743 ----------------------------- 10 files changed, 2394 deletions(-) delete mode 100644 docs/api/python/text/text.md delete mode 100644 python/mxnet/text/__init__.py delete mode 100644 python/mxnet/text/constants.py delete mode 100644 python/mxnet/text/embedding.py delete mode 100644 python/mxnet/text/glossary.py delete mode 100644 python/mxnet/text/indexer.py delete mode 100644 python/mxnet/text/utils.py delete mode 100644 tests/python/unittest/test_text.py diff --git a/docs/api/python/index.md b/docs/api/python/index.md index 7a3ad7c03c64..75ff186fd81d 100644 --- a/docs/api/python/index.md +++ b/docs/api/python/index.md @@ -98,15 +98,6 @@ imported by running: io/io.md ``` -## Text API - -```eval_rst -.. toctree:: - :maxdepth: 1 - - text/text.md -``` - ## Image API ```eval_rst diff --git a/docs/api/python/text/text.md b/docs/api/python/text/text.md deleted file mode 100644 index a448ae4dee83..000000000000 --- a/docs/api/python/text/text.md +++ /dev/null @@ -1,443 +0,0 @@ -# Text API - -## Overview - -The mxnet.text APIs refer to classes and functions related to text data -processing, such as bulding indices and loading pre-trained embedding vectors -for text tokens and storing them in the `mxnet.ndarray.NDArray` format. - -This document lists the text APIs in mxnet: - -```eval_rst -.. autosummary:: - :nosignatures: - - mxnet.text.glossary - mxnet.text.embedding - mxnet.text.indexer - mxnet.text.utils -``` - -All the code demonstrated in this document assumes that the following modules -or packages are imported. - -```python ->>> from mxnet import gluon ->>> from mxnet import nd ->>> from mxnet import text ->>> import collections - -``` - -### Look up pre-trained word embeddings for indexed words - -As a common use case, let us look up pre-trained word embedding vectors for -indexed words in just a few lines of code. To begin with, we can create a -fastText word embedding object by specifying the embedding name `fasttext` and -the pre-trained file `wiki.simple.vec`. - -```python ->>> fasttext_simple = text.embedding.TokenEmbedding.create('fasttext', -... pretrained_file_name='wiki.simple.vec') - -``` - -Suppose that we have a simple text data set in the string format. We can count -word frequency in the data set. - -```python ->>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.utils.count_tokens_from_str(text_data) - -``` - -The obtained `counter` has key-value pairs whose keys are words and values are -word frequencies. Suppose that we want to build indices for all the keys in -`counter` and load the defined fastText word embedding for all such indexed -words. We can create a glossary object by specifying `counter` and -`fasttext_simple` as its argument. 
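Before creating the glossary, it can help to peek at `counter` itself. The snippet
below is only an illustrative sketch: `count_tokens_from_str` returns a
`collections.Counter` built from the sample text above, so the contents are as
shown, but the ordering in the printed repr may vary across Python versions.

```python
>>> counter  # a collections.Counter mapping each word to its frequency
Counter({'world': 3, 'hello': 2, 'nice': 1, 'hi': 1})

```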
- -```python ->>> glossary = text.glossary.Glossary(counter, fasttext_simple) - -``` - -Now we are ready to look up the fastText word embedding vectors for indexed -words. - -```python ->>> glossary.get_vecs_by_tokens(['hello', 'world']) - -[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 - ... - -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] - [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 - ... - -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] - - -``` - -### Use `glossary` in `gluon` - -To demonstrate how to use a glossary with the loaded word embedding in the -`gluon` package, let us first obtain indices of the words 'hello' and 'world'. - -```python ->>> glossary.to_indices(['hello', 'world']) -[2, 1] - -``` - -We can obtain the vector representation for the words 'hello' and 'world' -by specifying their indices (2 and 1) and the `glossary.idx_to_vec` in -`mxnet.gluon.nn.Embedding`. - -```python ->>> layer = gluon.nn.Embedding(len(glossary), glossary.vec_len) ->>> layer.initialize() ->>> layer.weight.set_data(glossary.idx_to_vec) ->>> layer(nd.array([2, 1])) - -[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 - ... - -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] - [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 - ... - -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] - - -``` - - -## Glossary - -The glossary provides indexing and embedding for text tokens in a glossary. For -each indexed token in a glossary, an embedding vector will be associated with -it. Such embedding vectors can be loaded from externally hosted or custom -pre-trained token embedding files, such as via instances of -[`TokenEmbedding`](#mxnet.text.embedding.TokenEmbedding). -The input counter whose keys are -candidate indices may be obtained via -[`count_tokens_from_str`](#mxnet.text.utils.count_tokens_from_str). - -```eval_rst -.. currentmodule:: mxnet.text.glossary -.. autosummary:: - :nosignatures: - - Glossary -``` - -To get all the valid names for pre-trained embeddings and files, we can use -[`TokenEmbedding.get_embedding_and_pretrained_file_names`](#mxnet.text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names). - -```python ->>> text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names() -{'glove': ['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', -'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', -'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', -'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt'], -'fasttext': ['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec']} - -``` - -To begin with, we can create a fastText word embedding object by specifying the -embedding name `fasttext` and the pre-trained file `wiki.simple.vec`. - -```python ->>> fasttext_simple = text.embedding.TokenEmbedding.create('fasttext', -... pretrained_file_name='wiki.simple.vec') - -``` - -Suppose that we have a simple text data set in the string format. We can count -word frequency in the data set. - -```python ->>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.utils.count_tokens_from_str(text_data) - -``` - -The obtained `counter` has key-value pairs whose keys are words and values are -word frequencies. Suppose that we want to build indices for the most frequent 2 -keys in `counter` and load the defined fastText word embedding for all these -2 words. 
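As a quick sanity check in plain Python (an illustrative sketch that relies only
on the standard `collections.Counter` API), the two most frequent keys in
`counter` are 'world' and 'hello'; these are exactly the keys that
`most_freq_count=2` keeps below, in addition to the unknown token.

```python
>>> counter.most_common(2)  # the keys that most_freq_count=2 will index
[('world', 3), ('hello', 2)]

```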
- -```python ->>> glossary = text.glossary.Glossary(counter, fasttext_simple, most_freq_count=2) - -``` - -Now we are ready to look up the fastText word embedding vectors for indexed -words. - -```python ->>> glossary.get_vecs_by_tokens(['hello', 'world']) - -[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 - ... - -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] - [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 - ... - -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] - - -``` - -We can also access properties such as `token_to_idx` (mapping tokens to -indices), `idx_to_token` (mapping indices to tokens), and `vec_len` -(length of each embedding vector). - -```python ->>> glossary.token_to_idx -{'': 0, 'world': 1, 'hello': 2, 'hi': 3, 'nice': 4} ->>> glossary.idx_to_token -['', 'world', 'hello', 'hi', 'nice'] ->>> len(glossary) -5 ->>> glossary.vec_len -300 - -``` - -If a token is unknown to `glossary`, its embedding vector is initialized -according to the default specification in `fasttext_simple` (all elements are -0). - -```python - ->>> glossary.get_vecs_by_tokens('unknownT0kEN') - -[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - - -``` - -## Text token embedding - -The text token embedding builds indices for text tokens. Such indexed tokens can -be used by instances of [`TokenEmbedding`](#mxnet.text.embedding.TokenEmbedding) -and [`Glossary`](#mxnet.text.glossary.Glossary). - -To load token embeddings from an externally hosted pre-trained token embedding -file, such as those of GloVe and FastText, use -[`TokenEmbedding.create(embedding_name, pretrained_file_name)`](#mxnet.text.embedding.TokenEmbedding.create). -To get all the available `embedding_name` and `pretrained_file_name`, use -[`TokenEmbedding.get_embedding_and_pretrained_file_names()`](#mxnet.text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names). - -Alternatively, to load embedding vectors from a custom pre-trained text token -embedding file, use [`CustomEmbedding`](#mxnet.text.embedding.CustomEmbedding). - - -```eval_rst -.. currentmodule:: mxnet.text.embedding -.. autosummary:: - :nosignatures: - - TokenEmbedding - GloVe - FastText - CustomEmbedding -``` - -To get all the valid names for pre-trained embeddings and files, we can use -[`TokenEmbedding.get_embedding_and_pretrained_file_names`](#mxnet.text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names). - -```python ->>> text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names() -{'glove': ['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', -'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', -'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', -'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt'], -'fasttext': ['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec']} - -``` - -To begin with, we can create a GloVe word embedding object by specifying the -embedding name `glove` and the pre-trained file `glove.6B.50d.txt`. The -argument `init_unknown_vec` specifies default vector representation for any -unknown token. - -```python ->>> glove_6b_50d = text.embedding.TokenEmbedding.create('glove', -... 
pretrained_file_name='glove.6B.50d.txt', init_unknown_vec=nd.zeros) - -``` - -We can access properties such as `token_to_idx` (mapping tokens to indices), -`idx_to_token` (mapping indices to tokens), `vec_len` (length of each embedding -vector), and `unknown_token` (representation of any unknown token, default -value is ''). - -```python ->>> glove_6b_50d.token_to_idx['hi'] -11084 ->>> glove_6b_50d.idx_to_token[11084] -'hi' ->>> glove_6b_50d.vec_len -50 ->>> glove_6b_50d.unknown_token -'' - -``` - -For every unknown token, if its representation '' is encountered in the -pre-trained token embedding file, index 0 of property `idx_to_vec` maps to the -pre-trained token embedding vector loaded from the file; otherwise, index 0 of -property `idx_to_vec` maps to the default token embedding vector specified via -`init_unknown_vec` (set to nd.zeros here). Since the pre-trained file -does not have a vector for the token '', index 0 has to map to an -additional token '' and the number of tokens in the embedding is 400,001. - - -```python ->>> len(glove_6b_50d) -400001 ->>> glove_6b_50d.idx_to_vec[0] - -[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - ->>> glove_6b_50d.get_vecs_by_tokens('unknownT0kEN') - -[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - ->>> glove_6b_50d.get_vecs_by_tokens(['unknownT0kEN', 'unknownT0kEN']) - -[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] - - -``` - - -### Implement a new text token embedding - -For ``optimizer``, create a subclass of -[`TokenEmbedding`](#mxnet.text.embedding.TokenEmbedding). -Also add ``@TokenEmbedding.register`` before this class. See -[`embedding.py`](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/text/embedding.py) -for examples. - - -## Text token indexer - -The text token indexer builds indices for text tokens. Such indexed tokens can -be used by instances of [`TokenEmbedding`](#mxnet.text.embedding.TokenEmbedding) -and [`Glossary`](#mxnet.text.glossary.Glossary). The input -counter whose keys are candidate indices may be obtained via -[`count_tokens_from_str`](#mxnet.text.utils.count_tokens_from_str). - - -```eval_rst -.. currentmodule:: mxnet.text.indexer -.. autosummary:: - :nosignatures: - - TokenIndexer -``` - -Suppose that we have a simple text data set in the string format. We can count -word frequency in the data set. - -```python ->>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.utils.count_tokens_from_str(text_data) - -``` - -The obtained `counter` has key-value pairs whose keys are words and values are -word frequencies. Suppose that we want to build indices for the 2 most frequent -keys in `counter` with the unknown token representation '' and a reserved -token ''. - -```python ->>> token_indexer = text.indexer.TokenIndexer(counter, most_freq_count=2, -... unknown_token='', reserved_tokens=['']) - -``` - -We can access properties such as `token_to_idx` (mapping tokens to indices), -`idx_to_token` (mapping indices to tokens), `vec_len` (length of each embedding -vector), and `unknown_token` (representation of any unknown token) and -`reserved_tokens`. - -```python ->>> token_indexer = text.indexer.TokenIndexer(counter, most_freq_count=2, -... 
unknown_token='', reserved_tokens=['']) - -``` - -```python ->>> token_indexer.token_to_idx -{'': 0, '': 1, 'world': 2, 'hello': 3} ->>> token_indexer.idx_to_token -['', '', 'world', 'hello'] ->>> token_indexer.unknown_token -'' ->>> token_indexer.reserved_tokens -[''] ->>> len(token_indexer) -4 -``` - -Besides the specified unknown token '' and reserved_token '' are -indexed, the 2 most frequent words 'world' and 'hello' are also indexed. - - - -## Text utilities - -The following functions provide utilities for text data processing. - -```eval_rst -.. currentmodule:: mxnet.text.utils -.. autosummary:: - :nosignatures: - - count_tokens_from_str -``` - - - - -## API Reference - - - -```eval_rst - -.. automodule:: mxnet.text.glossary -.. autoclass:: mxnet.text.glossary.Glossary - :members: get_vecs_by_tokens, update_token_vectors, to_indices, to_tokens - -.. automodule:: mxnet.text.embedding -.. autoclass:: mxnet.text.embedding.TokenEmbedding - :members: get_vecs_by_tokens, update_token_vectors, to_indices, to_tokens, register, create, get_embedding_and_pretrained_file_names -.. autoclass:: mxnet.text.embedding.GloVe - :members: get_vecs_by_tokens, update_token_vectors, to_indices, to_tokens -.. autoclass:: mxnet.text.embedding.FastText - :members: get_vecs_by_tokens, update_token_vectors, to_indices, to_tokens -.. autoclass:: mxnet.text.embedding.CustomEmbedding - :members: get_vecs_by_tokens, update_token_vectors, to_indices, to_tokens - -.. automodule:: mxnet.text.indexer -.. autoclass:: mxnet.text.indexer.TokenIndexer - :members: to_indices, to_tokens - -.. automodule:: mxnet.text.utils - :members: count_tokens_from_str - -``` - diff --git a/python/mxnet/registry.py b/python/mxnet/registry.py index eaae9208b5b7..4a4f22fa142b 100644 --- a/python/mxnet/registry.py +++ b/python/mxnet/registry.py @@ -29,23 +29,6 @@ _REGISTRY = {} -def get_registry(base_class): - """Get registrator. - - Parameters - ---------- - base_class : type - base class for classes that will be registered - - Returns - ------- - a registrator - """ - if base_class not in _REGISTRY: - _REGISTRY[base_class] = {} - return _REGISTRY[base_class].copy() - - def get_register_func(base_class, nickname): """Get registrator function. diff --git a/python/mxnet/text/__init__.py b/python/mxnet/text/__init__.py deleted file mode 100644 index 16035b7f37b8..000000000000 --- a/python/mxnet/text/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -"""Text utilities.""" - -from . import utils -from . import constants -from . import indexer -from . import embedding -from . 
import glossary diff --git a/python/mxnet/text/constants.py b/python/mxnet/text/constants.py deleted file mode 100644 index a36d5af70385..000000000000 --- a/python/mxnet/text/constants.py +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 - -"""Read text files and load embeddings.""" -from __future__ import absolute_import -from __future__ import print_function - -UNKNOWN_IDX = 0 diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py deleted file mode 100644 index 5b45e580140c..000000000000 --- a/python/mxnet/text/embedding.py +++ /dev/null @@ -1,681 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=consider-iterating-dictionary - -"""Text token embeddings.""" -from __future__ import absolute_import -from __future__ import print_function - -import io -import logging -import os -import tarfile -import warnings -import zipfile - -from . import constants as C -from ..gluon.utils import download -from .indexer import TokenIndexer -from .. import ndarray as nd -from .. import registry - - -class TokenEmbedding(TokenIndexer): - """Token embedding base class. - - - To load token embeddings from an externally hosted pre-trained - token embedding file, such as those of GloVe and FastText, use - `TokenEmbedding.create(embedding_name, pretrained_file_name)`. To get all - the available `embedding_name` and `pretrained_file_name`, use - `TokenEmbedding.get_embedding_and_pretrained_file_names()`. - - Alternatively, to load embedding vectors from a custom pre-trained token - embedding file, use :class:`~mxnet.text.embedding.CustomEmbedding`. - - For every unknown token, if its representation `self.unknown_token` is - encountered in the pre-trained token embedding file, index 0 of - `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from - the file; otherwise, index 0 of `self.idx_to_vec` maps to the token - embedding vector initialized by `init_unknown_vec`. 
- - If a token is encountered multiple times in the pre-trained token embedding - file, only the first-encountered token embedding vector will be loaded and - the rest will be skipped. - - For the same token, its index and embedding vector may vary across different - instances of :class:`~mxnet.text.embedding.TokenEmbedding`. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - vec_len : int - The length of the embedding vector for each token. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each - token's index to an embedding vector. The largest valid index maps - to the initialized embedding vector for every reserved token, such as an - unknown_token token and a padding token. - """ - - def __init__(self, **kwargs): - super(TokenEmbedding, self).__init__(**kwargs) - - @classmethod - def _get_pretrained_file_path_from_url(cls, url, embedding_root, - pretrained_file_name): - """Get the local path to the pre-trained token embedding file from url. - - - The pre-trained embedding file will be downloaded from url if it has not - been downloaded yet or the existing file fails to match its expected - SHA-1 hash. - """ - - embedding_cls = cls.__name__.lower() - embedding_root = os.path.expanduser(embedding_root) - - embedding_dir = os.path.join(embedding_root, embedding_cls) - pretrained_file_path = os.path.join(embedding_dir, pretrained_file_name) - downloaded_file = os.path.basename(url) - downloaded_file_path = os.path.join(embedding_dir, downloaded_file) - - expected_file_hash = cls.pretrained_file_name_sha1[pretrained_file_name] - - if hasattr(cls, 'pretrained_archive_name_sha1'): - expected_downloaded_hash = \ - cls.pretrained_archive_name_sha1[downloaded_file] - else: - expected_downloaded_hash = expected_file_hash - - # If downloaded_file_path exists and matches expected_downloaded_hash, - # there is no need to download. - download(url, downloaded_file_path, sha1_hash=expected_downloaded_hash) - - ext = os.path.splitext(downloaded_file)[1] - if ext == '.zip': - with zipfile.ZipFile(downloaded_file_path, 'r') as zf: - zf.extractall(embedding_dir) - elif ext == '.gz': - with tarfile.open(downloaded_file_path, 'r:gz') as tar: - tar.extractall(path=embedding_dir) - return pretrained_file_path - - def _load_embedding(self, pretrained_file_path, elem_delim, - init_unknown_vec, encoding='utf8'): - """Load embedding vectors from the pre-trained token embedding file. - - - For every unknown token, if its representation `self.unknown_token` is - encountered in the pre-trained token embedding file, index 0 of - `self.idx_to_vec` maps to the pre-trained token embedding vector loaded - from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text - embedding vector initialized by `init_unknown_vec`. - - If a token is encountered multiple times in the pre-trained text - embedding file, only the first-encountered token embedding vector will - be loaded and the rest will be skipped. 
- """ - - pretrained_file_path = os.path.expanduser(pretrained_file_path) - - if not os.path.isfile(pretrained_file_path): - raise ValueError('`pretrained_file_path` must be a valid path to ' - 'the pre-trained token embedding file.') - - with io.open(pretrained_file_path, 'r', encoding=encoding) as f: - lines = f.readlines() - - logging.info('Loading pre-trained token embedding vectors from %s', - pretrained_file_path) - - vec_len = None - all_elems = [] - tokens = set() - loaded_unknown_vec = None - line_num = 0 - for line in lines: - line_num += 1 - elems = line.rstrip().split(elem_delim) - - assert len(elems) > 1, 'At line %d of the pre-trained text ' \ - 'embedding file: the data format of the ' \ - 'pre-trained token embedding file %s is ' \ - 'unexpected.' \ - % (line_num, pretrained_file_path) - - token, elems = elems[0], [float(i) for i in elems[1:]] - - if token == self.unknown_token and loaded_unknown_vec is None: - loaded_unknown_vec = elems - tokens.add(self.unknown_token) - elif token in tokens: - warnings.warn('At line %d of the pre-trained token embedding ' - 'file: the embedding vector for token %s has ' - 'been loaded and a duplicate embedding for the ' - 'same token is seen and skipped.' - % (line_num, token)) - elif len(elems) == 1: - warnings.warn('At line %d of the pre-trained text ' - 'embedding file: token %s with 1-dimensional ' - 'vector %s is likely a header and is ' - 'skipped.' % (line_num, token, elems)) - else: - if vec_len is None: - vec_len = len(elems) - # Reserve a vector slot for the unknown token at the - # very beggining because the unknown index is 0. - all_elems.extend([0] * vec_len) - else: - assert len(elems) == vec_len, \ - 'At line %d of the pre-trained token embedding ' \ - 'file: the dimension of token %s is %d but the ' \ - 'dimension of previous tokens is %d. Dimensions ' \ - 'of all the tokens must be the same.' \ - % (line_num, token, len(elems), vec_len) - all_elems.extend(elems) - self._idx_to_token.append(token) - self._token_to_idx[token] = len(self._idx_to_token) - 1 - tokens.add(token) - - self._vec_len = vec_len - self._idx_to_vec = nd.array(all_elems).reshape((-1, self.vec_len)) - - if loaded_unknown_vec is None: - self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec( - shape=self.vec_len) - else: - self._idx_to_vec[C.UNKNOWN_IDX] = nd.array(loaded_unknown_vec) - - @property - def vec_len(self): - return self._vec_len - - @property - def idx_to_vec(self): - return self._idx_to_vec - - def get_vecs_by_tokens(self, tokens, lower_case_backup=False): - """Look up embedding vectors of tokens. - - - Parameters - ---------- - tokens : str or list of strs - A token or a list of tokens. - lower_case_backup : bool, default False - If False, each token in the original case will be looked up; if - True, each token in the original case will be looked up first, if - not found in the keys of the property `token_to_idx`, the token - in the lower case will be looked up. - - - Returns - ------- - mxnet.ndarray.NDArray: - The embedding vector(s) of the token(s). According to numpy - conventions, if `tokens` is a string, returns a 1-D NDArray of shape - `self.vec_len`; if `tokens` is a list of strings, returns a 2-D - NDArray of shape=(len(tokens), self.vec_len). 
- """ - - to_reduce = False - if not isinstance(tokens, list): - tokens = [tokens] - to_reduce = True - - if not lower_case_backup: - indices = [self.token_to_idx.get(token, C.UNKNOWN_IDX) - for token in tokens] - else: - indices = [self.token_to_idx[token] if token in self.token_to_idx - else self.token_to_idx.get(token.lower(), C.UNKNOWN_IDX) - for token in tokens] - - vecs = nd.Embedding(nd.array(indices), self.idx_to_vec, - self.idx_to_vec.shape[0], self.idx_to_vec.shape[1]) - - return vecs[0] if to_reduce else vecs - - def update_token_vectors(self, tokens, new_vectors): - """Updates embedding vectors for tokens. - - - Parameters - ---------- - tokens : str or a list of strs - A token or a list of tokens whose embedding vector are to be - updated. - new_vectors : mxnet.ndarray.NDArray - An NDArray to be assigned to the embedding vectors of `tokens`. - Its length must be equal to the number of `tokens` and its width - must be equal to the dimension of embeddings of the glossary. If - `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a - list of multiple strings, it must be 2-D. - """ - - assert self.idx_to_vec is not None, \ - 'The property `idx_to_vec` has not been properly set.' - - if not isinstance(tokens, list) or len(tokens) == 1: - assert isinstance(new_vectors, nd.NDArray) and \ - len(new_vectors.shape) in [1, 2], \ - '`new_vectors` must be a 1-D or 2-D NDArray if `tokens` is a ' \ - 'singleton.' - if not isinstance(tokens, list): - tokens = [tokens] - if len(new_vectors.shape) == 1: - new_vectors = new_vectors.expand_dims(0) - - else: - assert isinstance(new_vectors, nd.NDArray) and \ - len(new_vectors.shape) == 2, \ - '`new_vectors` must be a 2-D NDArray if `tokens` is a list ' \ - 'of multiple strings.' - assert new_vectors.shape == (len(tokens), self.vec_len), \ - 'The length of new_vectors must be equal to the number of tokens ' \ - 'and the width of new_vectors must be equal to the dimension of ' \ - 'embeddings of the glossary.' - - indices = [] - for token in tokens: - if token in self.token_to_idx: - indices.append(self.token_to_idx[token]) - else: - raise ValueError('Token %s is unknown. To update the embedding ' - 'vector for an unknown token, please specify ' - 'it explicitly as the `unknown_token` %s in ' - '`tokens`. This is to avoid unintended ' - 'updates.' % - (token, self.idx_to_token[C.UNKNOWN_IDX])) - - self._idx_to_vec[nd.array(indices)] = new_vectors - - @staticmethod - def register(embedding_cls): - """Registers a new token embedding. - - - Once an embedding is registered, we can create an instance of this - embedding with :func:`~mxnet.text.embedding.TokenEmbedding.create`. - - - Examples - -------- - >>> @mxnet.text.embedding.TokenEmbedding.register - ... class MyTextEmbed(mxnet.text.embedding.TokenEmbedding): - ... def __init__(self, pretrained_file_name='my_pretrain_file'): - ... pass - >>> embed = mxnet.text.embedding.TokenEmbedding.create('MyTokenEmbed') - >>> print(type(embed)) - - """ - - register_text_embedding = registry.get_register_func( - TokenEmbedding, 'token embedding') - return register_text_embedding(embedding_cls) - - @staticmethod - def create(embedding_name, **kwargs): - """Creates an instance of :class:`~mxnet.text.embedding.TokenEmbedding`. - - - Creates a token embedding instance by loading embedding vectors from an - externally hosted pre-trained token embedding file, such as those - of GloVe and FastText. To get all the valid `embedding_name` and - `pretrained_file_name`, use `mxnet.text.embedding.TokenEmbedding. 
- get_embedding_and_pretrained_file_names()`. - - - Parameters - ---------- - embedding_name : str - The token embedding name (case-insensitive). - - - Returns - ------- - :class:`~mxnet.text.glossary.TokenEmbedding`: - A token embedding instance that loads embedding vectors from an - externally hosted pre-trained token embedding file. - """ - - create_text_embedding = registry.get_create_func( - TokenEmbedding, 'token embedding') - return create_text_embedding(embedding_name, **kwargs) - - @classmethod - def _check_pretrained_file_names(cls, pretrained_file_name): - """Checks if a pre-trained token embedding file name is valid. - - - Parameters - ---------- - pretrained_file_name : str - The pre-trained token embedding file. - """ - - embedding_name = cls.__name__.lower() - if pretrained_file_name not in cls.pretrained_file_name_sha1: - raise KeyError('Cannot find pretrained file %s for token embedding ' - '%s. Valid pretrained files for embedding %s: %s' % - (pretrained_file_name, embedding_name, - embedding_name, - ', '.join(cls.pretrained_file_name_sha1.keys()))) - - @staticmethod - def get_embedding_and_pretrained_file_names(embedding_name=None): - """Get valid token embedding names and their pre-trained file names. - - - To load token embedding vectors from an externally hosted pre-trained - token embedding file, such as those of GloVe and FastText, one should - use `mxnet.text.embedding.TokenEmbedding.create(embedding_name, - pretrained_file_name)`. This method returns all the valid names of - `pretrained_file_name` for the specified `embedding_name`. If - `embedding_name` is set to None, this method returns all the valid names - of `embedding_name` with associated `pretrained_file_name`. - - - Parameters - ---------- - embedding_name : str or None, default None - The pre-trained token embedding name. - - - Returns - ------- - dict or list: - A list of all the valid pre-trained token embedding file names - (`pretrained_file_name`) for the specified token embedding name - (`embedding_name`). If the text embeding name is set to None, - returns a dict mapping each valid token embedding name to a list - of valid pre-trained files (`pretrained_file_name`). They can be - plugged into `mxnet.text.embedding.TokenEmbedding.create( - embedding_name, pretrained_file_name)`. - """ - - text_embedding_reg = registry.get_registry(TokenEmbedding) - - if embedding_name is not None: - if embedding_name not in text_embedding_reg: - raise KeyError('Cannot find `embedding_name` %s. Use ' - '`get_embedding_and_pretrained_file_names(' - 'embedding_name=None).keys()` to get all the ' - 'valid embedding names.' % embedding_name) - return list(text_embedding_reg[ - embedding_name].pretrained_file_name_sha1.keys()) - else: - return {embedding_name: list( - embedding_cls.pretrained_file_name_sha1.keys()) - for embedding_name, embedding_cls in - registry.get_registry(TokenEmbedding).items()} - - -@TokenEmbedding.register -class GloVe(TokenEmbedding): - """The GloVe word embedding. - - - GloVe is an unsupervised learning algorithm for obtaining vector - representations for words. Training is performed on aggregated global - word-word co-occurrence statistics from a corpus, and the resulting - representations showcase interesting linear substructures of the word vector - space. (Source from https://nlp.stanford.edu/projects/glove/) - - Reference: - - GloVe: Global Vectors for Word Representation. - Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 
- https://nlp.stanford.edu/pubs/glove.pdf - - Website: - - https://nlp.stanford.edu/projects/glove/ - - To get the updated URLs to the externally hosted pre-trained token embedding - files, visit https://nlp.stanford.edu/projects/glove/ - - - Parameters - ---------- - pretrain_file : str, default 'glove.840B.300d.txt' - The name of the pre-trained token embedding file. - embed_root : str, default os.path.join('~', '.mxnet', 'embeddings') - The root directory for storing embedding-related files. - unknown_vec : callback - The callback used to initialize the embedding vector for the unknown - token. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - vec_len : int - The length of the embedding vector for each token. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each - token's index to an embedding vector. The largest valid index maps - to the initialized embedding vector for every reserved token, such as an - unknown_token token and a padding token. - """ - - # Map a pre-trained token embedding archive file and its SHA-1 hash. - pretrained_archive_name_sha1 = \ - {'glove.42B.300d.zip': 'f8e722b39578f776927465b71b231bae2ae8776a', - 'glove.6B.zip': 'b64e54f1877d2f735bdd000c1d7d771e25c7dfdc', - 'glove.840B.300d.zip': '8084fbacc2dee3b1fd1ca4cc534cbfff3519ed0d', - 'glove.twitter.27B.zip': 'dce69c404025a8312c323197347695e81fd529fc'} - - # Map a pre-trained token embedding file and its SHA-1 hash. - pretrained_file_name_sha1 = \ - {'glove.42B.300d.txt': '876767977d6bd4d947c0f84d44510677bc94612a', - 'glove.6B.50d.txt': '21bf566a9d27f84d253e0cd4d4be9dcc07976a6d', - 'glove.6B.100d.txt': '16b1dbfaf35476790bd9df40c83e2dfbd05312f1', - 'glove.6B.200d.txt': '17d0355ddaa253e298ede39877d1be70f99d9148', - 'glove.6B.300d.txt': '646443dd885090927f8215ecf7a677e9f703858d', - 'glove.840B.300d.txt': '294b9f37fa64cce31f9ebb409c266fc379527708', - 'glove.twitter.27B.25d.txt': - '767d80889d8c8a22ae7cd25e09d0650a6ff0a502', - 'glove.twitter.27B.50d.txt': - '9585f4be97e286339bf0112d0d3aa7c15a3e864d', - 'glove.twitter.27B.100d.txt': - '1bbeab8323c72332bd46ada0fc3c99f2faaa8ca8', - 'glove.twitter.27B.200d.txt': - '7921c77a53aa5977b1d9ce3a7c4430cbd9d1207a'} - - url_prefix = 'http://nlp.stanford.edu/data/' - - def __init__(self, pretrained_file_name='glove.840B.300d.txt', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), - init_unknown_vec=nd.zeros, **kwargs): - GloVe._check_pretrained_file_names(pretrained_file_name) - src_archive = {archive.split('.')[1]: archive for archive in - GloVe.pretrained_archive_name_sha1.keys()} - archive = src_archive[pretrained_file_name.split('.')[1]] - url = GloVe.url_prefix + archive - - super(GloVe, self).__init__(**kwargs) - - pretrained_file_path = GloVe._get_pretrained_file_path_from_url( - url, embedding_root, pretrained_file_name) - - self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) - - -@TokenEmbedding.register -class FastText(TokenEmbedding): - """The fastText word embedding. 
- - - FastText is an open-source, free, lightweight library that allows users to - learn text representations and text classifiers. It works on standard, - generic hardware. Models can later be reduced in size to even fit on mobile - devices. (Source from https://fasttext.cc/) - - References: - - Enriching Word Vectors with Subword Information. - Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. - https://arxiv.org/abs/1607.04606 - - Bag of Tricks for Efficient Text Classification. - Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. - https://arxiv.org/abs/1607.01759 - - FastText.zip: Compressing text classification models. - Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, - and Tomas Mikolov. - https://arxiv.org/abs/1612.03651 - - Website: - - https://fasttext.cc/ - - To get the updated URLs to the externally hosted pre-trained token embedding - files, visit - https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md - - - Parameters - ---------- - pretrain_file : str, default 'wiki.en.vec' - The name of the pre-trained token embedding file. - embed_root : str, default os.path.join('~', '.mxnet', 'embeddings') - The root directory for storing embedding-related files. - unknown_vec : callback - The callback used to initialize the embedding vector for the unknown - token. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - vec_len : int - The length of the embedding vector for each token. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each - token's index to an embedding vector. The largest valid index maps - to the initialized embedding vector for every reserved token, such as an - unknown_token token and a padding token. - """ - - # Map a pre-trained token embedding file and its SHA-1 hash. - pretrained_file_name_sha1 = \ - {'wiki.en.vec': 'c1e418f144ceb332b4328d27addf508731fa87df', - 'wiki.simple.vec': '55267c50fbdf4e4ae0fbbda5c73830a379d68795', - 'wiki.zh.vec': '117ab34faa80e381641fbabf3a24bc8cfba44050'} - url_prefix = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/' - - def __init__(self, pretrained_file_name='wiki.en.vec', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), - init_unknown_vec=nd.zeros, **kwargs): - FastText._check_pretrained_file_names(pretrained_file_name) - url = FastText.url_prefix + pretrained_file_name - - super(FastText, self).__init__(**kwargs) - - pretrained_file_path = FastText._get_pretrained_file_path_from_url( - url, embedding_root, pretrained_file_name) - - self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) - - -class CustomEmbedding(TokenEmbedding): - """User-defined token embedding. - - This is to load embedding vectors from a user-defined pre-trained text - embedding file. - - Denote by '' the argument `elem_delim`. Denote by the j-th - element of the token embedding vector for , the expected format of - a custom pre-trained token embedding file is: - - '...\\\\n - ...\\\\n...' - - where k is the length of the embedding vector `vec_len`. 
- - - Parameters - ---------- - pretrain_file_path : str - The path to the custom pre-trained token embedding file. - elem_delim : str, default ' ' - The delimiter for splitting a token and every embedding vector element - value on the same line of the custom pre-trained token embedding file. - unknown_vec : callback - The callback used to initialize the embedding vector for the unknown - token. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - vec_len : int - The length of the embedding vector for each token. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each - token's index to an embedding vector. The largest valid index maps - to the initialized embedding vector for every reserved token, such as an - unknown_token token and a padding token. - """ - - def __init__(self, pretrained_file_path, elem_delim=' ', encoding='utf8', - init_unknown_vec=nd.zeros, **kwargs): - super(CustomEmbedding, self).__init__(**kwargs) - self._load_embedding(pretrained_file_path, elem_delim, init_unknown_vec, - encoding) diff --git a/python/mxnet/text/glossary.py b/python/mxnet/text/glossary.py deleted file mode 100644 index 941732e67918..000000000000 --- a/python/mxnet/text/glossary.py +++ /dev/null @@ -1,142 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 - -"""Index text tokens and load their embeddings.""" -from __future__ import absolute_import -from __future__ import print_function - -from .. import ndarray as nd -from .embedding import TokenEmbedding - - -class Glossary(TokenEmbedding): - """Indexing and embedding for text tokens in a glossary. - - - For each indexed token in a glossary, an embedding vector will be associated - with it. Such embedding vectors can be loaded from externally hosted or - custom pre-trained token embedding files, such as via instances of - :class:`~mxnet.text.embedding.TokenEmbedding`. - - - Parameters - ---------- - counter : collections.Counter or None, default None - Counts text token frequencies in the text data. Its keys will be indexed - according to frequency thresholds such as `most_freq_count` and - `min_freq`. Keys of `counter`, `unknown_token`, and values of - `reserved_tokens` must be of the same hashable type. Examples: str, int, - and tuple. 
- token_embeddings : instance or list of :class:`~TokenEmbedding` - One or multiple pre-trained token embeddings to load. If it is a list of - multiple embeddings, these embedding vectors will be concatenated for - each token. - most_freq_count : None or int, default None - The maximum possible number of the most frequent tokens in the keys of - `counter` that can be indexed. Note that this argument does not count - any token from `reserved_tokens`. If this argument is None or larger - than its largest possible value restricted by `counter` and - `reserved_tokens`, this argument becomes positive infinity. - min_freq : int, default 1 - The minimum frequency required for a token in the keys of `counter` to - be indexed. - unknown_token : hashable object, default '' - The representation for any unknown token. In other words, any unknown - token will be indexed as the same representation. Keys of `counter`, - `unknown_token`, and values of `reserved_tokens` must be of the same - hashable type. Examples: str, int, and tuple. - reserved_tokens : list of hashable objects or None, default None - A list of reserved tokens that will always be indexed, such as special - symbols representing padding, beginning of sentence, and end of - sentence. It cannot contain `unknown_token`, or duplicate reserved - tokens. Keys of `counter`, `unknown_token`, and values of - `reserved_tokens` must be of the same hashable type. Examples: str, int, - and tuple. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - vec_len : int - The length of the embedding vector for each token. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each - token's index to an embedding vector. The largest valid index maps - to the initialized embedding vector for every reserved token, such as an - unknown_token token and a padding token. - """ - def __init__(self, counter, token_embeddings, most_freq_count=None, - min_freq=1, unknown_token='', reserved_tokens=None): - - if not isinstance(token_embeddings, list): - token_embeddings = [token_embeddings] - - # Sanity checks. - for embed in token_embeddings: - assert isinstance(embed, TokenEmbedding), \ - 'The parameter `token_embeddings` must be an instance or a ' \ - 'list of instances of `mxnet.text.embedding.TextEmbed` ' \ - 'whose embedding vectors will be loaded or ' \ - 'concatenated-then-loaded to map to the indexed tokens.' - - # Index tokens from keys of `counter` and reserved tokens. - super(Glossary, self).__init__(counter=counter, - most_freq_count=most_freq_count, - min_freq=min_freq, - unknown_token=unknown_token, - reserved_tokens=reserved_tokens) - - # Set _idx_to_vec so that indices of tokens from keys of `counter` are - # associated with token embedding vectors from `token_embeddings`. - self._set_idx_to_vec_by_embeds(token_embeddings) - - def _set_idx_to_vec_by_embeds(self, token_embeddings): - """Sets the mapping between token indices and token embedding vectors. 
- - - Parameters - ---------- - token_embeddings : an instance or a list of instances of - :class:`~mxnet.text.embedding.TokenEmbedding` - One or multiple pre-trained token embeddings to load. If it is a - list of multiple embeddings, these embedding vectors will be - concatenated for each token. - """ - - self._vec_len = sum(embed.vec_len for embed in token_embeddings) - self._idx_to_vec = nd.zeros(shape=(len(self), self.vec_len)) - - col_start = 0 - # Concatenate all the embedding vectors in token_embeddings. - for embed in token_embeddings: - col_end = col_start + embed.vec_len - # Cancatenate vectors of the unknown token. - self._idx_to_vec[0, col_start:col_end] = embed.idx_to_vec[0] - self._idx_to_vec[1:, col_start:col_end] = embed.get_vecs_by_tokens( - self.idx_to_token[1:]) - col_start = col_end diff --git a/python/mxnet/text/indexer.py b/python/mxnet/text/indexer.py deleted file mode 100644 index bed2794b2941..000000000000 --- a/python/mxnet/text/indexer.py +++ /dev/null @@ -1,231 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=consider-iterating-dictionary - -"""Text token indexer.""" -from __future__ import absolute_import -from __future__ import print_function - -from collections import Counter - -from . import constants as C - - -class TokenIndexer(object): - """Indexing for text tokens. - - - Build indices for the unknown token, reserved tokens, and input counter - keys. Indexed tokens can be used by instances of - :class:`~mxnet.text.embedding.TokenEmbedding`, such as instances of - :class:`~mxnet.text.glossary.Glossary`. - - - Parameters - ---------- - counter : collections.Counter or None, default None - Counts text token frequencies in the text data. Its keys will be indexed - according to frequency thresholds such as `most_freq_count` and - `min_freq`. Keys of `counter`, `unknown_token`, and values of - `reserved_tokens` must be of the same hashable type. Examples: str, int, - and tuple. - most_freq_count : None or int, default None - The maximum possible number of the most frequent tokens in the keys of - `counter` that can be indexed. Note that this argument does not count - any token from `reserved_tokens`. Suppose that there are different - keys of `counter` whose frequency are the same, if indexing all of them - will exceed this argument value, such keys will be indexed one by one - according to their __cmp__() order until the frequency threshold is - met. If this argument is None or larger than its largest possible value - restricted by `counter` and `reserved_tokens`, this argument has no - effect. - min_freq : int, default 1 - The minimum frequency required for a token in the keys of `counter` to - be indexed. 
- unknown_token : hashable object, default '' - The representation for any unknown token. In other words, any unknown - token will be indexed as the same representation. Keys of `counter`, - `unknown_token`, and values of `reserved_tokens` must be of the same - hashable type. Examples: str, int, and tuple. - reserved_tokens : list of hashable objects or None, default None - A list of reserved tokens that will always be indexed, such as special - symbols representing padding, beginning of sentence, and end of - sentence. It cannot contain `unknown_token`, or duplicate reserved - tokens. Keys of `counter`, `unknown_token`, and values of - `reserved_tokens` must be of the same hashable type. Examples: str, int, - and tuple. - - - Properties - ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices - are aligned. - unknown_token : hashable object - The representation for any unknown token. In other words, any - unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - """ - - def __init__(self, counter=None, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=None): - - # Sanity checks. - assert min_freq > 0, '`min_freq` must be set to a positive value.' - - if reserved_tokens is not None: - reserved_token_set = set(reserved_tokens) - assert unknown_token not in reserved_token_set, \ - '`reserved_token` cannot contain `unknown_token`.' - assert len(reserved_token_set) == len(reserved_tokens), \ - '`reserved_tokens` cannot contain duplicate reserved tokens.' - - self._index_unknown_and_reserved_tokens(unknown_token, reserved_tokens) - - if counter is not None: - self._index_counter_keys(counter, unknown_token, reserved_tokens, - most_freq_count, min_freq) - - def _index_unknown_and_reserved_tokens(self, unknown_token, - reserved_tokens): - """Indexes unknown and reserved tokens.""" - - self._unknown_token = unknown_token - # Thus, constants.UNKNOWN_IDX must be 0. - self._idx_to_token = [unknown_token] - - if reserved_tokens is None: - self._reserved_tokens = None - else: - self._reserved_tokens = reserved_tokens[:] - self._idx_to_token.extend(reserved_tokens) - - self._token_to_idx = {token: idx for idx, token in - enumerate(self._idx_to_token)} - - def _index_counter_keys(self, counter, unknown_token, reserved_tokens, - most_freq_count, min_freq): - """Indexes keys of `counter`. - - - Indexes keys of `counter` according to frequency thresholds such as - `most_freq_count` and `min_freq`. - """ - - assert isinstance(counter, Counter), \ - '`counter` must be an instance of collections.Counter.' 
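        # The lines that follow first collect the unknown and reserved tokens so
        # they are not indexed twice, then sort the counter items by token key and
        # stable-sort them by frequency in descending order, so that tokens with
        # equal frequency are indexed in a deterministic order (their key order).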
- - unknown_and_reserved_tokens = set(reserved_tokens) \ - if reserved_tokens is not None else set() - unknown_and_reserved_tokens.add(unknown_token) - - token_freqs = sorted(counter.items(), key=lambda x: x[0]) - token_freqs.sort(key=lambda x: x[1], reverse=True) - - token_cap = len(unknown_and_reserved_tokens) + ( - len(counter) if most_freq_count is None else most_freq_count) - - for token, freq in token_freqs: - if freq < min_freq or len(self._idx_to_token) == token_cap: - break - if token not in unknown_and_reserved_tokens: - self._idx_to_token.append(token) - self._token_to_idx[token] = len(self._idx_to_token) - 1 - - def __len__(self): - return len(self.idx_to_token) - - @property - def token_to_idx(self): - return self._token_to_idx - - @property - def idx_to_token(self): - return self._idx_to_token - - @property - def unknown_token(self): - return self._unknown_token - - @property - def reserved_tokens(self): - return self._reserved_tokens - - def to_indices(self, tokens): - """Converts tokens to indices according to the text indexer. - - - Parameters - ---------- - tokens : str or list of strs - A source token or tokens to be converted. - - - Returns - ------- - int or list of ints - A token index or a list of token indices according to the text - indexer. - """ - - to_reduce = False - if not isinstance(tokens, list): - tokens = [tokens] - to_reduce = True - - indices = [self.token_to_idx[token] if token in self.token_to_idx - else C.UNKNOWN_IDX for token in tokens] - - return indices[0] if to_reduce else indices - - def to_tokens(self, indices): - """Converts token indices to tokens according to the text indexer. - - - Parameters - ---------- - indices : int or list of ints - A source token index or token indices to be converted. - - - Returns - ------- - str or list of strs - A token or a list of tokens according to the text indexer. - """ - - to_reduce = False - if not isinstance(indices, list): - indices = [indices] - to_reduce = True - - max_idx = len(self.idx_to_token) - 1 - - tokens = [] - for idx in indices: - if not isinstance(idx, int) or idx > max_idx: - raise ValueError('Token index %d in the provided `indices` is ' - 'invalid.' % idx) - else: - tokens.append(self.idx_to_token[idx]) - - return tokens[0] if to_reduce else tokens diff --git a/python/mxnet/text/utils.py b/python/mxnet/text/utils.py deleted file mode 100644 index 91e1b623ed5d..000000000000 --- a/python/mxnet/text/utils.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-# coding: utf-8
-
-"""Provide utilities for text data processing."""
-from __future__ import absolute_import
-from __future__ import print_function
-
-from collections import Counter
-import re
-
-
-def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
-                          to_lower=False, counter_to_update=None):
-    """Counts tokens in the specified string.
-
-    For token_delim='<td>' and seq_delim='<sd>', a specified string of two
-    sequences of tokens may look like::
-
-    <td>token1<td>token2<td>token3<td><sd><td>token4<td>token5<td><sd>
-
-
-    Parameters
-    ----------
-    source_str : str
-        A source string of tokens.
-    token_delim : str, default ' '
-        A token delimiter.
-    seq_delim : str, default '\\n'
-        A sequence delimiter.
-    to_lower : bool, default False
-        Whether to convert `source_str` to lower case.
-    counter_to_update : collections.Counter or None, default None
-        The collections.Counter instance to be updated with the token counts
-        of `source_str`. If None, return a new collections.Counter instance
-        counting tokens from `source_str`.
-
-
-    Returns
-    -------
-    collections.Counter
-        The `counter_to_update` collections.Counter instance after being updated
-        with the token counts of `source_str`. If `counter_to_update` is None,
-        return a new collections.Counter instance counting tokens from
-        `source_str`.
-
-
-    Examples
-    --------
-    >>> source_str = ' Life is great ! \\n life is good . \\n'
-    >>> count_tokens_from_str(source_str, ' ', '\\n', True)
-    Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2})
-    """
-
-    source_str = filter(None,
-                        re.split(token_delim + '|' + seq_delim, source_str))
-    if to_lower:
-        source_str = [t.lower() for t in source_str]
-
-    if counter_to_update is None:
-        return Counter(source_str)
-    else:
-        counter_to_update.update(source_str)
-        return counter_to_update
diff --git a/tests/python/unittest/test_text.py b/tests/python/unittest/test_text.py
deleted file mode 100644
index 96743040fff7..000000000000
--- a/tests/python/unittest/test_text.py
+++ /dev/null
@@ -1,743 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -# coding: utf-8 - -from __future__ import absolute_import -from __future__ import print_function - -from collections import Counter -import unittest - -from common import assertRaises -from mxnet import ndarray as nd -from mxnet.test_utils import * -from mxnet.text import utils -from mxnet.text.glossary import Glossary -from mxnet.text.indexer import TokenIndexer -from mxnet.text.embedding import TokenEmbedding, CustomEmbedding - - -def _get_test_str_of_tokens(token_delim, seq_delim): - seq1 = token_delim + token_delim.join(['Life', 'is', 'great', '!']) \ - + token_delim + seq_delim - seq2 = token_delim + token_delim.join(['life', 'is', 'good', '.']) \ - + token_delim + seq_delim - seq3 = token_delim + token_delim.join(['life', "isn't", 'bad', '.']) \ - + token_delim + seq_delim - seqs = seq1 + seq2 + seq3 - return seqs - - -def _test_count_tokens_from_str_with_delims(token_delim, seq_delim): - source_str = _get_test_str_of_tokens(token_delim, seq_delim) - - cnt1 = utils.count_tokens_from_str(source_str, token_delim, seq_delim, - to_lower=False) - assert cnt1 == Counter( - {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, - "isn't": 1, 'bad': 1}) - - cnt2 = utils.count_tokens_from_str(source_str, token_delim, seq_delim, - to_lower=True) - assert cnt2 == Counter( - {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, - "isn't": 1, 'bad': 1}) - - counter_to_update = Counter({'life': 2}) - - cnt3 = utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=False, - counter_to_update=counter_to_update.copy()) - assert cnt3 == Counter( - {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, - "isn't": 1, 'bad': 1}) - - cnt4 = utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=True, - counter_to_update=counter_to_update.copy()) - assert cnt4 == Counter( - {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, - "isn't": 1, 'bad': 1}) - - -def test_count_tokens_from_str(): - _test_count_tokens_from_str_with_delims(' ', '\n') - _test_count_tokens_from_str_with_delims('IS', 'LIFE') - - -def test_tokens_to_indices(): - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - indexer = TokenIndexer(counter, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=None) - - i1 = indexer.to_indices('c') - assert i1 == 1 - - i2 = indexer.to_indices(['c']) - assert i2 == [1] - - i3 = indexer.to_indices(['', 'non-exist']) - assert i3 == [0, 0] - - i4 = indexer.to_indices(['a', 'non-exist', 'a', 'b']) - assert i4 == [3, 0, 3, 2] - - -def test_indices_to_tokens(): - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - indexer = TokenIndexer(counter, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=None) - - i1 = indexer.to_tokens(1) - assert i1 == 'c' - - i2 = indexer.to_tokens([1]) - assert i2 == ['c'] - - i3 = indexer.to_tokens([0, 0]) - assert i3 == ['', ''] - - i4 = indexer.to_tokens([3, 0, 3, 2]) - assert i4 == ['a', '', 'a', 'b'] - - assertRaises(ValueError, indexer.to_tokens, 100) - - -def test_glove(): - glove_6b_50d = TokenEmbedding.create( - 'glove', pretrained_file_name='glove.6B.50d.txt') - - assert len(glove_6b_50d) == 400001 - assert glove_6b_50d.vec_len == 50 - assert glove_6b_50d.token_to_idx['hi'] == 11084 - assert glove_6b_50d.idx_to_token[11084] == 'hi' - - first_vec_sum = glove_6b_50d.idx_to_vec[0].sum().asnumpy()[0] - assert_almost_equal(first_vec_sum, 0) - - unk_vec_sum = glove_6b_50d.get_vecs_by_tokens( - '').sum().asnumpy()[0] - 
assert_almost_equal(unk_vec_sum, 0) - - unk_vecs_sum = glove_6b_50d.get_vecs_by_tokens( - ['', '']).sum().asnumpy()[0] - assert_almost_equal(unk_vecs_sum, 0) - - -def test_fasttext(): - fasttext_simple = TokenEmbedding.create( - 'fasttext', pretrained_file_name='wiki.simple.vec', - init_unknown_vec=nd.ones) - - assert len(fasttext_simple) == 111052 - assert fasttext_simple.vec_len == 300 - assert fasttext_simple.token_to_idx['hi'] == 3241 - assert fasttext_simple.idx_to_token[3241] == 'hi' - - first_vec_sum = fasttext_simple.idx_to_vec[0].sum().asnumpy()[0] - assert_almost_equal(first_vec_sum, fasttext_simple.vec_len) - - unk_vec_sum = fasttext_simple.get_vecs_by_tokens( - '').sum().asnumpy()[0] - assert_almost_equal(unk_vec_sum, fasttext_simple.vec_len) - - unk_vecs_sum = fasttext_simple.get_vecs_by_tokens( - ['', '']).sum().asnumpy()[0] - assert_almost_equal(unk_vecs_sum, fasttext_simple.vec_len * 2) - - -def _mk_my_pretrain_file(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' - seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' - seqs = seq1 + seq2 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def _mk_my_pretrain_file2(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', - '0.05']) + '\n' - seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n' - seqs = seq1 + seq2 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def _mk_my_pretrain_file3(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' - seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' - seq3 = token_delim.join(['', '1.1', '1.2', '1.3', '1.4', - '1.5']) + '\n' - seqs = seq1 + seq2 + seq3 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def _mk_my_pretrain_file4(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', - '0.05']) + '\n' - seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', - '0.1']) + '\n' - seq3 = token_delim.join(['', '0.11', '0.12', '0.13', '0.14', - '0.15']) + '\n' - seqs = seq1 + seq2 + seq3 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def _mk_my_invalid_pretrain_file(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' - seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' - seq3 = token_delim.join(['c']) + '\n' - seqs = seq1 + seq2 + seq3 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def _mk_my_invalid_pretrain_file2(path, token_delim, pretrain_file): - path = os.path.expanduser(path) - if not os.path.exists(path): - os.makedirs(path) - seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' - seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' - seq3 = token_delim.join(['c', '0.6', '0.7', '0.8']) + '\n' - seqs = 
seq1 + seq2 + seq3 - with open(os.path.join(path, pretrain_file), 'w') as fout: - fout.write(seqs) - - -def test_custom_embed(): - embed_root = '~/.mxnet/embeddings/' - embed_name = 'my_embed' - elem_delim = '/t' - pretrain_file = 'my_pretrain_file.txt' - - _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file) - - pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) - - my_embed = CustomEmbedding(pretrain_file_path, elem_delim) - - assert len(my_embed) == 3 - assert my_embed.vec_len == 5 - assert my_embed.token_to_idx['a'] == 1 - assert my_embed.idx_to_token[1] == 'a' - - first_vec = my_embed.idx_to_vec[0] - assert_almost_equal(first_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) - - unk_vec = my_embed.get_vecs_by_tokens('A') - assert_almost_equal(unk_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) - - a_vec = my_embed.get_vecs_by_tokens('A', lower_case_backup=True) - assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5])) - - unk_vecs = my_embed.get_vecs_by_tokens(['', '']) - assert_almost_equal(unk_vecs.asnumpy(), - np.array([[0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]])) - - # Test loaded unknown vectors. - pretrain_file2 = 'my_pretrain_file2.txt' - _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file2) - pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2) - my_embed2 = CustomEmbedding(pretrain_file_path, elem_delim, - init_unknown_vec=nd.ones, - unknown_token='') - unk_vec2 = my_embed2.get_vecs_by_tokens('') - assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) - unk_vec2 = my_embed2.get_vecs_by_tokens('') - assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) - - my_embed3 = CustomEmbedding(pretrain_file_path, elem_delim, - init_unknown_vec=nd.ones, - unknown_token='') - unk_vec3 = my_embed3.get_vecs_by_tokens('') - assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) - unk_vec3 = my_embed3.get_vecs_by_tokens('') - assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) - - # Test error handling. 
- invalid_pretrain_file = 'invalid_pretrain_file.txt' - _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name), - elem_delim, invalid_pretrain_file) - pretrain_file_path = os.path.join(embed_root, embed_name, - invalid_pretrain_file) - assertRaises(AssertionError, CustomEmbedding, pretrain_file_path, - elem_delim) - - invalid_pretrain_file2 = 'invalid_pretrain_file2.txt' - _mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name), - elem_delim, invalid_pretrain_file2) - pretrain_file_path = os.path.join(embed_root, embed_name, - invalid_pretrain_file2) - assertRaises(AssertionError, CustomEmbedding, pretrain_file_path, - elem_delim) - - -def test_token_indexer(): - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - g1 = TokenIndexer(counter, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=None) - assert len(g1) == 5 - assert g1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, - 'some_word$': 4} - assert g1.idx_to_token[1] == 'c' - assert g1.unknown_token == '' - assert g1.reserved_tokens is None - - g2 = TokenIndexer(counter, most_freq_count=None, min_freq=2, - unknown_token='', reserved_tokens=None) - assert len(g2) == 3 - assert g2.token_to_idx == {'': 0, 'c': 1, 'b': 2} - assert g2.idx_to_token[1] == 'c' - assert g2.unknown_token == '' - assert g2.reserved_tokens is None - - g3 = TokenIndexer(counter, most_freq_count=None, min_freq=100, - unknown_token='', reserved_tokens=None) - assert len(g3) == 1 - assert g3.token_to_idx == {'': 0} - assert g3.idx_to_token[0] == '' - assert g3.unknown_token == '' - assert g3.reserved_tokens is None - - g4 = TokenIndexer(counter, most_freq_count=2, min_freq=1, - unknown_token='', reserved_tokens=None) - assert len(g4) == 3 - assert g4.token_to_idx == {'': 0, 'c': 1, 'b': 2} - assert g4.idx_to_token[1] == 'c' - assert g4.unknown_token == '' - assert g4.reserved_tokens is None - - g5 = TokenIndexer(counter, most_freq_count=3, min_freq=1, - unknown_token='', reserved_tokens=None) - assert len(g5) == 4 - assert g5.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3} - assert g5.idx_to_token[1] == 'c' - assert g5.unknown_token == '' - assert g5.reserved_tokens is None - - g6 = TokenIndexer(counter, most_freq_count=100, min_freq=1, - unknown_token='', reserved_tokens=None) - assert len(g6) == 5 - assert g6.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, - 'some_word$': 4} - assert g6.idx_to_token[1] == 'c' - assert g6.unknown_token == '' - assert g6.reserved_tokens is None - - g7 = TokenIndexer(counter, most_freq_count=1, min_freq=2, - unknown_token='', reserved_tokens=None) - assert len(g7) == 2 - assert g7.token_to_idx == {'': 0, 'c': 1} - assert g7.idx_to_token[1] == 'c' - assert g7.unknown_token == '' - assert g7.reserved_tokens is None - - assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None, - min_freq=0, unknown_token='', - reserved_tokens=['b']) - - assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None, - min_freq=1, unknown_token='', - reserved_tokens=['b', 'b']) - - assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None, - min_freq=1, unknown_token='', - reserved_tokens=['b', '']) - - g8 = TokenIndexer(counter, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=['b']) - assert len(g8) == 5 - assert g8.token_to_idx == {'': 0, 'b': 1, 'c': 2, 'a': 3, - 'some_word$': 4} - assert g8.idx_to_token[1] == 'b' - assert g8.unknown_token == '' - assert g8.reserved_tokens == ['b'] - - g9 = TokenIndexer(counter, most_freq_count=None, 
min_freq=2, - unknown_token='', reserved_tokens=['b', 'a']) - assert len(g9) == 4 - assert g9.token_to_idx == {'': 0, 'b': 1, 'a': 2, 'c': 3} - assert g9.idx_to_token[1] == 'b' - assert g9.unknown_token == '' - assert g9.reserved_tokens == ['b', 'a'] - - g10 = TokenIndexer(counter, most_freq_count=None, min_freq=100, - unknown_token='', reserved_tokens=['b', 'c']) - assert len(g10) == 3 - assert g10.token_to_idx == {'': 0, 'b': 1, 'c': 2} - assert g10.idx_to_token[1] == 'b' - assert g10.unknown_token == '' - assert g10.reserved_tokens == ['b', 'c'] - - g11 = TokenIndexer(counter, most_freq_count=1, min_freq=2, - unknown_token='', reserved_tokens=['', 'b']) - assert len(g11) == 4 - assert g11.token_to_idx == {'': 0, '': 1, 'b': 2, 'c': 3} - assert g11.idx_to_token[1] == '' - assert g11.unknown_token == '' - assert g11.reserved_tokens == ['', 'b'] - - g12 = TokenIndexer(counter, most_freq_count=None, min_freq=2, - unknown_token='b', reserved_tokens=['']) - assert len(g12) == 3 - assert g12.token_to_idx == {'b': 0, '': 1, 'c': 2} - assert g12.idx_to_token[1] == '' - assert g12.unknown_token == 'b' - assert g12.reserved_tokens == [''] - - g13 = TokenIndexer(counter, most_freq_count=None, min_freq=2, - unknown_token='a', reserved_tokens=['']) - assert len(g13) == 4 - assert g13.token_to_idx == {'a': 0, '': 1, 'c': 2, 'b': 3} - assert g13.idx_to_token[1] == '' - assert g13.unknown_token == 'a' - assert g13.reserved_tokens == [''] - - counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), - ('c', 'c'), ('c', 'c'), ('c', 'c'), - ('some_word$', 'some_word$')]) - - g14 = TokenIndexer(counter_tuple, most_freq_count=None, min_freq=1, - unknown_token=('', ''), reserved_tokens=None) - assert len(g14) == 5 - assert g14.token_to_idx == {('', ''): 0, ('c', 'c'): 1, - ('b', 'b'): 2, ('a', 'a'): 3, - ('some_word$', 'some_word$'): 4} - assert g14.idx_to_token[1] == ('c', 'c') - assert g14.unknown_token == ('', '') - assert g14.reserved_tokens is None - - -def test_glossary_with_one_embed(): - embed_root = '~/.mxnet/embeddings/' - embed_name = 'my_embed' - elem_delim = '/t' - pretrain_file = 'my_pretrain_file1.txt' - - _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file) - - pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) - - my_embed = CustomEmbedding(pretrain_file_path, elem_delim, - init_unknown_vec=nd.ones) - - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - g1 = Glossary(counter, my_embed, most_freq_count=None, min_freq=1, - unknown_token='', reserved_tokens=['']) - - assert g1.token_to_idx == {'': 0, '': 1, 'c': 2, 'b': 3, 'a': 4, - 'some_word$': 5} - assert g1.idx_to_token == ['', '', 'c', 'b', 'a', 'some_word$'] - - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [0.6, 0.7, 0.8, 0.9, 1], - [0.1, 0.2, 0.3, 0.4, 0.5], - [1, 1, 1, 1, 1]]) - ) - - assert g1.vec_len == 5 - assert g1.reserved_tokens == [''] - - assert_almost_equal(g1.get_vecs_by_tokens('c').asnumpy(), - np.array([1, 1, 1, 1, 1]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['c']).asnumpy(), - np.array([[1, 1, 1, 1, 1]]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['a', 'not_exist']).asnumpy(), - np.array([[0.1, 0.2, 0.3, 0.4, 0.5], - [1, 1, 1, 1, 1]]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['a', 'b']).asnumpy(), - np.array([[0.1, 0.2, 0.3, 0.4, 0.5], - [0.6, 0.7, 0.8, 0.9, 1]]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['A', 'b']).asnumpy(), - np.array([[1, 1, 1, 
1, 1], - [0.6, 0.7, 0.8, 0.9, 1]]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['A', 'b'], - lower_case_backup=True).asnumpy(), - np.array([[0.1, 0.2, 0.3, 0.4, 0.5], - [0.6, 0.7, 0.8, 0.9, 1]]) - ) - - g1.update_token_vectors(['a', 'b'], - nd.array([[2, 2, 2, 2, 2], - [3, 3, 3, 3, 3]]) - ) - - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 3, 3, 3, 3], - [2, 2, 2, 2, 2], - [1, 1, 1, 1, 1]]) - ) - - assertRaises(ValueError, g1.update_token_vectors, 'unknown$$$', - nd.array([0, 0, 0, 0, 0])) - - assertRaises(AssertionError, g1.update_token_vectors, '', - nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) - - assertRaises(AssertionError, g1.update_token_vectors, '', - nd.array([0])) - - g1.update_token_vectors([''], - nd.array([0, 0, 0, 0, 0]) - ) - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[0, 0, 0, 0, 0], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 3, 3, 3, 3], - [2, 2, 2, 2, 2], - [1, 1, 1, 1, 1]]) - ) - g1.update_token_vectors([''], - nd.array([[10, 10, 10, 10, 10]]) - ) - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[10, 10, 10, 10, 10], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 3, 3, 3, 3], - [2, 2, 2, 2, 2], - [1, 1, 1, 1, 1]]) - ) - g1.update_token_vectors('', - nd.array([0, 0, 0, 0, 0]) - ) - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[0, 0, 0, 0, 0], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 3, 3, 3, 3], - [2, 2, 2, 2, 2], - [1, 1, 1, 1, 1]]) - ) - g1.update_token_vectors('', - nd.array([[10, 10, 10, 10, 10]]) - ) - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[10, 10, 10, 10, 10], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 3, 3, 3, 3], - [2, 2, 2, 2, 2], - [1, 1, 1, 1, 1]]) - ) - - -def test_glossary_with_two_embeds(): - embed_root = '.' 
- embed_name = 'my_embed' - elem_delim = '/t' - pretrain_file1 = 'my_pretrain_file1.txt' - pretrain_file2 = 'my_pretrain_file2.txt' - - _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file1) - _mk_my_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file2) - - pretrain_file_path1 = os.path.join(embed_root, embed_name, pretrain_file1) - pretrain_file_path2 = os.path.join(embed_root, embed_name, pretrain_file2) - - my_embed1 = CustomEmbedding(pretrain_file_path1, elem_delim, - init_unknown_vec=nd.ones) - my_embed2 = CustomEmbedding(pretrain_file_path2, elem_delim) - - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - g1 = Glossary(counter, [my_embed1, my_embed2], most_freq_count=None, - min_freq=1, unknown_token='', reserved_tokens=None) - - assert g1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, - 'some_word$': 4} - assert g1.idx_to_token == ['', 'c', 'b', 'a', 'some_word$'] - - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], - [0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], - [0.1, 0.2, 0.3, 0.4, 0.5, - 0.01, 0.02, 0.03, 0.04, 0.05], - [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) - ) - - assert g1.vec_len == 10 - assert g1.reserved_tokens is None - assert_almost_equal(g1.get_vecs_by_tokens('c').asnumpy(), - np.array([1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1]) - ) - - assert_almost_equal(g1.get_vecs_by_tokens(['b', 'not_exist']).asnumpy(), - np.array([[0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) - ) - - g1.update_token_vectors(['a', 'b'], - nd.array([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2], - [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]) - ) - assert_almost_equal(g1.idx_to_vec.asnumpy(), - np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], - [3, 3, 3, 3, 3, 3, 3, 3, 3, 3], - [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], - [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) - ) - - # Test loaded unknown tokens - pretrain_file3 = 'my_pretrain_file3.txt' - pretrain_file4 = 'my_pretrain_file4.txt' - - _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file3) - _mk_my_pretrain_file4(os.path.join(embed_root, embed_name), elem_delim, - pretrain_file4) - - pretrain_file_path3 = os.path.join(embed_root, embed_name, pretrain_file3) - pretrain_file_path4 = os.path.join(embed_root, embed_name, pretrain_file4) - - my_embed3 = CustomEmbedding(pretrain_file_path3, elem_delim, - init_unknown_vec=nd.ones, - unknown_token='') - my_embed4 = CustomEmbedding(pretrain_file_path4, elem_delim, - unknown_token='') - - g2 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, - min_freq=1, unknown_token='', reserved_tokens=None) - assert_almost_equal(g2.idx_to_vec.asnumpy(), - np.array([[1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.06, 0.07, 0.08, 0.09, 0.1], - [0.6, 0.7, 0.8, 0.9, 1, - 0.11, 0.12, 0.13, 0.14, 0.15], - [0.1, 0.2, 0.3, 0.4, 0.5, - 0.01, 0.02, 0.03, 0.04, 0.05], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15]]) - ) - - g3 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, - min_freq=1, unknown_token='', reserved_tokens=None) - assert_almost_equal(g3.idx_to_vec.asnumpy(), - np.array([[1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.06, 0.07, 0.08, 0.09, 0.1], - [0.6, 0.7, 0.8, 0.9, 1, - 0.11, 0.12, 0.13, 0.14, 0.15], - [0.1, 0.2, 0.3, 0.4, 0.5, - 0.01, 0.02, 0.03, 0.04, 0.05], - [1.1, 
1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15]]) - ) - - g4 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, - min_freq=1, unknown_token='', reserved_tokens=None) - assert_almost_equal(g4.idx_to_vec.asnumpy(), - np.array([[1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.06, 0.07, 0.08, 0.09, 0.1], - [0.6, 0.7, 0.8, 0.9, 1, - 0.11, 0.12, 0.13, 0.14, 0.15], - [0.1, 0.2, 0.3, 0.4, 0.5, - 0.01, 0.02, 0.03, 0.04, 0.05], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15]]) - ) - - counter2 = Counter(['b', 'b', 'c', 'c', 'c', 'some_word$']) - - g5 = Glossary(counter2, [my_embed3, my_embed4], most_freq_count=None, - min_freq=1, unknown_token='a', reserved_tokens=None) - assert g5.token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3} - assert g5.idx_to_token == ['a', 'c', 'b', 'some_word$'] - assert_almost_equal(g5.idx_to_vec.asnumpy(), - np.array([[1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.06, 0.07, 0.08, 0.09, 0.1], - [0.6, 0.7, 0.8, 0.9, 1, - 0.11, 0.12, 0.13, 0.14, 0.15], - [1.1, 1.2, 1.3, 1.4, 1.5, - 0.11, 0.12, 0.13, 0.14, 0.15]]) - ) - - -def test_get_embedding_names_and_pretrain_files(): - assert len(TokenEmbedding.get_embedding_and_pretrained_file_names( - embedding_name='fasttext')) == 3 - - assert len(TokenEmbedding.get_embedding_and_pretrained_file_names( - embedding_name='glove')) == 10 - - reg = TokenEmbedding.get_embedding_and_pretrained_file_names( - embedding_name=None) - - assert len(reg['glove']) == 10 - assert len(reg['fasttext']) == 3 - - assertRaises(KeyError, - TokenEmbedding.get_embedding_and_pretrained_file_names, - 'unknown$$') - - -if __name__ == '__main__': - import nose - nose.runmodule()
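
For reference, the snippet below is a minimal usage sketch of the APIs removed by this revert, pieced together only from the deleted `utils.py` and `indexer.py` shown above. It assumes a build that still contains the reverted commit (so `mxnet.text` is importable), and the `'<unk>'` default and index 0 for unknown tokens follow the constructor default and the `constants.UNKNOWN_IDX` comment in the deleted indexer code.

```python
# Minimal sketch of the deleted mxnet.text token-counting and indexing APIs.
# Assumes a build predating this revert, where mxnet.text is still available.
from collections import Counter

from mxnet.text import utils
from mxnet.text.indexer import TokenIndexer

# Count whitespace/newline-delimited tokens, lower-casing them first.
source_str = ' Life is great ! \n life is good . \n'
counter = utils.count_tokens_from_str(source_str, ' ', '\n', to_lower=True)
assert isinstance(counter, Counter)

# Build an index over all counted tokens; out-of-vocabulary tokens map to
# index 0, which is reserved for the unknown token.
indexer = TokenIndexer(counter, most_freq_count=None, min_freq=1,
                       unknown_token='<unk>', reserved_tokens=None)
print(indexer.to_indices(['life', 'is', 'never-seen']))  # last entry is 0
print(indexer.to_tokens([0, 1]))                         # index 0 -> '<unk>'
```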