From 9e4f9d18ec6827799f5b787f4c0697df7c017ef2 Mon Sep 17 00:00:00 2001 From: Aston Zhang <22279212+astonzhang@users.noreply.github.com> Date: Thu, 15 Mar 2018 09:39:05 -0700 Subject: [PATCH] Add vocabulary and embedding (#10074) * [MXNET-67] Sync master with v1.1.0 branch (#10031) * [REVIEW REQUIRED] Revert PR #9484 & add additional dependency licenses to LICENSE file (#9701) * Revert "[Review Required] Fixing Licenses: Cleaning up the Top Level LICENSE file (#9484)" This reverts commit 8930d96b265560a797c5554a9617f607cea7740f. * Some more LICENSE fixes * Adding some more packages to the LICENSE file * Adding dependencies of dependencies * update v1.1.0 change log to NEWS.md * sync README.md from v1.1.0 branch * revert to correct jenkins url in README * Parallelization for ROIpooling OP (#9958) * parallelization for roipooling * remove some useless computation * remove useless muls * add author and retriggering * retrigger again * comments to copy and copyto are corrected (#10040) * Bug Fix and performance optimized for rtc (#10018) * Bug Fix and performance optimized for rtc 1. "super().__init__()" bug is fixed in python 2. 2. Kernel is initialized in the stage of operator init. * Update custom_softmax_rtc.py fix unnessesary format * set embedding * Code and test revised * api implementation done * license and news * readme and cpp * pylint disable * Add API doc * less pylint disable * remove contrib * move to gluon, revise api doc * fix import order * re-test * relative imports * re-run test * revise implementation, test case, and api doc * re-test --- docs/api/python/gluon/text.md | 347 +++++++++++ python/mxnet/gluon/text/__init__.py | 26 + python/mxnet/gluon/text/_constants.py | 704 +++++++++++++++++++++++ python/mxnet/gluon/text/embedding.py | 581 +++++++++++++++++++ python/mxnet/gluon/text/utils.py | 77 +++ python/mxnet/gluon/text/vocab.py | 323 +++++++++++ tests/python/unittest/test_gluon_text.py | 681 ++++++++++++++++++++++ 7 files changed, 2739 insertions(+) create mode 100644 docs/api/python/gluon/text.md create mode 100644 python/mxnet/gluon/text/__init__.py create mode 100644 python/mxnet/gluon/text/_constants.py create mode 100644 python/mxnet/gluon/text/embedding.py create mode 100644 python/mxnet/gluon/text/utils.py create mode 100644 python/mxnet/gluon/text/vocab.py create mode 100644 tests/python/unittest/test_gluon_text.py diff --git a/docs/api/python/gluon/text.md b/docs/api/python/gluon/text.md new file mode 100644 index 000000000000..f9fcf3353563 --- /dev/null +++ b/docs/api/python/gluon/text.md @@ -0,0 +1,347 @@ +# Gluon Text API + +## Overview + +The `mxnet.gluon.text` APIs refer to classes and functions related to text data processing, such +as bulding indices and loading pre-trained embedding vectors for text tokens and storing them in the +`mxnet.ndarray.NDArray` format. + +This document lists the text APIs in `mxnet.gluon`: + +```eval_rst +.. autosummary:: + :nosignatures: + + mxnet.gluon.text.embedding + mxnet.gluon.text.vocab + mxnet.gluon.text.utils +``` + +All the code demonstrated in this document assumes that the following modules or packages are +imported. + +```python +>>> from mxnet import gluon +>>> from mxnet import nd +>>> from mxnet.gluon import text +>>> import collections + +``` + +### Indexing words and using pre-trained word embeddings in `gluon` + +As a common use case, let us index words, attach pre-trained word embeddings for them, and use +such embeddings in `gluon` in just a few lines of code. 
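+
+As a quick preview, the complete workflow sketched below uses the same toy data set and the same
+pre-trained file that the rest of this document then walks through step by step.
+
+```python
+>>> # Compact preview of the steps explained in detail below.
+>>> counter = text.count_tokens_from_str(" hello world \n hello nice world \n hi world \n")
+>>> my_vocab = text.Vocabulary(counter)
+>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec')
+>>> my_vocab.set_embedding(fasttext)
+>>> my_vocab.embedding[['hello', 'world']]  # embedding vectors of the two indexed words
+
+```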
+
+To begin with, suppose that we have a simple text data set in the string format. We can count word
+frequency in the data set.
+
+```python
+>>> text_data = " hello world \n hello nice world \n hi world \n"
+>>> counter = text.count_tokens_from_str(text_data)
+
+```
+
+The obtained `counter` has key-value pairs whose keys are words and values are word frequencies.
+This allows us to filter out infrequent words (see details at
+[Vocabulary API specifications](#mxnet.gluon.text.vocab.Vocabulary)).
+Suppose that we want to build indices for all the keys in `counter`. We need a Vocabulary instance
+with `counter` as its argument.
+
+```python
+>>> my_vocab = text.Vocabulary(counter)
+
+```
+
+To attach word embeddings to the indexed words in `my_vocab`, let us go on to create a fastText
+word embedding instance by specifying the embedding name `fasttext` and the pre-trained file name
+`wiki.simple.vec`.
+
+```python
+>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec')
+
+```
+
+We can then attach the word embedding `fasttext` to the indexed words in `my_vocab`.
+
+```python
+>>> my_vocab.set_embedding(fasttext)
+
+```
+
+Now we are ready to access the fastText word embedding vectors for indexed words, such as 'hello'
+and 'world'.
+
+```python
+>>> my_vocab.embedding[['hello', 'world']]
+
+[[  3.95669997e-01   2.14540005e-01  -3.53889987e-02  -2.42990002e-01
+  ...
+  -7.54180014e-01  -3.14429998e-01   2.40180008e-02  -7.61009976e-02]
+ [  1.04440004e-01  -1.08580001e-01   2.72119999e-01   1.32990003e-01
+  ...
+  -3.73499990e-01   5.67310005e-02   5.60180008e-01   2.90190000e-02]]
+
+
+```
+
+To demonstrate how to use pre-trained word embeddings in the `gluon` package, let us first obtain
+indices of the words 'hello' and 'world'.
+
+```python
+>>> my_vocab[['hello', 'world']]
+[2, 1]
+
+```
+
+We can obtain the vector representation for the words 'hello' and 'world' by specifying their
+indices (2 and 1) and the weight matrix `my_vocab.embedding.idx_to_vec` in
+`mxnet.gluon.nn.Embedding`.
+
+```python
+>>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
+>>> layer = gluon.nn.Embedding(input_dim, output_dim)
+>>> layer.initialize()
+>>> layer.weight.set_data(my_vocab.embedding.idx_to_vec)
+>>> layer(nd.array([2, 1]))
+
+[[  3.95669997e-01   2.14540005e-01  -3.53889987e-02  -2.42990002e-01
+  ...
+  -7.54180014e-01  -3.14429998e-01   2.40180008e-02  -7.61009976e-02]
+ [  1.04440004e-01  -1.08580001e-01   2.72119999e-01   1.32990003e-01
+  ...
+  -3.73499990e-01   5.67310005e-02   5.60180008e-01   2.90190000e-02]]
+
+
+```
+
+## Vocabulary
+
+The vocabulary builds indices for text tokens, and token embeddings can be attached to it. The
+input counter, whose keys are the candidate tokens to index, may be obtained via
+[`count_tokens_from_str`](#mxnet.gluon.text.utils.count_tokens_from_str).
+
+
+```eval_rst
+.. currentmodule:: mxnet.gluon.text.vocab
+.. autosummary::
+    :nosignatures:
+
+    Vocabulary
+```
+
+Suppose that we have a simple text data set in the string format. We can count word frequency in
+the data set.
+
+```python
+>>> text_data = " hello world \n hello nice world \n hi world \n"
+>>> counter = text.utils.count_tokens_from_str(text_data)
+
+```
+
+The obtained `counter` has key-value pairs whose keys are words and values are word frequencies.
+This allows us to filter out infrequent words. Suppose that we want to build indices for the 2 most
+frequent keys in `counter` with the unknown token representation '(unk)' and a reserved token
+'(pad)'.
+
+```python
+>>> my_vocab = text.Vocabulary(counter, max_size=2, unknown_token='(unk)',
+...                            reserved_tokens=['(pad)'])
+
+```
+
+We can access properties such as `token_to_idx` (mapping tokens to indices), `idx_to_token`
+(mapping indices to tokens), `unknown_token` (representation of any unknown token) and
+`reserved_tokens` (reserved tokens).
+
+
+```python
+>>> my_vocab.token_to_idx
+{'(unk)': 0, '(pad)': 1, 'world': 2, 'hello': 3}
+>>> my_vocab.idx_to_token
+['(unk)', '(pad)', 'world', 'hello']
+>>> my_vocab.unknown_token
+'(unk)'
+>>> my_vocab.reserved_tokens
+['(pad)']
+>>> len(my_vocab)
+4
+>>> my_vocab[['hello', 'world']]
+[3, 2]
+```
+
+Besides the specified unknown token '(unk)' and the reserved token '(pad)', the 2 most frequent
+words 'world' and 'hello' are also indexed.
+
+
+### Attach token embedding to vocabulary
+
+Token embeddings can be attached to a vocabulary instance.
+
+To begin with, suppose that we have a simple text data set in the string format. We can count word
+frequency in the data set.
+
+```python
+>>> text_data = " hello world \n hello nice world \n hi world \n"
+>>> counter = text.count_tokens_from_str(text_data)
+
+```
+
+The obtained `counter` has key-value pairs whose keys are words and values are word frequencies.
+This allows us to filter out infrequent words.
+Suppose that we want to build indices for the 2 most frequent keys in `counter`.
+
+```python
+>>> my_vocab = text.Vocabulary(counter, max_size=2)
+
+```
+
+Let us create a fastText word embedding instance with the pre-trained file `wiki.simple.vec`.
+
+```python
+>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec')
+
+```
+
+We can then attach the word embedding `fasttext` to the indexed words in `my_vocab`.
+
+```python
+>>> my_vocab.set_embedding(fasttext)
+
+```
+
+Now we are ready to access the fastText word embedding vectors for the indexed words.
+
+```python
+>>> my_vocab.embedding[['hello', 'world']]
+
+[[  3.95669997e-01   2.14540005e-01  -3.53889987e-02  -2.42990002e-01
+  ...
+  -7.54180014e-01  -3.14429998e-01   2.40180008e-02  -7.61009976e-02]
+ [  1.04440004e-01  -1.08580001e-01   2.72119999e-01   1.32990003e-01
+  ...
+  -3.73499990e-01   5.67310005e-02   5.60180008e-01   2.90190000e-02]]
+
+
+```
+
+Let us create a GloVe word embedding instance with the pre-trained file `glove.6B.50d.txt`. Then
+we can attach this GloVe embedding to the vocabulary, replacing the previously attached fastText
+embedding.
+
+```python
+>>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt')
+>>> my_vocab.set_embedding(glove)
+
+```
+
+Now we are ready to access the GloVe word embedding vectors for the indexed words.
+
+```python
+>>> my_vocab.embedding[['hello', 'world']]
+
+[[ -0.38497001   0.80092001
+  ...
+   0.048833     0.67203999]
+ [ -0.41486001   0.71847999
+  ...
+  -0.37639001  -0.67541999]]
+
+
+```
+
+If a token is unknown to `my_vocab`, its embedding vector is initialized according to the default
+specification in `glove` (all elements are 0).
+
+```python
+
+>>> my_vocab.embedding['nice']
+
+[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
+  ...
+  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
+
+
+```
+
+
+
+## Text token embedding
+
+To load token embeddings from an externally hosted pre-trained token embedding file, such as those
+of GloVe and FastText, use
+[`embedding.create(embedding_name, file_name)`](#mxnet.gluon.text.embedding.create).
+
+To get all the available `embedding_name` and `file_name`, use
+[`embedding.get_file_names()`](#mxnet.gluon.text.embedding.get_file_names).
+
+```python
+>>> text.embedding.get_file_names()
+{'glove': ['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', ...],
+'fasttext': ['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec', ...]}
+
+```
+
+Alternatively, to load embedding vectors from a custom pre-trained text token embedding file, use
+[`TokenEmbedding.from_file`](#mxnet.gluon.text.embedding.TokenEmbedding.from_file).
+
+
+```eval_rst
+.. currentmodule:: mxnet.gluon.text.embedding
+.. autosummary::
+    :nosignatures:
+
+    register
+    create
+    get_file_names
+    TokenEmbedding
+    GloVe
+    FastText
+```
+
+See [Attach token embedding to vocabulary](#attach-token-embedding-to-vocabulary) for how to attach
+token embeddings to a vocabulary and use the attached token embeddings.
+
+
+### Implement a new text token embedding
+
+To implement a new text token embedding, create a subclass of
+`mxnet.gluon.text.embedding.TokenEmbedding` and add the decorator
+``@mxnet.gluon.text.embedding.TokenEmbedding.register`` before the class definition. See
+[`embedding.py`](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/text/embedding.py)
+for examples.
+
+
+## Text utilities
+
+The following functions provide utilities for text data processing.
+
+```eval_rst
+.. currentmodule:: mxnet.gluon.text.utils
+.. autosummary::
+    :nosignatures:
+
+    count_tokens_from_str
+```
+
+
+## API Reference
+
+
+
+```eval_rst
+
+.. automodule:: mxnet.gluon.text.embedding
+    :members: register, create, get_file_names
+.. autoclass:: mxnet.gluon.text.embedding.TokenEmbedding
+    :members: from_file
+.. autoclass:: mxnet.gluon.text.embedding.GloVe
+.. autoclass:: mxnet.gluon.text.embedding.FastText
+
+.. automodule:: mxnet.gluon.text.vocab
+.. autoclass:: mxnet.gluon.text.vocab.Vocabulary
+    :members: set_embedding, to_tokens
+
+.. automodule:: mxnet.gluon.text.utils
+    :members: count_tokens_from_str
+
+```
+
\ No newline at end of file
diff --git a/python/mxnet/gluon/text/__init__.py b/python/mxnet/gluon/text/__init__.py
new file mode 100644
index 000000000000..3c33272af404
--- /dev/null
+++ b/python/mxnet/gluon/text/__init__.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=wildcard-import
+"""This module includes utilities for indexing and embedding text."""
+
+from .vocab import *
+
+from . import embedding
+
+from .utils import *
diff --git a/python/mxnet/gluon/text/_constants.py b/python/mxnet/gluon/text/_constants.py
new file mode 100644
index 000000000000..3457a294e5da
--- /dev/null
+++ b/python/mxnet/gluon/text/_constants.py
@@ -0,0 +1,704 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 + +"""Constants.""" +from __future__ import absolute_import +from __future__ import print_function + +UNKNOWN_IDX = 0 + +GLOVE_PRETRAINED_FILE_SHA1 = \ + {'glove.42B.300d.zip': 'f8e722b39578f776927465b71b231bae2ae8776a', + 'glove.6B.zip': 'b64e54f1877d2f735bdd000c1d7d771e25c7dfdc', + 'glove.840B.300d.zip': '8084fbacc2dee3b1fd1ca4cc534cbfff3519ed0d', + 'glove.twitter.27B.zip': 'dce69c404025a8312c323197347695e81fd529fc'} + +GLOVE_PRETRAINED_ARCHIVE_SHA1 = \ + {'glove.42B.300d.txt': '876767977d6bd4d947c0f84d44510677bc94612a', + 'glove.6B.50d.txt': '21bf566a9d27f84d253e0cd4d4be9dcc07976a6d', + 'glove.6B.100d.txt': '16b1dbfaf35476790bd9df40c83e2dfbd05312f1', + 'glove.6B.200d.txt': '17d0355ddaa253e298ede39877d1be70f99d9148', + 'glove.6B.300d.txt': '646443dd885090927f8215ecf7a677e9f703858d', + 'glove.840B.300d.txt': '294b9f37fa64cce31f9ebb409c266fc379527708', + 'glove.twitter.27B.25d.txt': + '767d80889d8c8a22ae7cd25e09d0650a6ff0a502', + 'glove.twitter.27B.50d.txt': + '9585f4be97e286339bf0112d0d3aa7c15a3e864d', + 'glove.twitter.27B.100d.txt': + '1bbeab8323c72332bd46ada0fc3c99f2faaa8ca8', + 'glove.twitter.27B.200d.txt': + '7921c77a53aa5977b1d9ce3a7c4430cbd9d1207a'} + +FAST_TEXT_ARCHIVE_SHA1 = \ + {'crawl-300d-2M.zip': 'bb40313d15837ceecc1e879bc954e9be04b17c3c', + 'wiki.aa.zip': '0d85feb259e17d5258f38b2b615a2b87cd628427', + 'wiki.ab.zip': '7a8c555b9cf3837c9b31c901e9e0142209990365', + 'wiki.ace.zip': '51555fccbe53b726f6c86a84d704c026a78dd02f', + 'wiki.ady.zip': '725d2c30c03001c941ac4084549c55c7f8e1d766', + 'wiki.af.zip': '1a18d34e1b60433b837f5850750a44ca3845323d', + 'wiki.ak.zip': 'daecc2303cfd05bc6c33b24d78c14e0d7f33e3a7', + 'wiki.als.zip': '38851192e0b556e566be6c3c93370abf9867e525', + 'wiki.am.zip': '4576e0121448564b07f448e05e287236343f17c1', + 'wiki.ang.zip': '9c03da3b06d4becef5d387b9a61438b9362fc36a', + 'wiki.an.zip': '170f60bdd161cf8e4b5e018acd7d36e8bfc457a6', + 'wiki.arc.zip': 'c8dad8b00865bf736b087e7b323999ab404bda29', + 'wiki.ar.zip': '34e9869daa463fdc5609040ff33a03e67512e9fd', + 'wiki.arz.zip': '2d2790e11e401d46e1bce2970ee5264d5678a32b', + 'wiki.ast.zip': '1136515e2de556c077324bcd42ffe7f40c8d94c6', + 'wiki.as.zip': 'f9efde3e4ccda4a1e93fa275a3210f74036e9e46', + 'wiki.av.zip': '9f8568a3e094a48de4a3b6bea3bdb6fd7e875a08', + 'wiki.ay.zip': 'f09a422cedc6a0f15fbf30d290febe8057de83db', + 'wiki.azb.zip': 'd8895581050b9fdb5a10dfec3e27910a150b6faf', + 'wiki.az.zip': '2a34c2db872597ba3e345ce8b7db138241f9efbf', + 'wiki.bar.zip': 'd6e40135a6f4ba7a07fab11633034eccb1b05d0a', + 'wiki.bat_smg.zip': '5d08bd04f0515a36723776c0682b3de0f11d4264', + 'wiki.ba.zip': '412ac2f3bf9a605e56e2b0990bb0baed41ddf3b0', + 'wiki.bcl.zip': 'd3717cda357e08390cb57a64e07f5c7b7768d5be', + 'wiki.be.zip': 'b691e63b8080af23cc37f5f2b21b3154e464c425', + 'wiki.bg.zip': '08509a510a95e2a8905c19d83faf40d614d2268b', + 'wiki.bh.zip': 'a812600c6454b779d442b7680e3867e15d895095', + 'wiki.bi.zip': 'd0d4a3f57419424815f77b3951ef9c7336f6adf5', + 'wiki.bjn.zip': 
'0d81879ff7611380896eac6059bb677a5b3fe308', + 'wiki.bm.zip': 'f3a2a1a8dbc94973a74343c059595a310a66665b', + 'wiki.bn.zip': 'b3bc70520edf3963c2217873ff5c2537d3545650', + 'wiki.bo.zip': '2be9fe7701d6a8501461df7bd98fee26859cf83a', + 'wiki.bpy.zip': 'd44b9267bb4f86e3e43972a6a952cc0ccf90dd3c', + 'wiki.br.zip': '4bfa66f1ea5aa5cad736eccaa211f6025596bcd6', + 'wiki.bs.zip': '40c560c5994ab50485d08eeaffd88740f30236ab', + 'wiki.bug.zip': 'bc7cd87bb067ac477000259cd4f95f45bfb6e4df', + 'wiki.bxr.zip': '8396fd67ef53f3123540766788a0db54734c4f1a', + 'wiki.ca.zip': '8f5d3caf0f5d223b2771ec44f7e620e396974fb2', + 'wiki.cbk_zam.zip': '0af3be50823b564433455d10c8753df88461458f', + 'wiki.cdo.zip': '19024215aa0c13872c027fc6127b5d7506198b5f', + 'wiki.ceb.zip': '96374428bf36a43983ba4307d7f6fb5ab52a6c6a', + 'wiki.ce.zip': 'b27f1a8da448bc9315e15d4261519c64f00de8eb', + 'wiki.cho.zip': '20944e34c2b58f14adb849dd5a6f5168c7affdea', + 'wiki.chr.zip': 'b7f41ee3fa76e933e0b5ad6b793c507fc19afe98', + 'wiki.chy.zip': '4ef66004a609c724fd7d8aab2877f7634323d43f', + 'wiki.ch.zip': '7f73678b685c9b5f5d6eea9bc00322cfc18d40cb', + 'wiki.ckb.zip': 'b7db2805526ad8bed878af257b32ca9ba814855f', + 'wiki.co.zip': '1b9e19b11763cb87ca00520dbdd6ada565547c9c', + 'wiki.crh.zip': '792003bae25c4471d25721440002c983fa5af020', + 'wiki.cr.zip': '875e4aa0de8a829e57f6c8e13d43cac5103210de', + 'wiki.csb.zip': 'fa776014c4c83487d7cb2485bd08eaf6739d9dca', + 'wiki.cs.zip': 'dca18cb80460522cd281ccc3c9922cf2b3c08b81', + 'wiki.cu.zip': 'ed23b48ba3193181a358d7a73005afa7655a4fc3', + 'wiki.cv.zip': '27ccd50942c9c218e00365ee293fa0c3087a7646', + 'wiki.cy.zip': '78940d5be2969b82c99f785bda2ac5f4e18e149c', + 'wiki.da.zip': 'a45077d9d73328bd6a96efdba1b31ed9a3639dcd', + 'wiki.de.zip': '0d9e4bf80100b46237dcb73cfefe390103e7e827', + 'wiki.diq.zip': '0eef7d9e2f0ce3f100a22dc8fcede9449e466528', + 'wiki.dsb.zip': '903cd80550931effba1d4e52a19c22592837d11c', + 'wiki.dv.zip': '3fa06719641ff33ac8a5439d330a8108521da1e7', + 'wiki.dz.zip': '8bf3937971c3c996493c30b264cb8268627d7bd6', + 'wiki.ee.zip': 'e66bc50013d884fe69f4f67ba44af2e34fe97927', + 'wiki.el.zip': '3015f358036658fb126d42fa794d67a90c5b91ad', + 'wiki.eml.zip': '5be541be6115af5914ac2b8118a09232b771123b', + 'wiki.en.zip': '7f83d578a31a8168423c77ea25ad381494a5e920', + 'wiki.eo.zip': 'e7612df98c37cb872f0edc3c3e21dcd2f80a4d69', + 'wiki.es.zip': '1b7668b23db26810ea433173ce0c11281e801f74', + 'wiki.et.zip': 'aa31004e7b8ebf359e166b8ea6b8e6f77fac190f', + 'wiki.eu.zip': '8d7699451cbac4d69750caa8d58b4740cc72e0ca', + 'wiki.ext.zip': '3aeb4d77c48eb503b26ceb2a76a0a7d841124a71', + 'wiki.fa.zip': '08b6e805c8623fba526143d46f4685549c4380a6', + 'wiki.ff.zip': '64f690eda733a6fb4f794e42eb6ff05f09ec1d38', + 'wiki.fiu_vro.zip': '35c3fdcec0f0dc1ce303212967ea59936641daee', + 'wiki.fi.zip': '252299a2a59cc0ac07ba25f9458afc26bbac669f', + 'wiki.fj.zip': '004d1279c27324d02b961341cf0d6ee06dbe8966', + 'wiki.fo.zip': '12f1d6360d4867cdebcc93be87c024a4709d1af5', + 'wiki.frp.zip': '8a0f636b5440a9aab38014efada9edfdf94150d5', + 'wiki.frr.zip': '7c9e7b8109b98aa39b303dd77d837b37e96d4113', + 'wiki.fr.zip': 'd906e68760153d771e5982009b0150e913254b2d', + 'wiki.fur.zip': 'd5d2ae08696ed074a581eac563a60eb85467a792', + 'wiki.fy.zip': '342609d29882fae0a3b402d8ea1478606be0d93b', + 'wiki.gag.zip': 'f2b91f89dd9b9a1301727476f7823b7260b5f129', + 'wiki.gan.zip': 'd3ad3c1151555266e1feb9f98b066ee31ee5f410', + 'wiki.ga.zip': '798b0c26783c7af05d9c4f899ca9fddafeb1e0a1', + 'wiki.gd.zip': '49085fa182a528bdc51f10e99bef33c88c1e3112', + 'wiki.glk.zip': '9e16727ffcc691483b69ecbcd331b1df2efa4bcd', + 
'wiki.gl.zip': 'c71c7e6601b2cbdc7930982fbeea636deddd107d', + 'wiki.gn.zip': '493ccb583211217ccd23e0a43f42ba773bd94f78', + 'wiki.gom.zip': '45bbd49750ddb7df5afe01fcfd5dda2958934dfa', + 'wiki.got.zip': '669d018f72827fb965e5ef37e224e21f4682b2e5', + 'wiki.gu.zip': '4afe874f7830f693e9f83508fc3fb444b33aebdf', + 'wiki.gv.zip': '9411197eebc07775949d9bb6e440780a68502a5c', + 'wiki.hak.zip': 'cd1e14bd5d50fa764883b148bda5b821375531e0', + 'wiki.haw.zip': 'cacd4eb4e476bdd842e8014764b8ae380b346ed2', + 'wiki.ha.zip': '14acc50950b451f40fe028fd08d042af44732398', + 'wiki.he.zip': 'a9e2cd13bc2e55d83820c529bac1f518a7198bc0', + 'wiki.hif.zip': 'dcdd488239deb0ede807cff263ddc972009c21f5', + 'wiki.hi.zip': '15899ec17985bc0e1db1df497e1b4a51bba1982b', + 'wiki.ho.zip': 'fde454bb4f3841ea5dde2bbf879138305a4d0b36', + 'wiki.hr.zip': 'f5d33ba967f7c56538fa9f5f0093f6d634e9db44', + 'wiki.hsb.zip': '64dc13c7645d2b65b8ba252bd8dfb1c616e8923a', + 'wiki.ht.zip': 'cf50a5cadcf91aba9ab58d095d65f348e2375d12', + 'wiki.hu.zip': 'b27f293caedf81a2d09204b11f52a7c8d7443643', + 'wiki.hy.zip': '641b8666bc2168998989fae1b20a09d3428766bb', + 'wiki.hz.zip': '1639f9f096de6fac84336a784a391ce73e523d62', + 'wiki.ia.zip': '37640aaf8a25c02883190951337b5a6f0157d781', + 'wiki.id.zip': '56ee0c7a38a6d232706932493eaa37b2a87667ee', + 'wiki.ie.zip': '7c3a5d7f96c801570e2305f45a40d401fcc038b9', + 'wiki.ig.zip': '405ebc2e8a959163c9f2f8dd015a0bcefd440111', + 'wiki.ii.zip': '1ec1c7d95d61eeca2dbbd8e432caf88524aaf28e', + 'wiki.ik.zip': 'e9d088c0d8d0ab420d6d0469c6a0fdb668f1833c', + 'wiki.ilo.zip': 'cbc9754978ce55e86da2eb3db20579f4a1f19947', + 'wiki.io.zip': '9e5ab1fd5c4f1094d111f501129e0eecccec69a0', + 'wiki.is.zip': '0744e63636cf794e0a406c922827628a3dd415b7', + 'wiki.it.zip': '29f4eb6a5d7dcf45b02b4d08a4a70dfae4c41200', + 'wiki.iu.zip': 'fb2e8de825d554257768d363a3a09f711afb001b', + 'wiki.jam.zip': '077cfb6de9d025aee4a5b2ea9ce15ada02f10a4f', + 'wiki.ja.zip': '7940f6c2bc490c04902f0faf0562b92cae7136bf', + 'wiki.jbo.zip': '3d086b6c9a369f197516cd0dc699a94612f45c6a', + 'wiki.jv.zip': '2f68cb3436b27a25ddfa40fab3e2cd44574b437e', + 'wiki.kaa.zip': '9fd5df362b7cb615f2267084d8b3fb8608be2693', + 'wiki.kab.zip': '96abf1440ad21de58d7274d3a16885ef4a2efda4', + 'wiki.ka.zip': '72ddb2382c87184fc05a93e89ed8aa4f54a62a0a', + 'wiki.kbd.zip': '81dfc3c6f8581c2aa15342c84688b4ba59b81cc6', + 'wiki.kg.zip': '4d07cabef6f804fc6432d3f630675ed4cbbdd49e', + 'wiki.ki.zip': '59b5c31df227ff9454ad8b3a1d16b065620dbddf', + 'wiki.kj.zip': '751b80c4a4d82dd217d3d2b3905eb39b349874d7', + 'wiki.kk.zip': '7fb733a2405f421a7c49b756381a52965a8af205', + 'wiki.kl.zip': '05a9d5c9bf12d8845356f88b546418d2e40f79c6', + 'wiki.km.zip': 'da0a67028fa0244a2e7257ae259c2f7a7544dc66', + 'wiki.kn.zip': '6cead946350b31fb2f353085fd00b8ea9c9ecc77', + 'wiki.koi.zip': '0c61f83434404267527eaf583e89b4d8bb3a6a65', + 'wiki.ko.zip': 'c0825282faf1e7af6820bd8b28d06c77760dcbe4', + 'wiki.krc.zip': '0df3c3f0f89521299dab741be3d698b2c94c194e', + 'wiki.kr.zip': '71651f046cef420fb28ca15e35720bb7747c4586', + 'wiki.ksh.zip': '8b9ab88baa49e72e40a5a80bef98f3ea2afbdd07', + 'wiki.ks.zip': '02af37f12753662c9e7bcac3b8786dfd2f298710', + 'wiki.ku.zip': 'ca1d370b327ceca025884bf83139456024a3a978', + 'wiki.kv.zip': '28b3617c5566f3182f14bf11a906456b227840ba', + 'wiki.kw.zip': '075a02e8eaae26897c23898fb4d36f4e41e4d1d0', + 'wiki.ky.zip': '771601a934cd4d0a98e5059f6389d2496e8dcf7c', + 'wiki.lad.zip': '2788ba3f275d72299e877c96cde106bd8590f405', + 'wiki.la.zip': '759f6365874442ab8e04d992b047f53ad74231a6', + 'wiki.lbe.zip': 'c8105f1cf8a3d46ccfacff1d40a581f442b3c4a1', + 
'wiki.lb.zip': 'dac5af52364f2c0d3a0c794411465d1254f2fb48', + 'wiki.lez.zip': '17331cb779dee8cb60f2734213af80d57acfcfad', + 'wiki.lg.zip': 'fd4e2d67d1f098474053abc9a1984dfe4a2854b7', + 'wiki.lij.zip': 'c29157f5e4d2b37c01cf6e389f03ddafef6acdb2', + 'wiki.li.zip': '10490e49a12230af2127543da69c427f92c6508f', + 'wiki.lmo.zip': 'cc44163572deddd78af6b006394f623cb21934fc', + 'wiki.ln.zip': 'bf52699c5cbf79bedb2e2856d8a720189b6864f3', + 'wiki.lo.zip': '3fd8a70d8e26071a365f10016875a4a4f15ffcee', + 'wiki.lrc.zip': 'e262b4fcc55cba48d997cd06d006b82a5abe09a9', + 'wiki.ltg.zip': 'df6a83f2fab35f9a2f97fd8d857cb1cfa59f331f', + 'wiki.lt.zip': 'a738a3f29a6a5481082a7a9a41b2040b9cf537e4', + 'wiki.lv.zip': '8e328d99aacaa021fcc51425caebc063e22e6cf4', + 'wiki.mai.zip': 'e909de86c27eced2cb5f02f550da7fc2502b5eda', + 'wiki.map_bms.zip': '192bf6b88f955746abb398893868482730585e3a', + 'wiki.mdf.zip': '3d0d5da3c85bef8ae52f0fd17e314a1960a26d36', + 'wiki.mg.zip': 'fe66055b63ce8771bf43f8dd543bbd967f8ea8b3', + 'wiki.mhr.zip': '33514c98da3bd9602851db96fa3dd8192aac0674', + 'wiki.mh.zip': 'dc77309103c6cfed7ff095b3f9f158e1ae437e71', + 'wiki.min.zip': '8b925eea6df0411ee09baef5801d807cfec8cfa4', + 'wiki.mi.zip': 'd57831e8d7cb2ec260fc9d83d4281f0bacfb29a5', + 'wiki.mk.zip': 'b1fc2d85527e99530a93e3bbc5fa9fcde89910f3', + 'wiki.ml.zip': 'b9d53b8e76a05f5e959afd190da3015b36793297', + 'wiki.mn.zip': '715bf0ee67b48ec872659380fcf63ad006ddcc7e', + 'wiki.mo.zip': 'fb273fe373eb61310051d94ad6911320f573d0ec', + 'wiki.mrj.zip': 'b0d1e43e37e1718c8e05fd81a511095636def361', + 'wiki.mr.zip': '67e942a7742cc957298c8cd0cd0af0531dc936d7', + 'wiki.ms.zip': 'e218f113702b039fc8e80a77b894cd9fa4eff77d', + 'wiki.mt.zip': 'd68d5b636eac07b2e1307186c2c05b9a80e39658', + 'wiki.multi.ar.zip': '31c7b742c63c3367e9bce5c4dca37d5ceb33f1a6', + 'wiki.multi.bg.zip': '8991e8123bce7fd6c8e4510c71ede5715ae36f01', + 'wiki.multi.ca.zip': '0786e071438150485d394a4bf2e976d3a1b313ff', + 'wiki.multi.cs.zip': '7237f291146e69f0fc7002a0e175c7fd003d44e8', + 'wiki.multi.da.zip': '5591c20015191101aee190c02738c99073a8fe76', + 'wiki.multi.de.zip': '986160e51a08f4a93f1573d17352e375cbaedd6d', + 'wiki.multi.el.zip': '570eb12811ce61f6176f263eff3e945be69e7da0', + 'wiki.multi.en.zip': '2c3ef35d8338d4a905e7d10645572ab7a6730d44', + 'wiki.multi.es.zip': 'c1db7c7175665a7230f92ed038b78de780e060e9', + 'wiki.multi.et.zip': '54d0515865c754331b445dd9ba0ae7ed79b770aa', + 'wiki.multi.fi.zip': 'c94abc803a42b89cd75b278114b1f2cf4e2f3ecd', + 'wiki.multi.fr.zip': 'd4904b79eaf8ae386a7011ad84afc9b4238c9928', + 'wiki.multi.he.zip': '370ec2a379eecc2d2e984cde3e0f6d0a027eade7', + 'wiki.multi.hr.zip': 'd3f25ae76b040ffa09e964f6edc55488f6086394', + 'wiki.multi.hu.zip': '4b64bcdf0fc1f01bbd8427bd7bf6b46319308e7a', + 'wiki.multi.id.zip': '3ad5f590d5c847b35a334f1bdb48b9c466f5de68', + 'wiki.multi.it.zip': '18746450e665e96c33f2e2026986f643a27e0945', + 'wiki.multi.mk.zip': '1d899f1449d8729b7dbae226f05151a656694626', + 'wiki.multi.nl.zip': 'ff0a04dbb07c2cdbc61d5a241175e30ed46b48d4', + 'wiki.multi.no.zip': 'd1af729024181e64f58ae37ab233fc53811e2601', + 'wiki.multi.pl.zip': '91c3984c4f3158b1cb1ff11d8cc4f9240631266e', + 'wiki.multi.pt.zip': 'a1782c4fa4337008f82c0e2bf78e4323d145be29', + 'wiki.multi.ro.zip': 'b1a0840d084009ce00c47a3c24c984648dbe8785', + 'wiki.multi.ru.zip': '540607ba4334dab6089de463f974861aac8a35ae', + 'wiki.multi.sk.zip': '2a2bb39e011cf2bf6dcb8cb6c482b8eb9764eea3', + 'wiki.multi.sl.zip': '99442dab442dc196c107868db9174c78e270db1e', + 'wiki.multi.sv.zip': 'b40be83d2d7c27633c712aea62ceec0d409cc03a', + 'wiki.multi.tr.zip': 
'e2bffab1616f54d180ba3d8bfe5e94ec9a489184', + 'wiki.multi.uk.zip': 'e97f64d9ba2b58a5e80c9b896b87340aba1e0eb0', + 'wiki.multi.vi.zip': '532fa24d8787a8906fb04a88e74a713b00cb33ec', + 'wiki.mus.zip': '1bb0cad10889b8a3bfa36c36c7da1f2fb2237bb8', + 'wiki.mwl.zip': 'e3d1fd1fa6290521d403e84eba577e552e330844', + 'wiki.myv.zip': '64a6505691441778766b7941b5e7f45a624a64a5', + 'wiki.my.zip': '491ce8dbf174d4abff758db4950f49eda90883d9', + 'wiki.mzn.zip': '76abf410749fd4516ead20ced891b54245fcd4a3', + 'wiki.nah.zip': '0496592cdd70eaf61b257fb5345843d38f425592', + 'wiki.nap.zip': 'f0df66cdbef5734f0afeb806cda631722fb426d8', + 'wiki.na.zip': '2456e4776b5e985cfaedfac244e0b40cff4e613c', + 'wiki.nds_nl.zip': 'ffd10e05b749281634eb7a758102d8d6ff42760e', + 'wiki.nds.zip': '2455e9fa4294828b25b32bdad7307a105f9fbe1d', + 'wiki-news-300d-1M-subword.zip': '697f4c8f37443be3aee7b96abe28fd7ebec95ef3', + 'wiki-news-300d-1M.zip': '567ef9c2e207be25da23e61312e6ba620da30466', + 'wiki.new.zip': 'a781885678cc1079d4be221c414339eb9bee8d19', + 'wiki.ne.zip': '180b068343288cda40d012aaa99d29459d341eb4', + 'wiki.ng.zip': '6db8111ab700f7b0841af87f1f1453341048014e', + 'wiki.nl.zip': '582420f290947cf38503b7f4b8ea9bb21918005e', + 'wiki.nn.zip': '4a0e30376b361ee19800e6d897a865572e330f84', + 'wiki.nov.zip': 'ac98c0300302019ff855698561708abd81730db3', + 'wiki.no.zip': '6893a7912ab3756e31d09ef1f9023c27c0b047f8', + 'wiki.nrm.zip': 'bd27aadf25a165ebbac486437ea6a06b710fdda6', + 'wiki.nso.zip': 'c55dfebb83351c952831db34e779e0a380212f05', + 'wiki.nv.zip': 'cf122e5ee041287917c594a2cb6cd247978f1ec0', + 'wiki.ny.zip': '9086021a60babd7e87afa469dbadb004523f5fd2', + 'wiki.oc.zip': '15075544cf837135127d8688cd06fb8e4c8b7f3d', + 'wiki.olo.zip': '523628bb652e1563b4dd5a94b518addf10699f74', + 'wiki.om.zip': 'a29360ab3930d889c4eb5b385589f84c1ff9f06e', + 'wiki.or.zip': 'a782e649ae5307dece445b0c11b15ffb9ce88297', + 'wiki.os.zip': '0d76ca005afd48b87dea5c9784c4c48bb51d3e3e', + 'wiki.pag.zip': 'b046ef71badc9d7eec161e3aec2ffc3abb7bad20', + 'wiki.pam.zip': 'abed25ef407e05209f2653d571bba5bc7c66e7b3', + 'wiki.pap.zip': '5d099bfc65c85f824634a191ce33e8e42f947ded', + 'wiki.pa.zip': '2066ed0016720b9f8779f55f2cc2de08511025f6', + 'wiki.pcd.zip': '66914c99e5531c0484448b84568971362cdad0f6', + 'wiki.pdc.zip': '6ed181fa1f8782917ae7849490c0a5cb0b0b9b29', + 'wiki.pfl.zip': '8d271226af8509962b15a96c4d6e41d9aabd972c', + 'wiki.pih.zip': '365955dbecb17027435fe487ab92a7a267fa25bd', + 'wiki.pi.zip': 'eeb863545392c92cff0f3e3d9c3f61539d3fa1dd', + 'wiki.pl.zip': '2b0cae8af2637bc24b958e6757149d1b9f8c8fea', + 'wiki.pms.zip': '9eff2e96e1cb9bf02adf816c4feb5aa3cd1a384f', + 'wiki.pnb.zip': '23f77d1d9469f5b2c342984288cb3092d53d8dee', + 'wiki.pnt.zip': '84cc9532d2fd7b322bcba91e01ac36c9a719e23a', + 'wiki.ps.zip': '18c9ffb2a81cbc25299b26e35170a29b7de9309c', + 'wiki.pt.zip': '37752109a44829de5ea10b173d7c0cecc0b1a0d7', + 'wiki.qu.zip': '5582c07eeeaec10d9382b3ab90d2921fc97fa2e0', + 'wiki.rmy.zip': 'a106ab536001e92e7a9708417faee9418f4058d0', + 'wiki.rm.zip': '67a324941f2b895a418fbd89314a18bfda19b1de', + 'wiki.rn.zip': 'ce17294909c046e90bb0131632e1d795d1771816', + 'wiki.roa_rup.zip': 'a9a378e90cd46353283c92cfb7d34dd485a018d2', + 'wiki.roa_tara.zip': '953fe4cf1667cbb9b3b8e11666885bfedf74b411', + 'wiki.ro.zip': '6bbb0f9452398416d9183e00e6cd091a02fb351f', + 'wiki.rue.zip': 'e9f9b8ab63c7722b4b68e8c465b1c69436132553', + 'wiki.ru.zip': 'f8f68aa5792941d7750b545e56f1ff5127e88cc2', + 'wiki.rw.zip': '018b9fb76fca5ce7a3e1f266df33fcc1bbc50493', + 'wiki.sah.zip': 'f6c94dbd3b719b154217388310fab72e5a69f823', + 
'wiki.sa.zip': '4dc78b48d651056546d14b659c6598770c6bce77', + 'wiki.scn.zip': '218ba35c042cb3e179988bac9acf51cccf37422b', + 'wiki.sco.zip': 'daa8cedbb223e87d48f720aed9ce63dd0c81c632', + 'wiki.sc.zip': '909cc5160cad60fda34ab89c2b87ae4229402eeb', + 'wiki.sd.zip': '5468ed141bf2f1d9b1f8d7b31fee926b496ea9db', + 'wiki.se.zip': '0eb962f8768d88ffcbde3aac833e134a263c2055', + 'wiki.sg.zip': '651035aa74dc2f515253444f48aa9911094f9d27', + 'wiki.sh.zip': 'cf3057b61bd5bca6f47640801681d451aee210cf', + 'wiki.simple.zip': '367737535e39defb0e713a7ff2374cb932c5a9bc', + 'wiki.si.zip': 'cebb2f4011b0d679fe856c5950076e3c48496ecc', + 'wiki.sk.zip': '6c43758d0c0f52351210c558cc33266a65709068', + 'wiki.sl.zip': 'd0239eefc830e5919bef8d9173a884e9e7371e7a', + 'wiki.sm.zip': '2e3cf33f17b449c8f81cc9ea4c84d542cfd23a14', + 'wiki.sn.zip': '4d3844ee350ee0065e5fe910a3f669ef863a2fc9', + 'wiki.so.zip': '9da45db9b21d1f27c4f73152539c1e4fc9b1c49c', + 'wiki.sq.zip': '0db976ec147df49e648cf8256562371d0ae6f2f0', + 'wiki.srn.zip': '120e229d522cc22008c50e0eb74b23d9f6eca51d', + 'wiki.sr.zip': '63b67391158bdd7a642f7d8412771c22e1041744', + 'wiki.ss.zip': '4368f7931f6730a6e8cb9b5794906f2d827582a8', + 'wiki.stq.zip': 'fb1ba577bf6fb7f7fcdc52bf392e63ed8492465d', + 'wiki.st.zip': 'b7e96392b3880c19e210fd42bc72e3f76c07a4c3', + 'wiki.su.zip': '4c4880cfca1ff954c88e44a32f201218eb2be146', + 'wiki.sv.zip': 'e2b10091585f795dd18289c4a65a1da591a78196', + 'wiki.sw.zip': '726631d8998ba1647d040e6b70f4bad7b8d8c367', + 'wiki.szl.zip': 'a70de974cff95cad0443f5faa6c8412c92998100', + 'wiki.ta.zip': '6bafd0bb523f654038393ba191012527745b940b', + 'wiki.tcy.zip': 'b4bd573eaf9fd87300a25648b38a053161d12c39', + 'wiki.tet.zip': '7e5608958977164e544850a5a169f5d55cd47a20', + 'wiki.te.zip': '948e5a6ec13ac95b595c3f52a6e7b9642a56c530', + 'wiki.tg.zip': '5b46429024d6819f6b511a4924b90c958615d40e', + 'wiki.th.zip': 'b8ee0878cec41b4ab1055a17d0ed669de1ed9afd', + 'wiki.ti.zip': 'd55abb74bb3ff195d2293ee9e77886111ee50e52', + 'wiki.tk.zip': '20263f39a31a1d55343f9dea7aecaa2860aefde8', + 'wiki.tl.zip': '2f2b809017249f8c4f8d5eb62979b58f16e8732b', + 'wiki.tn.zip': '0aa11b07b1ad6437bc1e9b6476d51ddd35dad994', + 'wiki.to.zip': '6b90b32ae258a56e67b42736675236b91163b3ad', + 'wiki.tpi.zip': 'ca9591e621ae667a1521d0bb5275435d45e974cc', + 'wiki.tr.zip': '3b6f86c2a115c7adec1b073b1f5624890e680148', + 'wiki.ts.zip': '8a00b16f2881977ad6f8c8665316c27fcab9b842', + 'wiki.tt.zip': '8d2f559bf1e09180d6dc4b127d61815a27670a20', + 'wiki.tum.zip': '5b3f6f3d8cae4d9534cd1fd3afc2f64ec8342b8d', + 'wiki.tw.zip': '7c189fabfcdb2973178c25d35fd10e46ee7148aa', + 'wiki.tyv.zip': '5e3811a19bbf961a5361ac37ff3502287c9ab022', + 'wiki.ty.zip': 'a7f31f8cabf4282533773aa7e63f294315cc85ea', + 'wiki.udm.zip': '643df5ab0914535e46e6839845d0ab585c81a119', + 'wiki.ug.zip': 'a5388269893ac4c7da28b2284f3536ca0f3c9341', + 'wiki.uk.zip': 'fdc9b0a0ab806e5845e9d89b8887ec9d555a0547', + 'wiki.ur.zip': '75579eb5609ea31d79bc2d1bd81d01f48e01bc7c', + 'wiki.uz.zip': 'aa149200f8c6e3e8bb5aa3c67112675d136900b8', + 'wiki.vec.zip': '58c4c9528154e256fbefeb97b8c1675356079f74', + 'wiki.vep.zip': '966b371afcc383058a5fbc6ee8f822620f03feac', + 'wiki.ve.zip': '6450e3ec2c78980c5a41d71ff159aa27918dda75', + 'wiki.vi.zip': 'bfa287fbb358a66b4f9576585df3e46607e1595c', + 'wiki.vls.zip': '7335bfda43890f42e045b8a5de25d1a8629fe012', + 'wiki.vo.zip': 'c2ca18bea165cb1253c1d88fa9958a25088fc84b', + 'wiki.war.zip': '5cda8fdd64e3acf5488ad361b68a63fb23747559', + 'wiki.wa.zip': '2e538c10a0e9f43ea5875c90a8ce01a07c4695a7', + 'wiki.wo.zip': 'f54c65ab63f98ffec7b3fb5bdd51a814034bd673', 
+ 'wiki.wuu.zip': '68d9ad802836737392d62056231bf1b7a58594c9', + 'wiki.xal.zip': 'fb39fed41ccba2e4e58ab7714a53aae3695dbe04', + 'wiki.xh.zip': 'd37caa4d94e66588879231d0826798d8aa4b0a44', + 'wiki.xmf.zip': '956c43bca0d88e9348099cde43d58898e43d9f27', + 'wiki.yi.zip': '151c1670c48e976e4202272b066d7080a8c83615', + 'wiki.yo.zip': 'fdbd0fc6e35bb04c3aef1fa6f0262ba261b11199', + 'wiki.za.zip': '11f6a5dcb49c4d0571d5ac4fb3d7dda1d378fc06', + 'wiki.zea.zip': '22159a722c5c0390bad9206eb75e6e166efe38e9', + 'wiki.zh_classical.zip': 'c689d61d2254caf1ecec0909249523b09a737717', + 'wiki.zh_min_nan.zip': '0516a413565484d924a4c8b50c690d39344cdb64', + 'wiki.zh_yue.zip': '464f4c1c2039194cbae7502ed3a2eeff4df9e34f', + 'wiki.zh.zip': '2374ec566f6411b9bb570077636695fe9768a5ba', + 'wiki.zu.zip': 'a6d0325dab37cd551e6d7f6c783dd13f4c71db2f'} + +FAST_TEXT_FILE_SHA1 = \ + {'crawl-300d-2M.vec': '9b556504d099a6c01f3dd76b88775d02cb2f1946', + 'wiki.aa.vec': '5cce30fc85471572c498f278bbe495184577363e', + 'wiki.ab.vec': '9d89a403a9a866d3da8dd8cfab849f59ee499343', + 'wiki.ace.vec': '85d00074f7a08626f39da6a0c8a5cfa250096ab9', + 'wiki.ady.vec': '9d17d74f0348224cdebf8a831e61af0825f8952d', + 'wiki.af.vec': '999e64bcd8dab8de42cb1feceeca360def35324d', + 'wiki.ak.vec': '6092b8af335c2dc93e8df2bbf1d715f01e637bb4', + 'wiki.als.vec': '96052e96870695cca50857b5fde5f9f42219139a', + 'wiki.am.vec': 'dff7fcdd8f5ba0638ab9e1758a89800766156d72', + 'wiki.ang.vec': 'a7c30e02422d97d23a0701279c5c1c03159130a5', + 'wiki.an.vec': '5b4c2b1de5c04e4e0be83841410ca84c47305d21', + 'wiki.arc.vec': 'fd3ad743103f80cde9cfc048d7ca509e50efb35a', + 'wiki.ar.vec': 'c46e2142f799cc385bd25f0c0a8943ca565505a4', + 'wiki.arz.vec': '5e904087043b91f4945dd708f4230fdf51360132', + 'wiki.ast.vec': '89a90357101953b7c292697fd050c00fe5c38ac5', + 'wiki.as.vec': 'cad5883b5147cbe6cdbf604f65cabdb675a59258', + 'wiki.av.vec': '99976a63ca8c4231f808fd4314f0433db35e290d', + 'wiki.ay.vec': 'be359dad25b2c742d3abfa94c5f5db13f86c730e', + 'wiki.azb.vec': 'e23af0a436b97434813c3cb14ed114cc5b352faa', + 'wiki.az.vec': '9581d55d9056ad398a153c37b502f3a07867d091', + 'wiki.bar.vec': '96130f1f2e5bffdd06c202ad4472e5234020980a', + 'wiki.bat_smg.vec': 'cb3aef58da2011183b39fca64cabf3d9d7a62f4b', + 'wiki.ba.vec': '22147ee16b2d163cc88d09a035264fd0c10dab68', + 'wiki.bcl.vec': 'd4117b5c443438ddfa608b10a5be2c2501817e7e', + 'wiki.be.vec': '6cf81322cd7b046a7f02ec4c4960ad27045383fa', + 'wiki.bg.vec': '7c1cc6d0c52b038e4b7173259b0c009f242cf486', + 'wiki.bh.vec': 'ab2d29017afa015c49566a6d9bf75393c23ac4c0', + 'wiki.bi.vec': '15785220cd6e6c86cc87e7d3f3322a5541a4fe5d', + 'wiki.bjn.vec': '5f134cf288e8042dcd048a3ee76159aab42c7288', + 'wiki.bm.vec': 'f36a19c95e90865f6518d4487e59f363b47bd865', + 'wiki.bn.vec': '6fc3bfd9af455719f55bee0bea31b11afc70cf06', + 'wiki.bo.vec': '2e9358e03dcfa09da23d2e1499d84b10348fd8a9', + 'wiki.bpy.vec': 'c2bb15487c4bdb8fa869772694300ae1fee73896', + 'wiki.br.vec': 'df44e16abd2017e2a1b6c6588ee02779b19907f6', + 'wiki.bs.vec': 'c4943a290819ceae1611dd11179b40aab0df0471', + 'wiki.bug.vec': '942d8f7dadde5faa33aa72862501434f48e29f60', + 'wiki.bxr.vec': 'eaf767690c6b194605ae778719212e3874873d4c', + 'wiki.ca.vec': 'f5971edee11c939f6a7accfd33a9a45caa54141a', + 'wiki.cbk_zam.vec': '6fef47b4559eec402ce371de20dfb018acd6347d', + 'wiki.cdo.vec': '95e8196bf76323dbabab1b8a49ba4d677af3ccea', + 'wiki.ceb.vec': 'b8516a55537b8f80c927d77d95cdf7e4ff849a05', + 'wiki.ce.vec': '1d94b0168a773895b23889f7f07d7cf56c11a360', + 'wiki.cho.vec': 'cec6778f025fa9ae4134046c6c3a6291bd9c63f9', + 'wiki.chr.vec': 
'8501bf86b41074ed6c8d15b9209ef7ce83122e70', + 'wiki.ch.vec': '46803f3a1734f6a7b0d8cb053bbb86a6915d02e9', + 'wiki.chy.vec': '26c87688551ffe3a0c7a5952e894306651e62131', + 'wiki.ckb.vec': 'adb2fef309f1d93f429442b9c16c1564192c58f3', + 'wiki.co.vec': 'af876a918594e5541207bc12f17bfc4268df7b93', + 'wiki.crh.vec': 'c0d2310a1207fcacc94b25b149420b33bf835015', + 'wiki.cr.vec': '61dd9f044b7dfa56dcf1c3c07c7504c569420528', + 'wiki.csb.vec': '649cb2692f08414987c875dc331022567d367497', + 'wiki.cs.vec': 'f3ec1502aeee6a550d8cf784273fa62f61419a4e', + 'wiki.cu.vec': 'ddadb14ea00ea1dda716ee33732497ec049b526f', + 'wiki.cv.vec': '9cdb0bee5a0fea030def85597dba7108f21b0424', + 'wiki.cy.vec': '32d976a9bfc4dd6e39328c906eead0f597bd9e25', + 'wiki.da.vec': '526947dab1ffbc1465c7a766f2bca4de50676b08', + 'wiki.de.vec': '2ed2696afe55f023b0040b238d9a47e5fedfe48b', + 'wiki.diq.vec': '77f3c370d1d77806fafe368cf788af550ff607dd', + 'wiki.dsb.vec': 'e49a647a441fbf011ac5411dd6005e8725b9a65d', + 'wiki.dv.vec': 'e135ba97c711a021bc3317db2b95db5212c17658', + 'wiki.dz.vec': '24888f0b2cd156360bfb5e9e905240163ba798d8', + 'wiki.ee.vec': 'afd1670655daa7ffba51187a415fdd0b43f1d487', + 'wiki.el.vec': '6f034271390feaa6f9d7d16f933ddef637755979', + 'wiki.eml.vec': 'de6be7a2ffdda226eec730dd54b4c614bd7f5dca', + 'wiki.en.vec': 'c1e418f144ceb332b4328d27addf508731fa87df', + 'wiki.eo.vec': 'b56998fd69f66755b722a9481a9bdaf10f62c9aa', + 'wiki.es.vec': '2f41401aa0925167176bcd7a6770423d891dfef5', + 'wiki.et.vec': '64d56b66c02d5e49b1b66a85854d67d2dd9ebd41', + 'wiki.eu.vec': '5e72f4ef93666971fea5d2180b354e0a0821ba91', + 'wiki.ext.vec': '456c5632b13a0f136cd180ebe2dda67b83f78397', + 'wiki.fa.vec': '09b6cc685c895c66b853af9617787d3ab0891e2c', + 'wiki.ff.vec': '12b09d695f5fb8de4b5da9d36a73eb178b293a04', + 'wiki.fiu_vro.vec': '168a71a2b1c478e6810fa5dce9612d8bf8a273dc', + 'wiki.fi.vec': '91d19baae994d7e556b5b5938be2dc6013f9c706', + 'wiki.fj.vec': '36d36dc14001a109926bfc633594f6a2f7401697', + 'wiki.fo.vec': 'eead8ddc7bb74b12b16784723abf802bb51f844d', + 'wiki.frp.vec': '0eb70a613ccf807c7308c1f62535f0606465029d', + 'wiki.frr.vec': 'cde62af939cb2de35e341cef2c74813802a58ed4', + 'wiki.fr.vec': 'b092229005a65d8683a4112852fe6eb8161a6917', + 'wiki.fur.vec': 'd4a595cffa1abcdcf4229ba15277179ce5d20bc6', + 'wiki.fy.vec': 'd4beef537b7ff142a3986513879ff51a9ec14a7b', + 'wiki.gag.vec': 'c82ec7a5d081f0673661824f4fc34345dee255f0', + 'wiki.gan.vec': '7e53a33b7bd5b0360ea4cb452145616c09445029', + 'wiki.ga.vec': 'caaa5b2167a499893313ac1aa38416a6a0fe9a24', + 'wiki.gd.vec': 'f4b513598a1bf0f0d5b6521ea8ce363e9596cb97', + 'wiki.glk.vec': '20a7759075916e10531f5b3577302353cef565cd', + 'wiki.gl.vec': '8888bb8f3d70b36729b9ae479fe3765e0c083862', + 'wiki.gn.vec': '98594af7897c5a1f35885ddecc77556a7e7ae981', + 'wiki.gom.vec': '5a1193d9e5d49d06354c14e2b7c01bea176e13f1', + 'wiki.got.vec': 'dfa06de83a0e3099027c57b84561d7d990ea8310', + 'wiki.gu.vec': 'f9e13452eb63d92bea44c7c3db8fba9945c7000e', + 'wiki.gv.vec': '993a7ee31bdacc91763dad656aa6c2947b873473', + 'wiki.hak.vec': '9e83512d34c7f81739492bf0abbb25ff1ef88573', + 'wiki.ha.vec': '677a24efeeb1bcb8c0a931407775f18b18e875ae', + 'wiki.haw.vec': '58fea5aa1b37723797d26fb3d050ce6176757240', + 'wiki.he.vec': '55534560247394669e3f5c169136770c93bc2708', + 'wiki.hif.vec': '49697cf784814d3f1a47559724028e0fc0940d36', + 'wiki.hi.vec': '8049bb8604bc049d48bd934e27b0e184c480a413', + 'wiki.ho.vec': '9c75a09e099213aa8cd1f1020b223427537cbdd8', + 'wiki.hr.vec': '0c96f9af092cf8a84b03aec1426cd23921671489', + 'wiki.hsb.vec': '3dc7830544c58535bed308c552d609e13b973502', + 
'wiki.ht.vec': '5039dfb58a074ac046813f2dae81159be8c5213f', + 'wiki.hu.vec': 'cd777e9efca3d4bd97c89f01690cfa4840d9c46f', + 'wiki.hy.vec': '21f9259d04cfd22db446a45d3622af225f00cf20', + 'wiki.hz.vec': '2a94b1390d68027748a05169fbc0c11a9a183456', + 'wiki.ia.vec': '2a348dc924638efc20c34785852b0837364aed76', + 'wiki.id.vec': 'c49d5c9bec89114599427f6c12a5bda2e5523dfd', + 'wiki.ie.vec': '01b0d11c0e7397418e73853d220e97bdcf7a8961', + 'wiki.ig.vec': 'd2d1643b4fb1a18a4d002cf2969073f7f201b3b2', + 'wiki.ii.vec': '41c6cd68b3ebe4ece2a06c37b06dca5d07c9fb3a', + 'wiki.ik.vec': 'af31cbec7b839f50fa70553ec63c58f7067d3ea8', + 'wiki.ilo.vec': 'c0e43835a3f4e0033ea5d7c6ff189982b2f26a05', + 'wiki.io.vec': 'af0c480c5872bff31d82e767c1116da2a6be0c00', + 'wiki.is.vec': 'ae0b018f92b3e218f2dacb2045a8f0a0446788a5', + 'wiki.it.vec': 'ac4a985e85ffae48047034e2603d804bf126caa9', + 'wiki.iu.vec': '5d51b2ba215005216ae003f4a6d6ef39fb30ca2e', + 'wiki.jam.vec': '6d51e384c56330097c2531fdbf4e74418909e388', + 'wiki.ja.vec': '7a2b1af1e46d795410692a002e40fa3085135f69', + 'wiki.jbo.vec': 'c90481946aa4b6b304528292612ae620f6549f3e', + 'wiki.jv.vec': '2ff7927d3ff04b8208133497b3778ede00ea463f', + 'wiki.kaa.vec': 'd990d3b9bd511d2d630f923099a6b9110231b2ed', + 'wiki.kab.vec': 'e3b73d41267d8d4cd42f6cc5a0c05dc4e021bf74', + 'wiki.ka.vec': '8b92b73f27f9b77818211e053a33985589de7c62', + 'wiki.kbd.vec': 'f5b8dbe47a7fae702232b5680b070ef6e865539e', + 'wiki.kg.vec': '1550647b6059e6eb649b100e31c53bd0661117b2', + 'wiki.ki.vec': 'c4e373e2ea13f7fa1e95b0733365e4b3fc8b2cc8', + 'wiki.kj.vec': 'c27e563683f9c96ff6f680a6d6bb9e9e2f9960d0', + 'wiki.kk.vec': '6343b2b31bad2e13d03a110b91c38fab4adc01cd', + 'wiki.kl.vec': 'e5def7fb1b56c5956b6e951e912d53ba0ff089f8', + 'wiki.km.vec': '64f7fff1df90b1f7241b232e901f76223a3719e0', + 'wiki.kn.vec': '32763f4f860f0d081f3aabf3e7d17b7858e7d877', + 'wiki.koi.vec': '4001f0617fe0fdd3b22116b304f497b7b16c6e4c', + 'wiki.ko.vec': '042c85a788c2778cca538cf716b8a78f0d7fa823', + 'wiki.krc.vec': '0c6ef043d51e5f337a309804f1db180fa0bb2cb8', + 'wiki.kr.vec': '25d5b4d5911a819c48328c48fb346417d07d4070', + 'wiki.ksh.vec': '4c3bb4f12073532b6fb7cc6c2be5e53319ef5b65', + 'wiki.ks.vec': '5056a87c4ee2d8bf0792436fc6b2b61648014de9', + 'wiki.ku.vec': '4d3a2401527dd9ba6be2b0cd31f6cd3edebadce9', + 'wiki.kv.vec': '164dc44d701b9d606a45f0b0446076adc3858dca', + 'wiki.kw.vec': 'f9eaa35a7e4f077f6de85c7801f74582f91b52c1', + 'wiki.ky.vec': '13b0ae3f23822317a0243bd9182105c631c834b3', + 'wiki.lad.vec': 'c510e520cde97050bf1cbeb36f2b90e6348ceed4', + 'wiki.la.vec': '9ea6286a0581084533db8d6ee96e0b7d15166543', + 'wiki.lbe.vec': '283619d93255571f14fd4545bb0577979171b990', + 'wiki.lb.vec': 'b146f23628c84e64314a35a5b6cc65a33777e22d', + 'wiki.lez.vec': '8e579b984a500ad89fc66767bfd7319766bd669b', + 'wiki.lg.vec': 'b096f5248dfbb343dc4696c97ea253510e1c4ef9', + 'wiki.lij.vec': '4ff5bb405c820e4119f0636efc301da15a08c00a', + 'wiki.li.vec': '0fb9ec4ac93676d8ef651692062bc3d7f6ae0843', + 'wiki.lmo.vec': 'a89414d9ceee4823622258f18936f67faf7e06e7', + 'wiki.ln.vec': '70b6a286b42958e25cb80824e0d8f1aee2de6dde', + 'wiki.lo.vec': '7c83f82b80c49b8eab21f62ecdb3681b8bda40a6', + 'wiki.lrc.vec': 'c1ae4fb79a19d44bfe8f601f0a30fbec841fa612', + 'wiki.ltg.vec': 'ec2f13d1290bd54afcaa74569e66e43e9bfef264', + 'wiki.lt.vec': '58d3ebef24e5e31be1a8318b45c08ebb16ad775a', + 'wiki.lv.vec': 'ef6b549f96e22718f513d47a611d3d6bc001a164', + 'wiki.mai.vec': '7f513ff36e485b19f91f83b30c32dd82e9e497f6', + 'wiki.map_bms.vec': 'e7deab5fdd38fa3331b1bcb4a16432b38c512e21', + 'wiki.mdf.vec': 
'b16099ce0283a241339716eac41cfd99fdea7f36', + 'wiki.mg.vec': '0808252740909d6129f672584311263e7b2adadc', + 'wiki.mhr.vec': '39f62e292336cabc364f0d1913540b881b406393', + 'wiki.mh.vec': '7d2d8bff722fe0a5d869d9da11792a406aff3dc3', + 'wiki.min.vec': '3bb0fa596cf27a1d165c55684bebdc8d40cb8ad7', + 'wiki.mi.vec': 'e8acf9c7c2ab840a192c563aa776201a88e4ca89', + 'wiki.mk.vec': '85a3d3f13fa88ffde023d2326c65bdded4983dff', + 'wiki.ml.vec': '2b70fe76e8cf199a18551de782784a21e8db0b66', + 'wiki.mn.vec': '7cef7ecdf9d98484d9b598b25d0e717dba6acfd9', + 'wiki.mo.vec': 'cc54b661aefabdf516b49d24acb51273b3acf210', + 'wiki.mrj.vec': 'aa1c1ecba1ffd6b42c8d9659a8a04ab328ae1650', + 'wiki.mr.vec': '2cd6cf88bfdfb24850d345749ce0cfea8d65829e', + 'wiki.ms.vec': '458e1a079799a54cdc0a7b78c7fa1729d2683a6d', + 'wiki.mt.vec': '81f4c1d84dd4cc4276d59cb903fcc9aba46be981', + 'wiki.multi.ar.vec': 'f1f12cc9d629382af574a3db74fe49c2fd615c8f', + 'wiki.multi.bg.vec': '22470e664e4b35761a33c64433ea2f0c12140673', + 'wiki.multi.ca.vec': 'bc8d98b4d86d740d1985d73d211d887d561bcdd7', + 'wiki.multi.cs.vec': '17358b62e63f96b0479d6a70e9235a0421493884', + 'wiki.multi.da.vec': 'ebc75f428714d26fb1fa31accce49ad3b31e273b', + 'wiki.multi.de.vec': 'b9a63406aedf4446b467b94d12674bfe4723b52d', + 'wiki.multi.el.vec': '03d33db85bf83f35b943ce93b18c02fa98a0bc05', + 'wiki.multi.en.vec': '696719afdbe470ee4a2eb668229486dba1df19cc', + 'wiki.multi.es.vec': '98c9e35564ec57fee5dbc6155890150452f45d3f', + 'wiki.multi.et.vec': 'db10189093387e853f2fd3978770e1cc7bc07820', + 'wiki.multi.fi.vec': '746916885a1c7d4ec3f139a32cf267f9e15f5363', + 'wiki.multi.fr.vec': 'fe1535827b631d934beb02f8d36ba901b2c94a46', + 'wiki.multi.he.vec': '6dd112f018165317da22971a2b6fdb2a15dafa91', + 'wiki.multi.hr.vec': 'ff9f23cf595ec8dd93cd93c6b48049730c34253b', + 'wiki.multi.hu.vec': '6da405c9b048f3cbb990bfb29ef149f0430aa2e7', + 'wiki.multi.id.vec': '34edadab182682198c37ade8538530c545635742', + 'wiki.multi.it.vec': 'c55802bd73d46a6fc86771097670e02a70b5d46d', + 'wiki.multi.mk.vec': 'cec8550503ebca0bdc7ad11f2c15085b7072a990', + 'wiki.multi.nl.vec': 'c3f45a5fe8a8bc213cdf35dce51651b752ca60c4', + 'wiki.multi.no.vec': '105236df530c8fc2ce5b1e2550a2059bbc46fc28', + 'wiki.multi.pl.vec': '676eb5acb22982c0c9a7d6e4c90d26730c6d120e', + 'wiki.multi.pt.vec': '625b0a5384873c79a5dcfff5ee3fde49a3a65013', + 'wiki.multi.ro.vec': '82bd59674509b69f988f9870e3a291836ba43e84', + 'wiki.multi.ru.vec': 'a7d9c5f2ab2abb448a5111d352caa921adabe830', + 'wiki.multi.sk.vec': '98d849ee77f0320472cc5afa002bfde129be7089', + 'wiki.multi.sl.vec': 'fb5cfb8a9c44380d74fb21ddd204e820c4e05c31', + 'wiki.multi.sv.vec': '95d6cc3ba23dffff9be6adb467b617dd57780cb2', + 'wiki.multi.tr.vec': 'ecb0e353eaccba3fcacc6994d93065934ef429e9', + 'wiki.multi.uk.vec': '35f4f5a1ead8bd66bcaf865021fc3aae94456ab6', + 'wiki.multi.vi.vec': 'b1abe06360e1d65a0db65dd41ead7b2f9d651ea0', + 'wiki.mus.vec': 'fa1066f7bd09df4589993ca498c19aeb6cf986fd', + 'wiki.mwl.vec': '3d10a218242b94fcc3981aa3beb012b701827a55', + 'wiki.my.vec': 'e7c7989e32b23ca1a9caf534cc65ecaf9e1b9112', + 'wiki.myv.vec': '7de0927fd3d65677de7f770b3bd57c73b58df85d', + 'wiki.mzn.vec': 'aefad49237808acab99e1ca8eeaaf531666f261d', + 'wiki.nah.vec': 'c52e01cf4479fb7ec91ef39f298e8f97aeb6496e', + 'wiki.nap.vec': '6c9bd8ce1e85ee679b25189fd6f6d36afb119b6c', + 'wiki.na.vec': '8a592eb3dbe5693372714dff495d01cabc3ea215', + 'wiki.nds_nl.vec': '1cd96d12e78e5cd3f65ca2773a17696bda387b9f', + 'wiki.nds.vec': '7bf293149c08226e05bcf0442ac6e601162b9ffd', + 'wiki.ne.vec': '1045d7876f947cd4602d9ca79f7c4323a5d3a52d', + 
'wiki-news-300d-1M-subword.vec': '717a3058e0ba5ef3cde52c3df0d4f0f60b0a113a', + 'wiki-news-300d-1M.vec': '11cac9efe6f599e659be182f5766d6fbd5b1cab9', + 'wiki.new.vec': '51f6c0b4ef1aee9fad4ab1cb69a7479db35e39a5', + 'wiki.ng.vec': 'c3016cc07d40bd43bea84b7c600244ff3d2a928e', + 'wiki.nl.vec': 'd796ee27e37b7d1d464e03c265c31ab62b52533e', + 'wiki.nn.vec': '35aeab89ffeca0377accbbd3bf18b81913c75448', + 'wiki.no.vec': 'd52e8019d7cc48569c8c3b514d2b1bd10261b5c0', + 'wiki.nov.vec': '5455c6e8463b1c43dd073e3e177702fb9a1dd834', + 'wiki.nrm.vec': 'b4cb941b126b26fa045c5fc75a490a31a969101c', + 'wiki.nso.vec': 'a906271509c2b343df35d1471509492bbfa883aa', + 'wiki.nv.vec': 'f5a6ea213bfe95c82cb22b53b4965df8b67ffeab', + 'wiki.ny.vec': '3aec3dcaea6c35f8254c407621644f87df37e411', + 'wiki.oc.vec': 'cc1833492899d75571148c2c305591f53d63f0b1', + 'wiki.olo.vec': 'cbadb4cada4dc579d0becdac93dfb479d76bf6c8', + 'wiki.om.vec': '91789a8d9f9284f7e71e4bb8d9a60eae4af4adca', + 'wiki.or.vec': 'a6b120fe536b6c0133b077dca0043c3bc97eef0b', + 'wiki.os.vec': '791b26cc300e9a1f0a08c7b2213a264e41ce30d6', + 'wiki.pag.vec': '03f71faf060c4eb33802275279967349c0337553', + 'wiki.pam.vec': '8fbd31e70d0ca0c61eb1a152efaa8ecb29180967', + 'wiki.pap.vec': '8cd98267cc55a4f9de80212e29651ddf7a9e83fd', + 'wiki.pa.vec': '4939d0db77a5b28d7d5aab0fab4f999d93b2053e', + 'wiki.pcd.vec': 'd2e8e7321b6f1bce94c563cb8ef8af2b45cc3e48', + 'wiki.pdc.vec': '401e24d0fb9b0ae9e06a5c700684361f58727fcf', + 'wiki.pfl.vec': '0ad9b7f3ae13f909f12835107432fee4c4ed3031', + 'wiki.pih.vec': '4ae6ef2a9c6c88e9322eda900e0f58be5592a29b', + 'wiki.pi.vec': 'd388db284357042f4260e1a567cb489b05bb8e0b', + 'wiki.pl.vec': 'd031adb6f83eda0364a861dcbf5ef779b5951c0b', + 'wiki.pms.vec': 'e30bda8d33d61db43243c157b9ac2feeaff316c8', + 'wiki.pnb.vec': '35f38862d3d83012d6db7baa8a4105e3e0a416e7', + 'wiki.pnt.vec': '38134772012d68f247e34daf220d9d4ed3e7f489', + 'wiki.ps.vec': '64f1bec5d5b937289199ceae2e1da6557ce48852', + 'wiki.pt.vec': '7f11ebdb0cbf5929b38319f1e977d2c13bcd741b', + 'wiki.qu.vec': '58de8c8290e8bc8f2a6a677312e28457113437b2', + 'wiki.rm.vec': '5d3144b47a0dd98648a6df0636384ab2a010ad7b', + 'wiki.rmy.vec': '3d36d3485961900c23355a0f7c2ba656a8558c29', + 'wiki.rn.vec': '80b6171b78dd932f59f70dbef074abb906af4eee', + 'wiki.roa_rup.vec': 'e31a44353cd84b976586c8df35a2ab58318120f0', + 'wiki.roa_tara.vec': 'b3fcb01ff0bac53a0ba08c5c0c411f26ee83a95a', + 'wiki.ro.vec': 'c088ea2752d5ec8b42e32410c191a14839ae8a1f', + 'wiki.rue.vec': 'fe539e0ea0bbbfd3ee06bd0c5521a035c7361ec5', + 'wiki.ru.vec': '7514a2c60ee4118abb451ed32a0d61cb52dec384', + 'wiki.rw.vec': 'af2ec410da6519a86ba21004c8b4c7fde768a91c', + 'wiki.sah.vec': '202470467194a1cbdcd571b14ef68371a29b38d9', + 'wiki.sa.vec': '7fed78d1d7674453b9876ee99aeeeba85ea46699', + 'wiki.scn.vec': 'bde043a235551e1643506774c5d9b61ecf2fc424', + 'wiki.sco.vec': '4625a5ad90a57f994be9b3aa4f8f3ecda941a821', + 'wiki.sc.vec': 'dba8dc7754ef04b1ba0cd702d94eea9575cde91c', + 'wiki.sd.vec': '36852d1253496e598fbd9b9009f07f454a6bea5b', + 'wiki.se.vec': 'f46b35ee6b893c2f12dd1b929bbc2b8120cbcd8d', + 'wiki.sg.vec': '90ece136bef7ad6e4e97776a1c7238499544405d', + 'wiki.sh.vec': '016691ecb26ace442731d92b1265e5c6c3d8ca5f', + 'wiki.simple.vec': '55267c50fbdf4e4ae0fbbda5c73830a379d68795', + 'wiki.si.vec': 'd05ed6a0bc1ee56e5d2e5f881d47372095f6eb0c', + 'wiki.sk.vec': '98759aacf7352d49a51390fae02030776510ae13', + 'wiki.sl.vec': 'b26997c0ed1de26a47b11efdc26ac1e7f189fa54', + 'wiki.sm.vec': '88c2c57ca483626b052403418cb4372d72352bc9', + 'wiki.sn.vec': '8dbb1019dcc8f842a8c0f550295ae697f8e1b7e0', + 'wiki.so.vec': 
'294756b60b03fe57cb08abd8d677d6a717b40bc8', + 'wiki.sq.vec': 'd07ffed553f5eb4756d0a1548a7ba9a51a52f7c6', + 'wiki.srn.vec': 'faee05e550f5b08809a9ae5586ac4b08c9a1c359', + 'wiki.sr.vec': '3cf09f476f55a92fdd2880f7ba336656ab232736', + 'wiki.ss.vec': '488546a3b2f88f549c50ae9f32f1997cc441b039', + 'wiki.stq.vec': '1bf88af29f1d86cac16042a5bea6b1651c96a8c1', + 'wiki.st.vec': '963646055d12873b1c83b0eef8649ecaf473d42e', + 'wiki.su.vec': '25e864495acb6d280bab0e62480f68550c9ceed4', + 'wiki.sv.vec': 'eab83ae36701139696477b91b6e8d292ef175053', + 'wiki.sw.vec': '8e70d207dbbd14e60a48e260a23fbf284a8e9f06', + 'wiki.szl.vec': '0573cf888ec70b459b0596d34814fe60fd69f190', + 'wiki.ta.vec': 'b66b5358527b1f3a6a421ab26464a3c1e75e18af', + 'wiki.tcy.vec': '388b1d89642fcc790b688e9643b3d19e14d66f40', + 'wiki.tet.vec': 'f38fe0e76b9b08ff652689eeee42c4fdadd9a47e', + 'wiki.te.vec': 'e71dcf3cc45da1bcdae5e431324025bd2026d0c8', + 'wiki.tg.vec': '6a5cd5bfe571ca0359b66d21bf6950553213f42d', + 'wiki.th.vec': '1d6e0d525392a1042d017534f6c320c5a0afd345', + 'wiki.ti.vec': 'c769fbc99bbb4138a40231e573685c7948d4a4c4', + 'wiki.tk.vec': '33ae577f77d339ab7a0dff88855b8d5c974d0aef', + 'wiki.tl.vec': 'd508e229ced7201510999e76d583de3ff2339d8b', + 'wiki.tn.vec': '39f45f3fa86645bb25c54150204abcd51cc1048c', + 'wiki.to.vec': '64d512665b55e9ef9a3915e8167347be79310fa0', + 'wiki.tpi.vec': '407b96d235f54f3e0be9dc23a3bab89c6593a621', + 'wiki.tr.vec': '13234aa1bf5f99e81d933482b3b83c3e4bf6c85e', + 'wiki.ts.vec': '00f8229e2f230afd388221c0f823a1de9fc0e443', + 'wiki.tt.vec': '913bb3a11da6f8142b3bbec3ef065162d9350f1d', + 'wiki.tum.vec': 'bfbe43364724af882a520d2edcc2ce049c7357cd', + 'wiki.tw.vec': 'f329b667d70d9f0b753e55e1b1579b5a5191d3bd', + 'wiki.ty.vec': 'b881f60b8c75a71864d9847a17961d368f3058fc', + 'wiki.tyv.vec': 'e8f9a36dc58e4108c553f96e247a877a099ab5ba', + 'wiki.udm.vec': '336a8526f22e177faac69573661dc9c3ce36591f', + 'wiki.ug.vec': '586d2febafaf17c9187c599ffd7b96e559103c34', + 'wiki.uk.vec': '77f7737b9f88eac2b3e130ea8abb8886336fd0c6', + 'wiki.ur.vec': 'cb8132102152a958df72bd3e25f1a72abb4c9c76', + 'wiki.uz.vec': '11c3a76dae12b454f693811e33ae2e60015743e2', + 'wiki.vec.vec': 'ae4b055fba21974e56beecab3a95f9dc24a62fd0', + 'wiki.vep.vec': 'a38a781fde24f4d7b52aa8bc450b9949dd4e1808', + 'wiki.ve.vec': 'b7d2947501de1c30a9f8496d5efae20c051104e1', + 'wiki.vi.vec': 'bc84245b52b2e212e28dc6856c0693ce9845a9c5', + 'wiki.vls.vec': '07e8636908c057b9870ce4b98c7130d460cf882a', + 'wiki.vo.vec': 'c830988b6965bfce2f932b1be193f7d1f755f411', + 'wiki.war.vec': '1f5d443d6f612b59a53820dd6f39fd886a6ad30f', + 'wiki.wa.vec': '18f9ca1a585e1d18c3630029141a2e19d7d34a8e', + 'wiki.wo.vec': '2ad96a7a9e640bc0dbcf316b1f414b92802dcb8e', + 'wiki.wuu.vec': 'e1cbae1d3ad52329d0f36ada764016fbacf07049', + 'wiki.xal.vec': 'b738222d84cb8c8fdb2b30a7219aa5d3bdc2f61c', + 'wiki.xh.vec': 'bf37f741b0b75953281d11df2b4d80100df9e666', + 'wiki.xmf.vec': 'dc1923cfd1a7002d5d60426b60e6756854ab4a14', + 'wiki.yi.vec': '299d61958b7dcc38774768f1489121384726d860', + 'wiki.yo.vec': 'e35c8aff2924ba07936be9d0d94bd298f09702a4', + 'wiki.za.vec': 'e3a0e58bd2e5b1891c71f1f7e37ff71997a20361', + 'wiki.zea.vec': 'ee12db26aab3f2b3b2745a298ef414e7aeb5a058', + 'wiki.zh_classical.vec': '840981c83dd8e5cb02d1cd695e2fe0870941316c', + 'wiki.zh_min_nan.vec': 'f91ccb013e200bb7ed560082ddf4bdd9c2f315bb', + 'wiki.zh.vec': '117ab34faa80e381641fbabf3a24bc8cfba44050', + 'wiki.zh_yue.vec': 'd2ac1ab9eb1a908797644f83f259c90cb3c1a350', + 'wiki.zu.vec': '4b244b9697a8280e6646842c5fc81bb3a6bc8ec7'} diff --git a/python/mxnet/gluon/text/embedding.py 
b/python/mxnet/gluon/text/embedding.py new file mode 100644 index 000000000000..1839212ee825 --- /dev/null +++ b/python/mxnet/gluon/text/embedding.py @@ -0,0 +1,581 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=consider-iterating-dictionary + +"""Text token embedding.""" +from __future__ import absolute_import +from __future__ import print_function + +import io +import logging +import os +import tarfile +import warnings +import zipfile + +from . import _constants as C +from ... import nd +from ... import registry +from ..utils import check_sha1, download, _get_repo_file_url + + +def register(embedding_cls): + """Registers a new token embedding. + + + Once an embedding is registered, we can create an instance of this embedding with + :func:`~mxnet.gluon.text.embedding.create`. + + + Examples + -------- + >>> @mxnet.gluon.text.embedding.register + ... class MyTextEmbed(mxnet.gluon.text.embedding.TokenEmbedding): + ... def __init__(self, file_name='my_pretrain_file'): + ... pass + >>> embed = mxnet.gluon.text.embedding.create('MyTokenEmbed') + >>> print(type(embed)) + + """ + + register_text_embedding = registry.get_register_func(TokenEmbedding, 'token embedding') + return register_text_embedding(embedding_cls) + + +def create(embedding_name, **kwargs): + """Creates an instance of token embedding. + + + Creates a token embedding instance by loading embedding vectors from an externally hosted + pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid + `embedding_name` and `file_name`, use `mxnet.gluon.text.embedding.get_file_names()`. + + + Parameters + ---------- + embedding_name : str + The token embedding name (case-insensitive). + + + Returns + ------- + An instance of `mxnet.gluon.text.embedding.TokenEmbedding`: + A token embedding instance that loads embedding vectors from an externally hosted + pre-trained token embedding file. + """ + + create_text_embedding = registry.get_create_func(TokenEmbedding, 'token embedding') + return create_text_embedding(embedding_name, **kwargs) + + +def get_file_names(embedding_name=None): + """Get valid token embedding names and their pre-trained file names. + + + To load token embedding vectors from an externally hosted pre-trained token embedding file, + such as those of GloVe and FastText, one should use + `mxnet.gluon.text.embedding.create(embedding_name, file_name)`. This method returns all the + valid names of `file_name` for the specified `embedding_name`. If `embedding_name` is set to + None, this method returns all the valid names of `embedding_name` with their associated + `file_name`. + + + Parameters + ---------- + embedding_name : str or None, default None + The pre-trained token embedding name. 
+ + + Returns + ------- + dict or list: + A list of all the valid pre-trained token embedding file names (`file_name`) for the + specified token embedding name (`embedding_name`). If the text embeding name is set to None, + returns a dict mapping each valid token embedding name to a list of valid pre-trained files + (`file_name`). They can be plugged into + `mxnet.gluon.text.embedding.create(embedding_name, file_name)`. + """ + + text_embedding_reg = registry.get_registry(TokenEmbedding) + + if embedding_name is not None: + if embedding_name not in text_embedding_reg: + raise KeyError('Cannot find `embedding_name` %s. Use ' + '`get_file_names(embedding_name=None).keys()` to get all the valid' + 'embedding names.' % embedding_name) + return list(text_embedding_reg[embedding_name].pretrained_file_name_sha1.keys()) + else: + return {embedding_name: list(embedding_cls.pretrained_file_name_sha1.keys()) + for embedding_name, embedding_cls in registry.get_registry(TokenEmbedding).items()} + + +class TokenEmbedding(object): + """Token embedding base class. + + + To load token embedding from an externally hosted pre-trained token embedding file, such as + those of GloVe and FastText, use :func:`~mxnet.gluon.text.embedding.create(embedding_name, + file_name)`. To get all the available `embedding_name` and `file_name`, use + :func:`~mxnet.gluon.text.embedding.get_file_names()`. + + Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use + :func:`~mxnet.gluon.text.embedding.from_file()`. + + For every unknown token, if its representation `self.unknown_token` is encountered in the + pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token + embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the + token embedding vector initialized by `init_unknown_vec`. + + If a token is encountered multiple times in the pre-trained token embedding file, only the + first-encountered token embedding vector will be loaded and the rest will be skipped. + + + Parameters + ---------- + unknown_token : hashable object, default '' + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + + + Properties + ---------- + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. 
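Because `TokenEmbedding` reserves index 0 for the unknown token, a minimal sketch with a hand-written vector file may help show the fallback behaviour described above. The path, tokens, and values below are purely illustrative.

```python
import io

from mxnet.gluon import text

# Write a tiny space-delimited embedding file (illustrative content only).
with io.open('toy_vectors.txt', 'w', encoding='utf8') as f:
    f.write(u'apple 0.1 0.2 0.3\nbanana 0.4 0.5 0.6\n')

embed = text.embedding.TokenEmbedding.from_file('toy_vectors.txt', elem_delim=' ')

print(embed.idx_to_vec.shape)   # (3, 3): index 0 is reserved for the unknown token
print(embed['apple'])           # the vector loaded from the file
print(embed['never_seen'])      # falls back to the unknown-token vector (zeros by default)
```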
+ """ + + def __init__(self, unknown_token=''): + self._unknown_token = unknown_token + self._idx_to_token = [unknown_token] + self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)} + self._idx_to_vec = None + + @classmethod + def _get_download_file_name(cls, file_name): + return file_name + + @classmethod + def _get_pretrained_file_url(cls, pretrained_file_name): + cls_name = cls.__name__.lower() + + namespace = 'gluon/embeddings/{}'.format(cls_name) + return _get_repo_file_url(namespace, cls._get_download_file_name(pretrained_file_name)) + + @classmethod + def _get_pretrained_file(cls, embedding_root, pretrained_file_name): + cls_name = cls.__name__.lower() + embedding_root = os.path.expanduser(embedding_root) + url = cls._get_pretrained_file_url(pretrained_file_name) + + embedding_dir = os.path.join(embedding_root, cls_name) + pretrained_file_path = os.path.join(embedding_dir, pretrained_file_name) + downloaded_file = os.path.basename(url) + downloaded_file_path = os.path.join(embedding_dir, downloaded_file) + + expected_file_hash = cls.pretrained_file_name_sha1[pretrained_file_name] + + if hasattr(cls, 'pretrained_archive_name_sha1'): + expected_downloaded_hash = \ + cls.pretrained_archive_name_sha1[downloaded_file] + else: + expected_downloaded_hash = expected_file_hash + + if not os.path.exists(pretrained_file_path) \ + or not check_sha1(pretrained_file_path, expected_file_hash): + download(url, downloaded_file_path, sha1_hash=expected_downloaded_hash) + + ext = os.path.splitext(downloaded_file)[1] + if ext == '.zip': + with zipfile.ZipFile(downloaded_file_path, 'r') as zf: + zf.extractall(embedding_dir) + elif ext == '.gz': + with tarfile.open(downloaded_file_path, 'r:gz') as tar: + tar.extractall(path=embedding_dir) + return pretrained_file_path + + def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, encoding='utf8'): + """Load embedding vectors from a pre-trained token embedding file. + + + For every unknown token, if its representation `self.unknown_token` is encountered in the + pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token + embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the + text embedding vector initialized by `init_unknown_vec`. + + If a token is encountered multiple times in the pre-trained text embedding file, only the + first-encountered token embedding vector will be loaded and the rest will be skipped. + """ + + pretrained_file_path = os.path.expanduser(pretrained_file_path) + + if not os.path.isfile(pretrained_file_path): + raise ValueError('`pretrained_file_path` must be a valid path to the pre-trained ' + 'token embedding file.') + + logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path) + vec_len = None + all_elems = [] + tokens = set() + loaded_unknown_vec = None + line_num = 0 + with io.open(pretrained_file_path, 'r', encoding=encoding) as f: + for line in f: + line_num += 1 + elems = line.rstrip().split(elem_delim) + + assert len(elems) > 1, 'At line %d of the pre-trained token embedding file: the ' \ + 'data format of the pre-trained token embedding file %s ' \ + 'is unexpected.' 
% (line_num, pretrained_file_path) + + token, elems = elems[0], [float(i) for i in elems[1:]] + + if token == self.unknown_token and loaded_unknown_vec is None: + loaded_unknown_vec = elems + tokens.add(self.unknown_token) + elif token in tokens: + warnings.warn('At line %d of the pre-trained token embedding file: the ' + 'embedding vector for token %s has been loaded and a duplicate ' + 'embedding for the same token is seen and skipped.' % + (line_num, token)) + elif len(elems) == 1: + warnings.warn('At line %d of the pre-trained token embedding file: token %s ' + 'with 1-dimensional vector %s is likely a header and is ' + 'skipped.' % (line_num, token, elems)) + else: + if vec_len is None: + vec_len = len(elems) + # Reserve a vector slot for the unknown token at the very beggining because + # the unknown token index is 0. + all_elems.extend([0] * vec_len) + else: + assert len(elems) == vec_len, \ + 'At line %d of the pre-trained token embedding file: the dimension ' \ + 'of token %s is %d but the dimension of previous tokens is %d. ' \ + 'Dimensions of all the tokens must be the same.' \ + % (line_num, token, len(elems), vec_len) + all_elems.extend(elems) + self._idx_to_token.append(token) + self._token_to_idx[token] = len(self._idx_to_token) - 1 + tokens.add(token) + + self._idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) + + if loaded_unknown_vec is None: + self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(shape=vec_len) + else: + self._idx_to_vec[C.UNKNOWN_IDX] = nd.array(loaded_unknown_vec) + + @property + def idx_to_vec(self): + return self._idx_to_vec + + @property + def unknown_token(self): + return self._unknown_token + + def __contains__(self, x): + return x in self._token_to_idx + + def __getitem__(self, tokens): + """Looks up embedding vectors of text tokens. + + + Parameters + ---------- + tokens : str or list of strs + A token or a list of tokens. + + + Returns + ------- + mxnet.ndarray.NDArray: + The embedding vector(s) of the token(s). According to numpy conventions, if `tokens` is + a string, returns a 1-D NDArray (vector); if `tokens` is a list of + strings, returns a 2-D NDArray (matrix) of shape=(len(tokens), vec_len). + """ + + to_reduce = not isinstance(tokens, (list, tuple)) + if to_reduce: + tokens = [tokens] + + indices = [self._token_to_idx.get(token, C.UNKNOWN_IDX) for token in tokens] + + vecs = nd.Embedding(nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0], + self.idx_to_vec.shape[1]) + + return vecs[0] if to_reduce else vecs + + def __setitem__(self, tokens, new_vectors): + """Updates embedding vectors for tokens. + + + Parameters + ---------- + tokens : str or a list of strs + A token or a list of tokens whose embedding vector are to be updated. + new_vectors : mxnet.ndarray.NDArray + An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal + to the number of `tokens` and its width must be equal to the dimension of embedding of + the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list + of multiple strings, it must be 2-D. + """ + + assert self._idx_to_vec is not None, '`idx_to_vec` has not been initialized.' + + if not isinstance(tokens, list) or len(tokens) == 1: + assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) in [1, 2], \ + '`new_vectors` must be a 1-D or 2-D NDArray if `tokens` is a singleton.' 
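As an aside on the lookup and assignment semantics documented just above, the following hedged sketch shows the 1-D versus 2-D return shapes of `__getitem__` and the `ValueError` raised by `__setitem__` for unindexed tokens. The file path, tokens, and values are invented for the example.

```python
import io

from mxnet import nd
from mxnet.gluon import text

# Re-create a tiny illustrative vector file so the sketch is self-contained.
with io.open('toy_vectors.txt', 'w', encoding='utf8') as f:
    f.write(u'apple 0.1 0.2 0.3\nbanana 0.4 0.5 0.6\n')
embed = text.embedding.TokenEmbedding.from_file('toy_vectors.txt', elem_delim=' ')

# A single token yields a 1-D vector; a list of tokens yields a 2-D matrix.
print(embed['apple'].shape)              # (3,)
print(embed[['apple', 'banana']].shape)  # (2, 3)

# Assignments must match the embedding width and may only target indexed tokens.
embed['apple'] = nd.array([1.0, 1.0, 1.0])
try:
    embed['unindexed'] = nd.array([0.0, 0.0, 0.0])
except ValueError as err:
    print(err)  # unknown tokens cannot be updated through __setitem__
```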
+ if not isinstance(tokens, list): + tokens = [tokens] + if len(new_vectors.shape) == 1: + new_vectors = new_vectors.expand_dims(0) + + else: + assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) == 2, \ + '`new_vectors` must be a 2-D NDArray if `tokens` is a list of multiple strings.' + assert new_vectors.shape == (len(tokens), self._idx_to_vec.shape[1]), \ + 'The length of new_vectors must be equal to the number of tokens and the width of' \ + 'new_vectors must be equal to the dimension of embedding of the glossary.' + + indices = [] + for token in tokens: + if token in self._token_to_idx: + indices.append(self._token_to_idx[token]) + else: + raise ValueError('Token %s is unknown. To update the embedding vector for an ' + 'unknown token, please specify it explicitly as the ' + '`unknown_token` %s in `tokens`. This is to avoid unintended ' + 'updates.' % (token, self._idx_to_token[C.UNKNOWN_IDX])) + + self._idx_to_vec[nd.array(indices)] = new_vectors + + @classmethod + def _check_pretrained_file_names(cls, file_name): + """Checks if a pre-trained token embedding file name is valid. + + + Parameters + ---------- + file_name : str + The pre-trained token embedding file. + """ + + embedding_name = cls.__name__.lower() + if file_name not in cls.pretrained_file_name_sha1: + raise KeyError('Cannot find pre-trained file %s for token embedding %s. Valid ' + 'pre-trained file names for embedding %s: %s' % + (file_name, embedding_name, embedding_name, + ', '.join(cls.pretrained_file_name_sha1.keys()))) + + @staticmethod + def from_file(file_path, elem_delim=' ', encoding='utf8', init_unknown_vec=nd.zeros, **kwargs): + """Creates a user-defined token embedding from a pre-trained embedding file. + + + This is to load embedding vectors from a user-defined pre-trained token embedding file. + For example, if `elem_delim` = ' ', the expected format of a custom pre-trained token + embedding file may look like: + + 'hello 0.1 0.2 0.3 0.4 0.5\\\\nworld 1.1 1.2 1.3 1.4 1.5\\\\n' + + where embedding vectors of words `hello` and `world` are [0.1, 0.2, 0.3, 0.4, 0.5] and + [1.1, 1.2, 1.3, 1.4, 1.5] respectively. + + + Parameters + ---------- + file_path : str + The path to the user-defined pre-trained token embedding file. + elem_delim : str, default ' ' + The delimiter for splitting a token and every embedding vector element value on the same + line of the custom pre-trained token embedding file. + encoding : str, default 'utf8' + The encoding scheme for reading the custom pre-trained token embedding file. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + + + Returns + ------- + instance of `~mxnet.gluon.text.embedding.TokenEmbedding` + The user-defined token embedding instance. + """ + embedding = TokenEmbedding(**kwargs) + embedding._load_embedding(file_path, elem_delim, init_unknown_vec, encoding) + + return embedding + + +@register +class GloVe(TokenEmbedding): + """The GloVe word embedding. + + + GloVe is an unsupervised learning algorithm for obtaining vector representations for words. + Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and + the resulting representations showcase interesting linear substructures of the word vector + space. (Source from https://nlp.stanford.edu/projects/glove/) + + Reference: + + GloVe: Global Vectors for Word Representation. + Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 
+ https://nlp.stanford.edu/pubs/glove.pdf + + Website: + + https://nlp.stanford.edu/projects/glove/ + + To get the updated URLs to the externally hosted pre-trained token embedding + files, visit https://nlp.stanford.edu/projects/glove/ + + License for pre-trained embedding: + + https://opendatacommons.org/licenses/pddl/ + + + Parameters + ---------- + file_name : str, default 'glove.6B.50d.txt' + The name of the pre-trained token embedding file. + embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') + The root directory for storing embedding-related files. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + + + Properties + ---------- + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + """ + + # Map a pre-trained token embedding archive file and its SHA-1 hash. + pretrained_archive_name_sha1 = C.GLOVE_PRETRAINED_FILE_SHA1 + + # Map a pre-trained token embedding file and its SHA-1 hash. + pretrained_file_name_sha1 = C.GLOVE_PRETRAINED_ARCHIVE_SHA1 + + @classmethod + def _get_download_file_name(cls, file_name): + # Map a pre-trained embedding file to its archive to download. + src_archive = {archive.split('.')[1]: archive for archive in + GloVe.pretrained_archive_name_sha1.keys()} + archive = src_archive[file_name.split('.')[1]] + return archive + + def __init__(self, file_name='glove.6B.50d.txt', + embedding_root=os.path.join('~', '.mxnet', 'embedding'), + init_unknown_vec=nd.zeros, **kwargs): + GloVe._check_pretrained_file_names(file_name) + + super(GloVe, self).__init__(**kwargs) + pretrained_file_path = GloVe._get_pretrained_file(embedding_root, file_name) + + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + + +@register +class FastText(TokenEmbedding): + """The fastText word embedding. + + + FastText is an open-source, free, lightweight library that allows users to learn text + representations and text classifiers. It works on standard, generic hardware. Models can later + be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/) + + + References: + + Enriching Word Vectors with Subword Information. + Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. + https://arxiv.org/abs/1607.04606 + + Bag of Tricks for Efficient Text Classification. + Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. + https://arxiv.org/abs/1607.01759 + + FastText.zip: Compressing text classification models. + Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, and Tomas Mikolov. + https://arxiv.org/abs/1612.03651 + + For 'wiki.multi' embedding: + Word Translation Without Parallel Data + Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. + https://arxiv.org/abs/1710.04087 + + Website: + + https://fasttext.cc/ + + To get the updated URLs to the externally hosted pre-trained token embedding files, visit + https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md + + License for pre-trained embedding: + + https://creativecommons.org/licenses/by-sa/3.0/ + + + Parameters + ---------- + file_name : str, default 'glove.6B.50d.txt' + The name of the pre-trained token embedding file. 
+ embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') + The root directory for storing embedding-related files. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + + + Properties + ---------- + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + """ + + # Map a pre-trained token embedding archive file and its SHA-1 hash. + pretrained_archive_name_sha1 = C.FAST_TEXT_ARCHIVE_SHA1 + + # Map a pre-trained token embedding file and its SHA-1 hash. + pretrained_file_name_sha1 = C.FAST_TEXT_FILE_SHA1 + + @classmethod + def _get_download_file_name(cls, file_name): + # Map a pre-trained embedding file to its archive to download. + return '.'.join(file_name.split('.')[:-1]) + '.zip' + + def __init__(self, file_name='wiki.simple.vec', + embedding_root=os.path.join('~', '.mxnet', 'embedding'), + init_unknown_vec=nd.zeros, **kwargs): + FastText._check_pretrained_file_names(file_name) + + super(FastText, self).__init__(**kwargs) + pretrained_file_path = FastText._get_pretrained_file(embedding_root, file_name) + + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) diff --git a/python/mxnet/gluon/text/utils.py b/python/mxnet/gluon/text/utils.py new file mode 100644 index 000000000000..a812be8c2df2 --- /dev/null +++ b/python/mxnet/gluon/text/utils.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 + +"""Utilities for text data processing.""" +from __future__ import absolute_import +from __future__ import print_function + +import collections +import re + + +def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n', to_lower=False, + counter_to_update=None): + """Counts tokens in the specified string. + + + For token_delim='(td)' and seq_delim='(sd)', a specified string of two sequences of tokens may + look like:: + + (td)token1(td)token2(td)token3(td)(sd)(td)token4(td)token5(td)(sd) + + + Parameters + ---------- + source_str : str + A source string of tokens. + token_delim : str, default ' ' + A token delimiter. + seq_delim : str, default '\\\\n' + A sequence delimiter. + to_lower : bool, default False + Whether to convert the source source_str to the lower case. + counter_to_update : collections.Counter or None, default None + The collections.Counter instance to be updated with the token counts of `source_str`. If + None, return a new collections.Counter instance counting tokens from `source_str`. 
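Because `counter_to_update` lets a single counter accumulate counts over several strings, a short hedged sketch (with made-up strings) may help; the commented counts are what the implementation above would produce.

```python
import collections

from mxnet.gluon import text

# Accumulate token counts from several strings into one counter.
counter = collections.Counter()
for s in [' good morning \n', ' good night \n', ' hello again \n']:
    counter = text.utils.count_tokens_from_str(s, counter_to_update=counter)

print(counter['good'], counter['hello'])  # 2 1
```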
+ + + Returns + ------- + collections.Counter + The `counter_to_update` collections.Counter instance after being updated with the token + counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter + instance counting tokens from `source_str`. + + + Examples + -------- + >>> source_str = ' Life is great ! \\n life is good . \\n' + >>> count_tokens_from_str(token_line, ' ', '\\n', True) + Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2}) + """ + + source_str = filter(None, re.split(token_delim + '|' + seq_delim, source_str)) + if to_lower: + source_str = [t.lower() for t in source_str] + + if counter_to_update is None: + return collections.Counter(source_str) + else: + counter_to_update.update(source_str) + return counter_to_update diff --git a/python/mxnet/gluon/text/vocab.py b/python/mxnet/gluon/text/vocab.py new file mode 100644 index 000000000000..aa962af24285 --- /dev/null +++ b/python/mxnet/gluon/text/vocab.py @@ -0,0 +1,323 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=consider-iterating-dictionary + +"""Vocabulary.""" +from __future__ import absolute_import +from __future__ import print_function + +import collections +from ... import nd + +from . import _constants as C +from . import embedding as ebd + + +class Vocabulary(object): + """Indexing and embedding attachment for text tokens. + + + Parameters + ---------- + counter : collections.Counter or None, default None + Counts text token frequencies in the text data. Its keys will be indexed according to + frequency thresholds such as `max_size` and `min_freq`. Keys of `counter`, + `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. + Examples: str, int, and tuple. + max_size : None or int, default None + The maximum possible number of the most frequent tokens in the keys of `counter` that can be + indexed. Note that this argument does not count any token from `reserved_tokens`. Suppose + that there are different keys of `counter` whose frequency are the same, if indexing all of + them will exceed this argument value, such keys will be indexed one by one according to + their __cmp__() order until the frequency threshold is met. If this argument is None or + larger than its largest possible value restricted by `counter` and `reserved_tokens`, this + argument has no effect. + min_freq : int, default 1 + The minimum frequency required for a token in the keys of `counter` to be indexed. + unknown_token : hashable object, default '' + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. Keys of `counter`, `unknown_token`, and values of + `reserved_tokens` must be of the same hashable type. 
Examples: str, int, and tuple. + reserved_tokens : list of hashable objects or None, default None + A list of reserved tokens (excluding `unknown_token`) that will always be indexed, such as + special symbols representing padding, beginning of sentence, and end of sentence. It cannot + contain `unknown_token` or duplicate reserved tokens. Keys of `counter`, `unknown_token`, + and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and + tuple. + + + Properties + ---------- + embedding : instance of :class:`~mxnet.gluon.text.embedding.TokenEmbedding` + The embedding of the indexed tokens. + idx_to_token : list of strs + A list of indexed tokens where the list indices and the token indices are aligned. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + + + Examples + -------- + + >>> text_data = " hello world \\\\n hello nice world \\\\n hi world \\\\n" + >>> counter = text.count_tokens_from_str(text_data) + >>> my_vocab = text.Vocabulary(counter) + >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + >>> my_vocab.set_embedding(fasttext) + >>> my_vocab.embedding[['hello', 'world']] + [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... + -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + + >>> my_vocab[['hello', 'world']] + [2, 1] + + >>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape + >>> layer = gluon.nn.Embedding(input_dim, output_dim) + >>> layer.initialize() + >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec) + >>> layer(nd.array([2, 1])) + [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... + -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + + >>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt') + >>> my_vocab.set_embedding(glove) + >>> my_vocab.embedding[['hello', 'world']] + [[ -0.38497001 0.80092001 + ... + 0.048833 0.67203999] + [ -0.41486001 0.71847999 + ... + -0.37639001 -0.67541999]] + + + """ + + def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None): + + # Sanity checks. + assert min_freq > 0, '`min_freq` must be set to a positive value.' + + if reserved_tokens is not None: + reserved_token_set = set(reserved_tokens) + assert unknown_token not in reserved_token_set, \ + '`reserved_token` cannot contain `unknown_token`.' + assert len(reserved_token_set) == len(reserved_tokens), \ + '`reserved_tokens` cannot contain duplicate reserved tokens.' 
+ + self._index_unknown_and_reserved_tokens(unknown_token, reserved_tokens) + + if counter is not None: + self._index_counter_keys(counter, unknown_token, reserved_tokens, max_size, min_freq) + + self._embedding = None + + def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): + """Indexes unknown and reserved tokens.""" + + self._unknown_token = unknown_token + self._idx_to_token = [unknown_token] + + if reserved_tokens is None: + self._reserved_tokens = None + else: + self._reserved_tokens = reserved_tokens[:] + self._idx_to_token.extend(reserved_tokens) + + self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)} + + def _index_counter_keys(self, counter, unknown_token, reserved_tokens, max_size, + min_freq): + """Indexes keys of `counter`. + + + Indexes keys of `counter` according to frequency thresholds such as `max_size` and + `min_freq`. + """ + + assert isinstance(counter, collections.Counter), \ + '`counter` must be an instance of collections.Counter.' + + unknown_and_reserved_tokens = set(reserved_tokens) if reserved_tokens is not None else set() + unknown_and_reserved_tokens.add(unknown_token) + + token_freqs = sorted(counter.items(), key=lambda x: x[0]) + token_freqs.sort(key=lambda x: x[1], reverse=True) + + token_cap = len(unknown_and_reserved_tokens) + ( + len(counter) if max_size is None else max_size) + + for token, freq in token_freqs: + if freq < min_freq or len(self._idx_to_token) == token_cap: + break + if token not in unknown_and_reserved_tokens: + self._idx_to_token.append(token) + self._token_to_idx[token] = len(self._idx_to_token) - 1 + + @property + def embedding(self): + return self._embedding + + @property + def idx_to_token(self): + return self._idx_to_token + + @property + def reserved_tokens(self): + return self._reserved_tokens + + @property + def token_to_idx(self): + return self._token_to_idx + + @property + def unknown_token(self): + return self._unknown_token + + def __contains__(self, token): + """Checks whether a text token exists in the vocabulary. + + + Parameters + ---------- + token : str + A text token. + + + Returns + ------- + bool + Whether the text token exists in the vocabulary (including `unknown_token`). + """ + + return token in self._token_to_idx + + def __getitem__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + + + Parameters + ---------- + tokens : str or list of strs + A source token or tokens to be converted. + + + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. + """ + + if not isinstance(tokens, (list, tuple)): + return self._token_to_idx[tokens] if tokens in self._token_to_idx \ + else C.UNKNOWN_IDX + else: + return [self._token_to_idx[token] if token in self._token_to_idx + else C.UNKNOWN_IDX for token in tokens] + + def __len__(self): + return len(self._idx_to_token) + + def set_embedding(self, *embeddings): + """Attaches embeddings to the indexed text tokens. + + + Parameters + ---------- + embeddings : None or tuple of :class:`~mxnet.gluon.text.embedding.TokenEmbedding` instances + The embedding to be attached to the indexed tokens. If a tuple of multiple embeddings + are provided, their embedding vectors will be concatenated for the same token. 
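The concatenation behaviour described for `set_embedding` can be illustrated with a hedged sketch. The two vector files below (paths, tokens, and values) are invented for the example; tokens missing from one source fall back to that source's unknown vector, which is zeros by default.

```python
import collections
import io

from mxnet.gluon import text

# Two tiny embedding files with different widths (illustrative content only).
with io.open('vec_a.txt', 'w', encoding='utf8') as f:
    f.write(u'red 0.1 0.2\nblue 0.3 0.4\n')
with io.open('vec_b.txt', 'w', encoding='utf8') as f:
    f.write(u'red 1.0 2.0 3.0\ngreen 4.0 5.0 6.0\n')

emb_a = text.embedding.TokenEmbedding.from_file('vec_a.txt', elem_delim=' ')
emb_b = text.embedding.TokenEmbedding.from_file('vec_b.txt', elem_delim=' ')

vocab = text.Vocabulary(collections.Counter(['red', 'blue', 'green']))
vocab.set_embedding(emb_a, emb_b)

# Vectors from the two sources are concatenated: 2 + 3 = 5 columns per token.
print(vocab.embedding.idx_to_vec.shape)   # (4, 5): the unknown token plus 3 indexed tokens
print(vocab.embedding['red'])             # [0.1, 0.2, 1.0, 2.0, 3.0]
print(vocab.embedding['blue'])            # [0.3, 0.4, 0.0, 0.0, 0.0]; 'blue' is absent from vec_b.txt
```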
+ """ + + if len(embeddings) == 1 and embeddings[0] is None: + self._embedding = None + return + + for embedding in embeddings: + assert isinstance(embedding, ebd.TokenEmbedding), \ + 'The argument `embeddings` must be an instance or a list of instances of ' \ + '`mxnet.gluon.text.embedding.TokenEmbedding`.' + + new_embedding = ebd.TokenEmbedding(self.unknown_token) + new_embedding._token_to_idx = self.token_to_idx + new_embedding._idx_to_token = self.idx_to_token + + new_vec_len = sum(embedding.idx_to_vec.shape[1] for embedding in embeddings + if embedding and embedding.idx_to_vec is not None) + new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len)) + + col_start = 0 + # Concatenate all the embedding vectors in embedding. + for embedding in embeddings: + if embedding and embedding.idx_to_vec is not None: + col_end = col_start + embedding.idx_to_vec.shape[1] + # Cancatenate vectors of the unknown token. + new_idx_to_vec[0, col_start:col_end] = embedding[0] + new_idx_to_vec[1:, col_start:col_end] = embedding[self._idx_to_token[1:]] + col_start = col_end + + new_embedding._idx_to_vec = new_idx_to_vec + self._embedding = new_embedding + + def to_tokens(self, indices): + """Converts token indices to tokens according to the vocabulary. + + + Parameters + ---------- + indices : int or list of ints + A source token index or token indices to be converted. + + + Returns + ------- + str or list of strs + A token or a list of tokens according to the vocabulary. + """ + + to_reduce = False + if not isinstance(indices, (list, tuple)): + indices = [indices] + to_reduce = True + + max_idx = len(self._idx_to_token) - 1 + + tokens = [] + for idx in indices: + if not isinstance(idx, int) or idx > max_idx: + raise ValueError('Token index %d in the provided `indices` is invalid.' % idx) + else: + tokens.append(self._idx_to_token[idx]) + + return tokens[0] if to_reduce else tokens diff --git a/tests/python/unittest/test_gluon_text.py b/tests/python/unittest/test_gluon_text.py new file mode 100644 index 000000000000..ad4f4a036e09 --- /dev/null +++ b/tests/python/unittest/test_gluon_text.py @@ -0,0 +1,681 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 + +from __future__ import absolute_import +from __future__ import print_function + +from collections import Counter + +from common import assertRaises +from mxnet import ndarray as nd +from mxnet.gluon import text +from mxnet.test_utils import * + + +def _get_test_str_of_tokens(token_delim, seq_delim): + seq1 = token_delim + token_delim.join(['Life', 'is', 'great', '!']) + token_delim + seq_delim + seq2 = token_delim + token_delim.join(['life', 'is', 'good', '.']) + token_delim + seq_delim + seq3 = token_delim + token_delim.join(['life', "isn't", 'bad', '.']) + token_delim + seq_delim + seqs = seq1 + seq2 + seq3 + return seqs + + +def _test_count_tokens_from_str_with_delims(token_delim, seq_delim): + source_str = _get_test_str_of_tokens(token_delim, seq_delim) + + cnt1 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=False) + assert cnt1 == Counter( + {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, + 'bad': 1}) + + cnt2 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=True) + assert cnt2 == Counter( + {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) + + counter_to_update = Counter({'life': 2}) + + cnt3 = text.utils.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=False, + counter_to_update=counter_to_update.copy()) + assert cnt3 == Counter( + {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, + 'bad': 1}) + + cnt4 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=True, + counter_to_update=counter_to_update.copy()) + assert cnt4 == Counter( + {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) + + +def test_count_tokens_from_str(): + _test_count_tokens_from_str_with_delims(' ', '\n') + _test_count_tokens_from_str_with_delims('IS', 'LIFE') + + +def test_vocabulary_getitem(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + vocab = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + + i1 = vocab['c'] + assert i1 == 1 + + i2 = vocab[['c']] + assert i2 == [1] + + i3 = vocab[['', 'non-exist']] + assert i3 == [0, 0] + + i4 = vocab[['a', 'non-exist', 'a', 'b']] + assert i4 == [3, 0, 3, 2] + + +def test_vocabulary_to_tokens(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + vocab = text.Vocabulary(counter, max_size=None, min_freq=1,unknown_token='', + reserved_tokens=None) + i1 = vocab.to_tokens(1) + assert i1 == 'c' + + i2 = vocab.to_tokens([1]) + assert i2 == ['c'] + + i3 = vocab.to_tokens([0, 0]) + assert i3 == ['', ''] + + i4 = vocab.to_tokens([3, 0, 3, 2]) + assert i4 == ['a', '', 'a', 'b'] + + assertRaises(ValueError, vocab.to_tokens, 5) + assertRaises(ValueError, vocab.to_tokens, [5, 6]) + + +def test_vocabulary(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v1) == 5 + assert v1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} + assert v1.idx_to_token[1] == 'c' + assert v1.unknown_token == '' + assert v1.reserved_tokens is None + assert v1.embedding is None + assert 'a' in v1 + assert v1.unknown_token in v1 + + v2 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v2) == 3 + assert v2.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v2.idx_to_token[1] == 'c' + 
assert v2.unknown_token == '' + assert v2.reserved_tokens is None + assert v2.embedding is None + assert 'a' not in v2 + assert v2.unknown_token in v2 + + v3 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=None) + assert len(v3) == 1 + assert v3.token_to_idx == {'': 0} + assert v3.idx_to_token[0] == '' + assert v3.unknown_token == '' + assert v3.reserved_tokens is None + assert v3.embedding is None + assert 'a' not in v3 + + v4 = text.Vocabulary(counter, max_size=2, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v4) == 3 + assert v4.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v4.idx_to_token[1] == 'c' + assert v4.unknown_token == '' + assert v4.reserved_tokens is None + assert v4.embedding is None + assert 'a' not in v4 + + v5 = text.Vocabulary(counter, max_size=3, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v5) == 4 + assert v5.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3} + assert v5.idx_to_token[1] == 'c' + assert v5.unknown_token == '' + assert v5.reserved_tokens is None + assert v5.embedding is None + assert 'a' in v5 + + v6 = text.Vocabulary(counter, max_size=100, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v6) == 5 + assert v6.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, + 'some_word$': 4} + assert v6.idx_to_token[1] == 'c' + assert v6.unknown_token == '' + assert v6.reserved_tokens is None + assert v6.embedding is None + assert 'a' in v6 + + v7 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v7) == 2 + assert v7.token_to_idx == {'': 0, 'c': 1} + assert v7.idx_to_token[1] == 'c' + assert v7.unknown_token == '' + assert v7.reserved_tokens is None + assert v7.embedding is None + assert 'a' not in v7 + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=0, unknown_token='', reserved_tokens=['b']) + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', 'b']) + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', '']) + + v8 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=['b']) + assert len(v8) == 5 + assert v8.token_to_idx == {'': 0, 'b': 1, 'c': 2, 'a': 3, 'some_word$': 4} + assert v8.idx_to_token[1] == 'b' + assert v8.unknown_token == '' + assert v8.reserved_tokens == ['b'] + assert v8.embedding is None + assert 'a' in v8 + + v9 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', + reserved_tokens=['b', 'a']) + assert len(v9) == 4 + assert v9.token_to_idx == {'': 0, 'b': 1, 'a': 2, 'c': 3} + assert v9.idx_to_token[1] == 'b' + assert v9.unknown_token == '' + assert v9.reserved_tokens == ['b', 'a'] + assert v9.embedding is None + assert 'a' in v9 + + v10 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=['b', 'c']) + assert len(v10) == 3 + assert v10.token_to_idx == {'': 0, 'b': 1, 'c': 2} + assert v10.idx_to_token[1] == 'b' + assert v10.unknown_token == '' + assert v10.reserved_tokens == ['b', 'c'] + assert v10.embedding is None + assert 'a' not in v10 + + v11 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=['', 'b']) + assert len(v11) == 4 + assert v11.token_to_idx == {'': 0, '': 1, 'b': 2, 'c': 3} + assert v11.idx_to_token[1] == '' + assert v11.unknown_token == '' + assert v11.reserved_tokens == 
['', 'b'] + assert v11.embedding is None + assert 'a' not in v11 + + v12 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='b', + reserved_tokens=['']) + assert len(v12) == 3 + assert v12.token_to_idx == {'b': 0, '': 1, 'c': 2} + assert v12.idx_to_token[1] == '' + assert v12.unknown_token == 'b' + assert v12.reserved_tokens == [''] + assert v12.embedding is None + assert 'a' not in v12 + + v13 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='a', + reserved_tokens=['']) + assert len(v13) == 4 + assert v13.token_to_idx == {'a': 0, '': 1, 'c': 2, 'b': 3} + assert v13.idx_to_token[1] == '' + assert v13.unknown_token == 'a' + assert v13.reserved_tokens == [''] + assert v13.embedding is None + assert 'a' in v13 + + counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), ('c', 'c'), ('c', 'c'), ('c', 'c'), + ('some_word$', 'some_word$')]) + + v14 = text.Vocabulary(counter_tuple, max_size=None, min_freq=1, + unknown_token=('', ''), reserved_tokens=None) + assert len(v14) == 5 + assert v14.token_to_idx == {('', ''): 0, ('c', 'c'): 1, ('b', 'b'): 2, ('a', 'a'): 3, + ('some_word$', 'some_word$'): 4} + assert v14.idx_to_token[1] == ('c', 'c') + assert v14.unknown_token == ('', '') + assert v14.reserved_tokens is None + assert v14.embedding is None + assert ('a', 'a') in v14 + assert ('', '') in v14 + + +def _mk_my_pretrain_file(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seqs = seq1 + seq2 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file2(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n' + seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n' + seqs = seq1 + seq2 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file3(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['', '1.1', '1.2', '1.3', '1.4', + '1.5']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file4(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n' + seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n' + seq3 = token_delim.join(['', '0.11', '0.12', '0.13', '0.14', '0.15']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_invalid_pretrain_file(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['c']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, 
pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_invalid_pretrain_file2(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['c', '0.6', '0.7', '0.8']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def test_token_embedding_from_file(): + embed_root = 'embedding' + embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file = 'my_pretrain_file.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file) + + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) + + my_embed = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim) + + assert 'a' in my_embed + assert my_embed.unknown_token == '' + assert my_embed.unknown_token in my_embed + + first_vec = my_embed.idx_to_vec[0] + assert_almost_equal(first_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) + + # Test __getitem__. + unk_vec = my_embed['A'] + assert_almost_equal(unk_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) + + a_vec = my_embed['a'] + assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5])) + + # Test __setitem__. + my_embed['a'] = nd.array([1, 2, 3, 4, 5]) + assert_almost_equal(my_embed['a'].asnumpy(), np.array([1, 2, 3, 4, 5])) + assertRaises(ValueError, my_embed.__setitem__, 'unknown$$$', nd.array([0, 0, 0, 0, 0])) + + assertRaises(AssertionError, my_embed.__setitem__, '', + nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) + + assertRaises(AssertionError, my_embed.__setitem__, '', nd.array([0])) + + unk_vecs = my_embed['', ''] + assert_almost_equal(unk_vecs.asnumpy(), np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) + + # Test loaded unknown vectors. + pretrain_file2 = 'my_pretrain_file2.txt' + _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2) + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2) + my_embed2 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones, + unknown_token='') + unk_vec2 = my_embed2[''] + assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) + unk_vec2 = my_embed2[''] + assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) + + my_embed3 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones, + unknown_token='') + unk_vec3 = my_embed3[''] + assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) + unk_vec3 = my_embed3[''] + assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) + + # Test error handling. 
+ invalid_pretrain_file = 'invalid_pretrain_file.txt' + _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, + invalid_pretrain_file) + pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, + elem_delim) + + invalid_pretrain_file2 = 'invalid_pretrain_file2.txt' + _mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, + invalid_pretrain_file2) + pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file2) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, + elem_delim) + + +def test_embedding_get_and_pretrain_file_names(): + assert len(text.embedding.get_file_names(embedding_name='fasttext')) == 327 + assert len(text.embedding.get_file_names(embedding_name='glove')) == 10 + + reg = text.embedding.get_file_names(embedding_name=None) + + assert len(reg['glove']) == 10 + assert len(reg['fasttext']) == 327 + + assertRaises(KeyError, text.embedding.get_file_names, 'unknown$$') + + +def test_vocab_set_embedding_with_one_custom_embedding(): + embed_root = 'embedding' + embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file = 'my_pretrain_file1.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file) + + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) + + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=['']) + + e1 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones) + + assert v1.embedding is None + v1.set_embedding(e1) + assert v1.embedding is not None + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.2, 0.3, 0.4, 0.5], + [1, 1, 1, 1, 1]]) + ) + + assert_almost_equal(v1.embedding['c'].asnumpy(), + np.array([1, 1, 1, 1, 1]) + ) + + assert_almost_equal(v1.embedding[['c']].asnumpy(), + np.array([[1, 1, 1, 1, 1]]) + ) + + assert_almost_equal(v1.embedding[['a', 'not_exist']].asnumpy(), + np.array([[0.1, 0.2, 0.3, 0.4, 0.5], + [1, 1, 1, 1, 1]]) + ) + + assert_almost_equal(v1.embedding[['a', 'b']].asnumpy(), + np.array([[0.1, 0.2, 0.3, 0.4, 0.5], + [0.6, 0.7, 0.8, 0.9, 1]]) + ) + + assert_almost_equal(v1.embedding[['A', 'b']].asnumpy(), + np.array([[1, 1, 1, 1, 1], + [0.6, 0.7, 0.8, 0.9, 1]]) + ) + + v1.embedding['a'] = nd.array([2, 2, 2, 2, 2]) + v1.embedding['b'] = nd.array([3, 3, 3, 3, 3]) + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + + v1.embedding[''] = nd.array([0, 0, 0, 0, 0]) + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + v1.embedding[''] = nd.array([10, 10, 10, 10, 10]) + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[10, 10, 10, 10, 10], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + + v1.set_embedding(None) + assert v1.embedding is None + + +def test_vocab_set_embedding_with_two_custom_embeddings(): + embed_root = '.' 
+ embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file1 = 'my_pretrain_file1.txt' + pretrain_file2 = 'my_pretrain_file2.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file1) + _mk_my_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2) + + pretrain_file_path1 = os.path.join(embed_root, embed_name, pretrain_file1) + pretrain_file_path2 = os.path.join(embed_root, embed_name, pretrain_file2) + + my_embed1 = text.embedding.TokenEmbedding.from_file(pretrain_file_path1, elem_delim, + init_unknown_vec=nd.ones) + my_embed2 = text.embedding.TokenEmbedding.from_file(pretrain_file_path2, elem_delim) + + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v1.set_embedding(my_embed1, my_embed2) + assert v1.embedding is not None + + assertRaises(AssertionError, v1.set_embedding, my_embed1, None, my_embed2) + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + assert_almost_equal(v1.embedding['c'].asnumpy(), + np.array([1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1]) + ) + + assert_almost_equal(v1.embedding[['b', 'not_exist']].asnumpy(), + np.array([[0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + v1.embedding['a'] = nd.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) + v1.embedding['b'] = nd.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3]) + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], + [3, 3, 3, 3, 3, 3, 3, 3, 3, 3], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + # Test loaded unknown tokens + pretrain_file3 = 'my_pretrain_file3.txt' + pretrain_file4 = 'my_pretrain_file4.txt' + + _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file3) + _mk_my_pretrain_file4(os.path.join(embed_root, embed_name), elem_delim, pretrain_file4) + + pretrain_file_path3 = os.path.join(embed_root, embed_name, pretrain_file3) + pretrain_file_path4 = os.path.join(embed_root, embed_name, pretrain_file4) + + my_embed3 = text.embedding.TokenEmbedding.from_file(pretrain_file_path3, elem_delim, + init_unknown_vec=nd.ones, unknown_token='') + my_embed4 = text.embedding.TokenEmbedding.from_file(pretrain_file_path4, elem_delim, + unknown_token='') + + v2 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v2.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v2.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + v3 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v3.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 
0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + v4 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v4.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + counter2 = Counter(['b', 'b', 'c', 'c', 'c', 'some_word$']) + + v5 = text.Vocabulary(counter2, max_size=None, min_freq=1, unknown_token='a', + reserved_tokens=None) + v5.set_embedding(my_embed3, my_embed4) + assert v5.embedding._token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3} + assert v5.embedding._idx_to_token == ['a', 'c', 'b', 'some_word$'] + assert_almost_equal(v5.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + +def test_download_embed(): + @text.embedding.register + class Test(text.embedding.TokenEmbedding): + # 33 bytes. + pretrained_file_name_sha1 = \ + {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} + namespace = 'test' + + def __init__(self, embedding_root='embedding', init_unknown_vec=nd.zeros, **kwargs): + file_name = 'embedding_test.vec' + Test._check_pretrained_file_names(file_name) + + super(Test, self).__init__(**kwargs) + + file_path = Test._get_pretrained_file(embedding_root, file_name) + + self._load_embedding(file_path, ' ', init_unknown_vec) + + test_embed = text.embedding.create('test') + assert_almost_equal(test_embed['hello'].asnumpy(), (nd.arange(5) + 1).asnumpy()) + assert_almost_equal(test_embed['world'].asnumpy(), (nd.arange(5) + 6).asnumpy()) + assert_almost_equal(test_embed[''].asnumpy(), nd.zeros((5,)).asnumpy()) + + +if __name__ == '__main__': + import nose + nose.runmodule()