From 4e2f8e9f689919b33f8951638a6d049879db58a4 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Thu, 8 Mar 2018 15:44:29 -0800 Subject: [PATCH 01/20] [MXNET-67] Sync master with v1.1.0 branch (#10031) * [REVIEW REQUIRED] Revert PR #9484 & add additional dependency licenses to LICENSE file (#9701) * Revert "[Review Required] Fixing Licenses: Cleaning up the Top Level LICENSE file (#9484)" This reverts commit 8930d96b265560a797c5554a9617f607cea7740f. * Some more LICENSE fixes * Adding some more packages to the LICENSE file * Adding dependencies of dependencies * update v1.1.0 change log to NEWS.md * sync README.md from v1.1.0 branch * revert to correct jenkins url in README --- LICENSE | 312 ++++++++++++++++++++++++++++++++++++++++++++++++------ NEWS.md | 41 +++++++ README.md | 1 + 3 files changed, 320 insertions(+), 34 deletions(-) diff --git a/LICENSE b/LICENSE index d3b3d6f9dd0f..e7d50c377232 100644 --- a/LICENSE +++ b/LICENSE @@ -201,43 +201,145 @@ See the License for the specific language governing permissions and limitations under the License. - ======================================================================= + ====================================================================================== Apache MXNET (incubating) Subcomponents: - The Apache MXNET (incubating) project contains subcomponents with separate - copyright notices and license terms. Your use of the source code for the these + The Apache MXNET (incubating) project contains subcomponents with separate copyright + notices and license terms. Your use of the source code for the these subcomponents is subject to the terms and conditions of the following - licenses - - - ======================================================================== - 1. Apache-2.0 license as above, wherever applicable - ======================================================================== - - ======================================================================== - 2. MIT license wherever applicable - ======================================================================== - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and associated documentation files (the "Software"), - to deal in the Software without restriction, including without limitation - the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - - ======================================================================== - 3. BSD License wherever applicable - ======================================================================== + licenses. 
+ + ======================================================================================= + Apache-2.0 licenses + ======================================================================================= + + The following components are provided under an Apache 2.0 license. + + 1. MXNet Cpp-package - For details, /cpp-package/LICENSE + 2. MXNet rcnn - For details, see, example/rcnn/LICENSE + 3. scala-package - For details, see, scala-package/LICENSE + 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE + 5. dlpack - For details, see, dlpack/LICENSE + 6. dmlc-core - For details, see, dmlc-core/LICENSE + 7. mshadow - For details, see, mshadow/LICENSE + 8. nnvm/dmlc-core - For details, see, nnvm/dmlc-core/LICENSE + 9. nnvm - For details, see, nnvm/LICENSE + 10. nnvm-fusion - For details, see, nnvm/plugin/nnvm-fusion/LICENSE + 11. ps-lite - For details, see, ps-lite/LICENSE + 12. nnvm/tvm - For details, see, nnvm/tvm/LICENSE + 13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE + + + ======================================================================================= + MIT licenses + ======================================================================================= + + 1. Fast R-CNN - For details, see example/rcnn/LICENSE + 2. Faster R-CNN - For details, see example/rcnn/LICENSE + 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE + 4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt + 5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE + + + ======================================================================================= + NVIDIA Licenses + ======================================================================================= + + 1. Moderngpu + For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE + + /****************************************************************************** + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + 2. 
CUB Library + For details, see, 3rdparty/cub/LICENSE.TXT + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + + ======================================================================================= + Other Licenses + ======================================================================================= + + 1. Caffe + For details, see, example/rcnn/LICENSE + + LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + CONTRIBUTION AGREEMENT + + By contributing to the BVLC/caffe repository through pull-request, comment, + or otherwise, the contributor releases their content to the + license and copyright terms herein. + + ======================================================================================= + + 2. 
MS COCO API + For details, see, example/rcnn/LICENSE + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -262,6 +364,148 @@ of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. + ======================================================================================= + + 3. Sphinx JavaScript utilties for the full-text search + For details, see, docs/_static/searchtools_custom.js + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 4. FindCrypto.cmake + For details, see, dmlc-core/cmake/Modules/FindCrypto.cmake, + Redistribution and use is allowed according to the terms of the BSD license. + + ======================================================================================= + + 5. Googlemock + For details, see, 3rdparty/googletest/googlemock/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 6. Googletest + For details, see, 3rdparty/googletest/googletest/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 7. OpenMP Testsuite + For details, see, 3rdparty/openmp/testsuite/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + o Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + o Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + o Neither the name of the University of Houston System nor the names of its + contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 8. Semaphore implementation in blockingconcurrentqueue.h + This file uses a semaphore implementation under the terms of its separate zlib license. + For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h + + ======================================================================================= + + 9. blockingconcurrentqueue.h + This file is Distributed under the terms of the simplified BSD license. + For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h + + ======================================================================================= diff --git a/NEWS.md b/NEWS.md index fc6b10188fc7..a51b514c1a51 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,46 @@ MXNet Change Log ================ +## 1.1.0 +### Usability Improvements +- Improved the usability of examples and tutorials +### Bug-fixes +- Fixed I/O multiprocessing for too many open file handles (#8904), race condition (#8995), deadlock (#9126). +- Fixed image IO integration with OpenCV 3.3 (#8757). +- Fixed Gluon block printing (#8956). +- Fixed float16 argmax when there is negative input. (#9149) +- Fixed random number generator to ensure sufficient randomness. (#9119, #9256, #9300) +- Fixed custom op multi-GPU scaling (#9283) +- Fixed gradient of gather_nd when duplicate entries exist in index. (#9200) +- Fixed overriden contexts in Module `group2ctx` option when using multiple contexts (#8867) +- Fixed `swap_axes` operator with "add_to" gradient req (#9541) +### New Features +- Added experimental API in `contrib.text` for building vocabulary, and loading pre-trained word embeddings, with built-in support for 307 GloVe and FastText pre-trained embeddings. (#8763) +- Added experimental structural blocks in `gluon.contrib`: `Concurrent`, `HybridConcurrent`, `Identity`. (#9427) +- Added `sparse.dot(dense, csr)` operator (#8938) +- Added `Khatri-Rao` operator (#7781) +- Added `FTML` and `Signum` optimizer (#9220, #9262) +- Added `ENABLE_CUDA_RTC` build option (#9428) +### API Changes +- Added zero gradients to rounding operators including `rint`, `ceil`, `floor`, `trunc`, and `fix` (#9040) +- Added `use_global_stats` in `nn.BatchNorm` (#9420) +- Added `axis` argument to `SequenceLast`, `SequenceMask` and `SequenceReverse` operators (#9306) +- Added `lazy_update` option for standard `SGD` & `Adam` optimizer with `row_sparse` gradients (#9468, #9189) +- Added `select` option in `Block.collect_params` to support regex (#9348) +- Added support for (one-to-one and sequence-to-one) inference on explicit unrolled RNN models in R (#9022) +### Deprecations +- The Scala API name space is still called `ml.dmlc`. 
The name space is likely be changed in a future release to `org.apache` and might brake existing applications and scripts (#9579, #9324) +### Performance Improvements +- Improved GPU inference speed by 20% when batch size is 1 (#9055) +- Improved `SequenceLast` operator speed (#9306) +- Added multithreading for the class of broadcast_reduce operators on CPU (#9444) +- Improved batching for GEMM/TRSM operators with large matrices on GPU (#8846) +### Known Issues +- "Predict with pre-trained models" tutorial is broken +- "example/numpy-ops/ndarray_softmax.py" is broken + +For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.1.0+Release+Notes) + + ## 1.0.0 ### Performance - Enhanced the performance of `sparse.dot` operator. diff --git a/README.md b/README.md index 57f042d09841..dbec65b0f365 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers. What's New ---------- +* [Version 1.1.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.1.0) - MXNet 1.1.0 Release. * [Version 1.0.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.0.0) - MXNet 1.0.0 Release. * [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release. * [Version 0.12.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.0) - MXNet 0.12.0 Release. From 59f0306ca0e590d344db5135851b77087cc98366 Mon Sep 17 00:00:00 2001 From: XinyuChen Date: Fri, 9 Mar 2018 09:56:39 +0800 Subject: [PATCH 02/20] Parallelization for ROIpooling OP (#9958) * parallelization for roipooling * remove some useless computation * remove useless muls * add author and retriggering * retrigger again --- src/operator/roi_pooling.cc | 48 +++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index 10d1420950cc..acff1f97dcce 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file roi_pooling.cc * \brief roi pooling operator - * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo + * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo, Xinyu Chen */ #include "./roi_pooling-inl.h" #include @@ -54,13 +54,22 @@ inline void ROIPoolForward(const Tensor &out, const int num_rois = bbox.size(0); const int data_size = data.size(1) * data.size(2) * data.size(3); + const int data_size_c = data.size(2) * data.size(3); + const int out_size_c = out.size(2) * out.size(3); + const int out_size = channels_ * out_size_c; + const int max_idx_size_c = max_idx.size(2) * max_idx.size(3); + const int max_idx_size = channels_ * max_idx_size_c; // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R for (int n = 0; n < num_rois; ++n) { - int roi_batch_ind = bottom_rois[0]; - int roi_start_w = round(bottom_rois[1] * spatial_scale_); - int roi_start_h = round(bottom_rois[2] * spatial_scale_); - int roi_end_w = round(bottom_rois[3] * spatial_scale_); - int roi_end_h = round(bottom_rois[4] * spatial_scale_); + // Increment ROI data pointer + const Dtype *bottom_rois_n = bottom_rois + n * bbox.size(1); + Dtype *top_data_n = top_data + n * out_size; + Dtype *argmax_data_n = argmax_data + n * max_idx_size; + int roi_batch_ind = bottom_rois_n[0]; + int roi_start_w = round(bottom_rois_n[1] * spatial_scale_); + int roi_start_h = round(bottom_rois_n[2] * 
spatial_scale_); + int roi_end_w = round(bottom_rois_n[3] * spatial_scale_); + int roi_end_h = round(bottom_rois_n[4] * spatial_scale_); assert(roi_batch_ind >= 0); assert(static_cast(roi_batch_ind) < data.size(0) /* batch size */); @@ -74,12 +83,18 @@ inline void ROIPoolForward(const Tensor &out, const Dtype* batch_data = bottom_data + data_size * roi_batch_ind; + #pragma omp parallel for for (int c = 0; c < channels_; ++c) { + // Increment all data pointers + const Dtype* batch_data_c = batch_data + c * data_size_c; + Dtype* top_data_c = top_data_n + c * out_size_c; + Dtype* argmax_data_c = argmax_data_n + c * max_idx_size_c; + for (int ph = 0; ph < pooled_height_; ++ph) { for (int pw = 0; pw < pooled_width_; ++pw) { // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) @@ -98,30 +113,23 @@ inline void ROIPoolForward(const Tensor &out, const int pool_index = ph * pooled_width_ + pw; if (is_empty) { - top_data[pool_index] = 0; - argmax_data[pool_index] = -1; + top_data_c[pool_index] = 0; + argmax_data_c[pool_index] = -1; } for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width_ + w; - if (batch_data[index] > top_data[pool_index]) { - top_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; + if (batch_data_c[index] > top_data_c[pool_index]) { + top_data_c[pool_index] = batch_data_c[index]; + argmax_data_c[pool_index] = index; } } } } } - // Increment all data pointers by one channel - batch_data += data.size(2) * data.size(3); - top_data += out.size(2) * out.size(3); - argmax_data += max_idx.size(2) * max_idx.size(3); } - // Increment ROI data pointer - bottom_rois += bbox.size(1); } - return; } From 1e270b1acb34e539886429ec7e8d36f941c3e145 Mon Sep 17 00:00:00 2001 From: chsin Date: Fri, 9 Mar 2018 03:06:17 -0500 Subject: [PATCH 03/20] comments to copy and copyto are corrected (#10040) --- cpp-package/include/mxnet-cpp/ndarray.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h index 082c06981cf9..1166643e4e8a 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.h +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -291,15 +291,15 @@ class NDArray { */ void SyncCopyToCPU(std::vector *data, size_t size = 0); /*! - * \brief Copy the content of current array to other. - * \param other the new context of this NDArray - * \return the new copy + * \brief copy the content of current array to a target array. + * \param other the target NDArray + * \return the target NDarray */ NDArray CopyTo(NDArray * other) const; /*! - * \brief return a new copy this NDArray - * \param other the target NDArray - * \return the copy target NDarray + * \brief return a new copy to this NDArray + * \param Context the new context of this NDArray + * \return the new copy */ NDArray Copy(const Context &) const; /*! From 63074ce96db71d277cb3b7a7249f9528a45c0df8 Mon Sep 17 00:00:00 2001 From: chinakook Date: Fri, 9 Mar 2018 16:12:28 +0800 Subject: [PATCH 04/20] Bug Fix and performance optimized for rtc (#10018) * Bug Fix and performance optimized for rtc 1. 
"super().__init__()" bug is fixed in python 2. 2. Kernel is initialized in the stage of operator init. * Update custom_softmax_rtc.py fix unnessesary format --- example/numpy-ops/custom_softmax_rtc.py | 131 +++++++++++++----------- 1 file changed, 72 insertions(+), 59 deletions(-) diff --git a/example/numpy-ops/custom_softmax_rtc.py b/example/numpy-ops/custom_softmax_rtc.py index 906cbbeac04c..d07041b002d3 100644 --- a/example/numpy-ops/custom_softmax_rtc.py +++ b/example/numpy-ops/custom_softmax_rtc.py @@ -23,51 +23,77 @@ class Softmax(mx.operator.CustomOp): def __init__(self): - self.fwd_kernel_mod = None - self.bwd_kernel_mod = None - super().__init__() + super(Softmax,self).__init__() + # Each thread processes a row (a sample in the batch). + fwd_src = r""" + template + __global__ void fwd(const DType* x, DType* y, const int row_size, const int req) { + const int offset = row_size * threadIdx.x; + DType max = x[offset]; + for(int i = 1; i < row_size; ++i) { + if(max < x[offset + i]) { + max = x[offset + i]; + } + } + DType sum = 0; + for(int i = 0; i < row_size; ++i) { + sum += exp(x[offset + i] - max); + } + switch(req) { + case 1: + for(int i = 0; i < row_size; ++i) { + y[offset + i] = exp(x[offset + i] - max) / sum; + } + break; + case 2: + for(int i = 0; i < row_size; ++i) { + y[offset + i] += exp(x[offset + i] - max) / sum; + } + break; + } + } + """ + + # Each block processes a row and each thread in a block calculate an element of `dx`. + bwd_src = r""" + template + __global__ void bwd(const DType* l, const DType* y, DType* dx, const int req) { + const int z = static_cast(l[blockIdx.x]); + const int i = threadIdx.x + blockDim.x * blockIdx.x; + if(req == 1) { + dx[i] = threadIdx.x == z ? y[i] - 1 : y[i]; + } else { + dx[i] += threadIdx.x == z ? y[i] - 1 : y[i]; + } + } + """ + fwd_kernel_mod = mx.rtc.CudaModule(fwd_src, exports=["fwd", "fwd"]) + bwd_kernel_mod = mx.rtc.CudaModule(bwd_src, exports=["bwd", "bwd"]) + + fwd_kernel_float_signature = "const float*, const float*, const int, const int" + self.fwd_float_kernel = fwd_kernel_mod.get_kernel("fwd", fwd_kernel_float_signature) + + bwd_kernel_float_signature = "const float*, const float*, float*, const int" + self.bwd_float_kernel = bwd_kernel_mod.get_kernel("bwd", bwd_kernel_float_signature) + + fwd_kernel_double_signature = "const double*, const double*, const int, const int" + self.fwd_double_kernel = fwd_kernel_mod.get_kernel("fwd", fwd_kernel_double_signature) + + bwd_kernel_double_signature = "const double*, const double*, double*, const int" + self.bwd_double_kernel = bwd_kernel_mod.get_kernel("bwd", bwd_kernel_double_signature) def forward(self, is_train, req, in_data, out_data, aux): if req[0] == "null": return x = in_data[0] # input y = out_data[0] # output - if self.fwd_kernel_mod is None: - # Each thread processes a row (a sample in the batch). 
- src = r""" - template - __global__ void fwd(const DType* x, DType* y, const int row_size, const int req) { - const int offset = row_size * threadIdx.x; - DType max = x[offset]; - for(int i = 1; i < row_size; ++i) { - if(max < x[offset + i]) { - max = x[offset + i]; - } - } - DType sum = 0; - for(int i = 0; i < row_size; ++i) { - sum += exp(x[offset + i] - max); - } - switch(req) { - case 1: - for(int i = 0; i < row_size; ++i) { - y[offset + i] = exp(x[offset + i] - max) / sum; - } - break; - case 2: - for(int i = 0; i < row_size; ++i) { - y[offset + i] += exp(x[offset + i] - max) / sum; - } - break; - } - } - """ - self.fwd_kernel_mod = mx.rtc.CudaModule(src, exports=["fwd", "fwd"]) - dtype = "double" if y.dtype == np.float64 else "float" - kernel_signature = "const {0}*, const {0}*, const int, const int".format(dtype) - kernel = self.fwd_kernel_mod.get_kernel("fwd<{}>".format(dtype), kernel_signature) - # args, ctx, grid_shape, block_shape, shared_mem = 0 - kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) + + if y.dtype == np.float64: + # args, ctx, grid_shape, block_shape, shared_mem = 0 + self.fwd_double_kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) + else: + # args, ctx, grid_shape, block_shape, shared_mem = 0 + self.fwd_float_kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): if req[0] == "null": @@ -75,26 +101,13 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): l = in_data[1] # label y = out_data[0] # output from the forward pass dx = in_grad[0] # the storage for the gradient - if self.bwd_kernel_mod is None: - # Each block processes a row and each thread in a block calculate an element of `dx`. - src = r""" - template - __global__ void bwd(const DType* l, const DType* y, DType* dx, const int req) { - const int z = static_cast(l[blockIdx.x]); - const int i = threadIdx.x + blockDim.x * blockIdx.x; - if(req == 1) { - dx[i] = threadIdx.x == z ? y[i] - 1 : y[i]; - } else { - dx[i] += threadIdx.x == z ? 
y[i] - 1 : y[i]; - } - } - """ - self.bwd_kernel_mod = mx.rtc.CudaModule(src, exports=["bwd", "bwd"]) - dtype = "double" if dx.dtype == np.float64 else "float" - kernel_signature = "const {0}*, const {0}*, {0}*, const int".format(dtype) - kernel = self.bwd_kernel_mod.get_kernel("bwd<{}>".format(dtype), kernel_signature) - # args, ctx, grid_shape, block_shape, shared_mem = 0 - kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) + + if dx.dtype == np.float64: + # args, ctx, grid_shape, block_shape, shared_mem = 0 + self.bwd_double_kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) + else: + # args, ctx, grid_shape, block_shape, shared_mem = 0 + self.bwd_float_kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) def _reqCode(self, req): if(req == "write"): From df974e0d9c4ce74e86543ca0bab00891adabe3c4 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Sun, 11 Mar 2018 15:55:48 -0700 Subject: [PATCH 05/20] set embedding --- python/mxnet/text/__init__.py | 25 + python/mxnet/text/_constants.py | 704 +++++++++++++++++++++++++++++ python/mxnet/text/embedding.py | 553 ++++++++++++++++++++++ python/mxnet/text/utils.py | 77 ++++ python/mxnet/text/vocab.py | 254 +++++++++++ tests/python/unittest/test_text.py | 648 ++++++++++++++++++++++++++ 6 files changed, 2261 insertions(+) create mode 100644 python/mxnet/text/__init__.py create mode 100644 python/mxnet/text/_constants.py create mode 100644 python/mxnet/text/embedding.py create mode 100644 python/mxnet/text/utils.py create mode 100644 python/mxnet/text/vocab.py create mode 100644 tests/python/unittest/test_text.py diff --git a/python/mxnet/text/__init__.py b/python/mxnet/text/__init__.py new file mode 100644 index 000000000000..44e1ad980bc5 --- /dev/null +++ b/python/mxnet/text/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""This module includes utilities for indexing and embedding text.""" + +from .vocab import * + +from . import embedding + +from .utils import * diff --git a/python/mxnet/text/_constants.py b/python/mxnet/text/_constants.py new file mode 100644 index 000000000000..74530e05e779 --- /dev/null +++ b/python/mxnet/text/_constants.py @@ -0,0 +1,704 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 + +"""Read text files and load embedding.""" +from __future__ import absolute_import +from __future__ import print_function + +UNKNOWN_IDX = 0 + +GLOVE_PRETRAINED_FILE_SHA1 = \ + {'glove.42B.300d.zip': 'f8e722b39578f776927465b71b231bae2ae8776a', + 'glove.6B.zip': 'b64e54f1877d2f735bdd000c1d7d771e25c7dfdc', + 'glove.840B.300d.zip': '8084fbacc2dee3b1fd1ca4cc534cbfff3519ed0d', + 'glove.twitter.27B.zip': 'dce69c404025a8312c323197347695e81fd529fc'} + +GLOVE_PRETRAINED_ARCHIVE_SHA1 = \ + {'glove.42B.300d.txt': '876767977d6bd4d947c0f84d44510677bc94612a', + 'glove.6B.50d.txt': '21bf566a9d27f84d253e0cd4d4be9dcc07976a6d', + 'glove.6B.100d.txt': '16b1dbfaf35476790bd9df40c83e2dfbd05312f1', + 'glove.6B.200d.txt': '17d0355ddaa253e298ede39877d1be70f99d9148', + 'glove.6B.300d.txt': '646443dd885090927f8215ecf7a677e9f703858d', + 'glove.840B.300d.txt': '294b9f37fa64cce31f9ebb409c266fc379527708', + 'glove.twitter.27B.25d.txt': + '767d80889d8c8a22ae7cd25e09d0650a6ff0a502', + 'glove.twitter.27B.50d.txt': + '9585f4be97e286339bf0112d0d3aa7c15a3e864d', + 'glove.twitter.27B.100d.txt': + '1bbeab8323c72332bd46ada0fc3c99f2faaa8ca8', + 'glove.twitter.27B.200d.txt': + '7921c77a53aa5977b1d9ce3a7c4430cbd9d1207a'} + +FAST_TEXT_ARCHIVE_SHA1 = \ + {'crawl-300d-2M.zip': 'bb40313d15837ceecc1e879bc954e9be04b17c3c', + 'wiki.aa.zip': '0d85feb259e17d5258f38b2b615a2b87cd628427', + 'wiki.ab.zip': '7a8c555b9cf3837c9b31c901e9e0142209990365', + 'wiki.ace.zip': '51555fccbe53b726f6c86a84d704c026a78dd02f', + 'wiki.ady.zip': '725d2c30c03001c941ac4084549c55c7f8e1d766', + 'wiki.af.zip': '1a18d34e1b60433b837f5850750a44ca3845323d', + 'wiki.ak.zip': 'daecc2303cfd05bc6c33b24d78c14e0d7f33e3a7', + 'wiki.als.zip': '38851192e0b556e566be6c3c93370abf9867e525', + 'wiki.am.zip': '4576e0121448564b07f448e05e287236343f17c1', + 'wiki.ang.zip': '9c03da3b06d4becef5d387b9a61438b9362fc36a', + 'wiki.an.zip': '170f60bdd161cf8e4b5e018acd7d36e8bfc457a6', + 'wiki.arc.zip': 'c8dad8b00865bf736b087e7b323999ab404bda29', + 'wiki.ar.zip': '34e9869daa463fdc5609040ff33a03e67512e9fd', + 'wiki.arz.zip': '2d2790e11e401d46e1bce2970ee5264d5678a32b', + 'wiki.ast.zip': '1136515e2de556c077324bcd42ffe7f40c8d94c6', + 'wiki.as.zip': 'f9efde3e4ccda4a1e93fa275a3210f74036e9e46', + 'wiki.av.zip': '9f8568a3e094a48de4a3b6bea3bdb6fd7e875a08', + 'wiki.ay.zip': 'f09a422cedc6a0f15fbf30d290febe8057de83db', + 'wiki.azb.zip': 'd8895581050b9fdb5a10dfec3e27910a150b6faf', + 'wiki.az.zip': '2a34c2db872597ba3e345ce8b7db138241f9efbf', + 'wiki.bar.zip': 'd6e40135a6f4ba7a07fab11633034eccb1b05d0a', + 'wiki.bat_smg.zip': '5d08bd04f0515a36723776c0682b3de0f11d4264', + 'wiki.ba.zip': '412ac2f3bf9a605e56e2b0990bb0baed41ddf3b0', + 'wiki.bcl.zip': 'd3717cda357e08390cb57a64e07f5c7b7768d5be', + 'wiki.be.zip': 'b691e63b8080af23cc37f5f2b21b3154e464c425', + 'wiki.bg.zip': '08509a510a95e2a8905c19d83faf40d614d2268b', + 'wiki.bh.zip': 'a812600c6454b779d442b7680e3867e15d895095', + 'wiki.bi.zip': 'd0d4a3f57419424815f77b3951ef9c7336f6adf5', + 'wiki.bjn.zip': '0d81879ff7611380896eac6059bb677a5b3fe308', + 'wiki.bm.zip': 'f3a2a1a8dbc94973a74343c059595a310a66665b', + 'wiki.bn.zip': 
'b3bc70520edf3963c2217873ff5c2537d3545650', + 'wiki.bo.zip': '2be9fe7701d6a8501461df7bd98fee26859cf83a', + 'wiki.bpy.zip': 'd44b9267bb4f86e3e43972a6a952cc0ccf90dd3c', + 'wiki.br.zip': '4bfa66f1ea5aa5cad736eccaa211f6025596bcd6', + 'wiki.bs.zip': '40c560c5994ab50485d08eeaffd88740f30236ab', + 'wiki.bug.zip': 'bc7cd87bb067ac477000259cd4f95f45bfb6e4df', + 'wiki.bxr.zip': '8396fd67ef53f3123540766788a0db54734c4f1a', + 'wiki.ca.zip': '8f5d3caf0f5d223b2771ec44f7e620e396974fb2', + 'wiki.cbk_zam.zip': '0af3be50823b564433455d10c8753df88461458f', + 'wiki.cdo.zip': '19024215aa0c13872c027fc6127b5d7506198b5f', + 'wiki.ceb.zip': '96374428bf36a43983ba4307d7f6fb5ab52a6c6a', + 'wiki.ce.zip': 'b27f1a8da448bc9315e15d4261519c64f00de8eb', + 'wiki.cho.zip': '20944e34c2b58f14adb849dd5a6f5168c7affdea', + 'wiki.chr.zip': 'b7f41ee3fa76e933e0b5ad6b793c507fc19afe98', + 'wiki.chy.zip': '4ef66004a609c724fd7d8aab2877f7634323d43f', + 'wiki.ch.zip': '7f73678b685c9b5f5d6eea9bc00322cfc18d40cb', + 'wiki.ckb.zip': 'b7db2805526ad8bed878af257b32ca9ba814855f', + 'wiki.co.zip': '1b9e19b11763cb87ca00520dbdd6ada565547c9c', + 'wiki.crh.zip': '792003bae25c4471d25721440002c983fa5af020', + 'wiki.cr.zip': '875e4aa0de8a829e57f6c8e13d43cac5103210de', + 'wiki.csb.zip': 'fa776014c4c83487d7cb2485bd08eaf6739d9dca', + 'wiki.cs.zip': 'dca18cb80460522cd281ccc3c9922cf2b3c08b81', + 'wiki.cu.zip': 'ed23b48ba3193181a358d7a73005afa7655a4fc3', + 'wiki.cv.zip': '27ccd50942c9c218e00365ee293fa0c3087a7646', + 'wiki.cy.zip': '78940d5be2969b82c99f785bda2ac5f4e18e149c', + 'wiki.da.zip': 'a45077d9d73328bd6a96efdba1b31ed9a3639dcd', + 'wiki.de.zip': '0d9e4bf80100b46237dcb73cfefe390103e7e827', + 'wiki.diq.zip': '0eef7d9e2f0ce3f100a22dc8fcede9449e466528', + 'wiki.dsb.zip': '903cd80550931effba1d4e52a19c22592837d11c', + 'wiki.dv.zip': '3fa06719641ff33ac8a5439d330a8108521da1e7', + 'wiki.dz.zip': '8bf3937971c3c996493c30b264cb8268627d7bd6', + 'wiki.ee.zip': 'e66bc50013d884fe69f4f67ba44af2e34fe97927', + 'wiki.el.zip': '3015f358036658fb126d42fa794d67a90c5b91ad', + 'wiki.eml.zip': '5be541be6115af5914ac2b8118a09232b771123b', + 'wiki.en.zip': '7f83d578a31a8168423c77ea25ad381494a5e920', + 'wiki.eo.zip': 'e7612df98c37cb872f0edc3c3e21dcd2f80a4d69', + 'wiki.es.zip': '1b7668b23db26810ea433173ce0c11281e801f74', + 'wiki.et.zip': 'aa31004e7b8ebf359e166b8ea6b8e6f77fac190f', + 'wiki.eu.zip': '8d7699451cbac4d69750caa8d58b4740cc72e0ca', + 'wiki.ext.zip': '3aeb4d77c48eb503b26ceb2a76a0a7d841124a71', + 'wiki.fa.zip': '08b6e805c8623fba526143d46f4685549c4380a6', + 'wiki.ff.zip': '64f690eda733a6fb4f794e42eb6ff05f09ec1d38', + 'wiki.fiu_vro.zip': '35c3fdcec0f0dc1ce303212967ea59936641daee', + 'wiki.fi.zip': '252299a2a59cc0ac07ba25f9458afc26bbac669f', + 'wiki.fj.zip': '004d1279c27324d02b961341cf0d6ee06dbe8966', + 'wiki.fo.zip': '12f1d6360d4867cdebcc93be87c024a4709d1af5', + 'wiki.frp.zip': '8a0f636b5440a9aab38014efada9edfdf94150d5', + 'wiki.frr.zip': '7c9e7b8109b98aa39b303dd77d837b37e96d4113', + 'wiki.fr.zip': 'd906e68760153d771e5982009b0150e913254b2d', + 'wiki.fur.zip': 'd5d2ae08696ed074a581eac563a60eb85467a792', + 'wiki.fy.zip': '342609d29882fae0a3b402d8ea1478606be0d93b', + 'wiki.gag.zip': 'f2b91f89dd9b9a1301727476f7823b7260b5f129', + 'wiki.gan.zip': 'd3ad3c1151555266e1feb9f98b066ee31ee5f410', + 'wiki.ga.zip': '798b0c26783c7af05d9c4f899ca9fddafeb1e0a1', + 'wiki.gd.zip': '49085fa182a528bdc51f10e99bef33c88c1e3112', + 'wiki.glk.zip': '9e16727ffcc691483b69ecbcd331b1df2efa4bcd', + 'wiki.gl.zip': 'c71c7e6601b2cbdc7930982fbeea636deddd107d', + 'wiki.gn.zip': '493ccb583211217ccd23e0a43f42ba773bd94f78', + 
'wiki.gom.zip': '45bbd49750ddb7df5afe01fcfd5dda2958934dfa', + 'wiki.got.zip': '669d018f72827fb965e5ef37e224e21f4682b2e5', + 'wiki.gu.zip': '4afe874f7830f693e9f83508fc3fb444b33aebdf', + 'wiki.gv.zip': '9411197eebc07775949d9bb6e440780a68502a5c', + 'wiki.hak.zip': 'cd1e14bd5d50fa764883b148bda5b821375531e0', + 'wiki.haw.zip': 'cacd4eb4e476bdd842e8014764b8ae380b346ed2', + 'wiki.ha.zip': '14acc50950b451f40fe028fd08d042af44732398', + 'wiki.he.zip': 'a9e2cd13bc2e55d83820c529bac1f518a7198bc0', + 'wiki.hif.zip': 'dcdd488239deb0ede807cff263ddc972009c21f5', + 'wiki.hi.zip': '15899ec17985bc0e1db1df497e1b4a51bba1982b', + 'wiki.ho.zip': 'fde454bb4f3841ea5dde2bbf879138305a4d0b36', + 'wiki.hr.zip': 'f5d33ba967f7c56538fa9f5f0093f6d634e9db44', + 'wiki.hsb.zip': '64dc13c7645d2b65b8ba252bd8dfb1c616e8923a', + 'wiki.ht.zip': 'cf50a5cadcf91aba9ab58d095d65f348e2375d12', + 'wiki.hu.zip': 'b27f293caedf81a2d09204b11f52a7c8d7443643', + 'wiki.hy.zip': '641b8666bc2168998989fae1b20a09d3428766bb', + 'wiki.hz.zip': '1639f9f096de6fac84336a784a391ce73e523d62', + 'wiki.ia.zip': '37640aaf8a25c02883190951337b5a6f0157d781', + 'wiki.id.zip': '56ee0c7a38a6d232706932493eaa37b2a87667ee', + 'wiki.ie.zip': '7c3a5d7f96c801570e2305f45a40d401fcc038b9', + 'wiki.ig.zip': '405ebc2e8a959163c9f2f8dd015a0bcefd440111', + 'wiki.ii.zip': '1ec1c7d95d61eeca2dbbd8e432caf88524aaf28e', + 'wiki.ik.zip': 'e9d088c0d8d0ab420d6d0469c6a0fdb668f1833c', + 'wiki.ilo.zip': 'cbc9754978ce55e86da2eb3db20579f4a1f19947', + 'wiki.io.zip': '9e5ab1fd5c4f1094d111f501129e0eecccec69a0', + 'wiki.is.zip': '0744e63636cf794e0a406c922827628a3dd415b7', + 'wiki.it.zip': '29f4eb6a5d7dcf45b02b4d08a4a70dfae4c41200', + 'wiki.iu.zip': 'fb2e8de825d554257768d363a3a09f711afb001b', + 'wiki.jam.zip': '077cfb6de9d025aee4a5b2ea9ce15ada02f10a4f', + 'wiki.ja.zip': '7940f6c2bc490c04902f0faf0562b92cae7136bf', + 'wiki.jbo.zip': '3d086b6c9a369f197516cd0dc699a94612f45c6a', + 'wiki.jv.zip': '2f68cb3436b27a25ddfa40fab3e2cd44574b437e', + 'wiki.kaa.zip': '9fd5df362b7cb615f2267084d8b3fb8608be2693', + 'wiki.kab.zip': '96abf1440ad21de58d7274d3a16885ef4a2efda4', + 'wiki.ka.zip': '72ddb2382c87184fc05a93e89ed8aa4f54a62a0a', + 'wiki.kbd.zip': '81dfc3c6f8581c2aa15342c84688b4ba59b81cc6', + 'wiki.kg.zip': '4d07cabef6f804fc6432d3f630675ed4cbbdd49e', + 'wiki.ki.zip': '59b5c31df227ff9454ad8b3a1d16b065620dbddf', + 'wiki.kj.zip': '751b80c4a4d82dd217d3d2b3905eb39b349874d7', + 'wiki.kk.zip': '7fb733a2405f421a7c49b756381a52965a8af205', + 'wiki.kl.zip': '05a9d5c9bf12d8845356f88b546418d2e40f79c6', + 'wiki.km.zip': 'da0a67028fa0244a2e7257ae259c2f7a7544dc66', + 'wiki.kn.zip': '6cead946350b31fb2f353085fd00b8ea9c9ecc77', + 'wiki.koi.zip': '0c61f83434404267527eaf583e89b4d8bb3a6a65', + 'wiki.ko.zip': 'c0825282faf1e7af6820bd8b28d06c77760dcbe4', + 'wiki.krc.zip': '0df3c3f0f89521299dab741be3d698b2c94c194e', + 'wiki.kr.zip': '71651f046cef420fb28ca15e35720bb7747c4586', + 'wiki.ksh.zip': '8b9ab88baa49e72e40a5a80bef98f3ea2afbdd07', + 'wiki.ks.zip': '02af37f12753662c9e7bcac3b8786dfd2f298710', + 'wiki.ku.zip': 'ca1d370b327ceca025884bf83139456024a3a978', + 'wiki.kv.zip': '28b3617c5566f3182f14bf11a906456b227840ba', + 'wiki.kw.zip': '075a02e8eaae26897c23898fb4d36f4e41e4d1d0', + 'wiki.ky.zip': '771601a934cd4d0a98e5059f6389d2496e8dcf7c', + 'wiki.lad.zip': '2788ba3f275d72299e877c96cde106bd8590f405', + 'wiki.la.zip': '759f6365874442ab8e04d992b047f53ad74231a6', + 'wiki.lbe.zip': 'c8105f1cf8a3d46ccfacff1d40a581f442b3c4a1', + 'wiki.lb.zip': 'dac5af52364f2c0d3a0c794411465d1254f2fb48', + 'wiki.lez.zip': '17331cb779dee8cb60f2734213af80d57acfcfad', 
+ 'wiki.lg.zip': 'fd4e2d67d1f098474053abc9a1984dfe4a2854b7', + 'wiki.lij.zip': 'c29157f5e4d2b37c01cf6e389f03ddafef6acdb2', + 'wiki.li.zip': '10490e49a12230af2127543da69c427f92c6508f', + 'wiki.lmo.zip': 'cc44163572deddd78af6b006394f623cb21934fc', + 'wiki.ln.zip': 'bf52699c5cbf79bedb2e2856d8a720189b6864f3', + 'wiki.lo.zip': '3fd8a70d8e26071a365f10016875a4a4f15ffcee', + 'wiki.lrc.zip': 'e262b4fcc55cba48d997cd06d006b82a5abe09a9', + 'wiki.ltg.zip': 'df6a83f2fab35f9a2f97fd8d857cb1cfa59f331f', + 'wiki.lt.zip': 'a738a3f29a6a5481082a7a9a41b2040b9cf537e4', + 'wiki.lv.zip': '8e328d99aacaa021fcc51425caebc063e22e6cf4', + 'wiki.mai.zip': 'e909de86c27eced2cb5f02f550da7fc2502b5eda', + 'wiki.map_bms.zip': '192bf6b88f955746abb398893868482730585e3a', + 'wiki.mdf.zip': '3d0d5da3c85bef8ae52f0fd17e314a1960a26d36', + 'wiki.mg.zip': 'fe66055b63ce8771bf43f8dd543bbd967f8ea8b3', + 'wiki.mhr.zip': '33514c98da3bd9602851db96fa3dd8192aac0674', + 'wiki.mh.zip': 'dc77309103c6cfed7ff095b3f9f158e1ae437e71', + 'wiki.min.zip': '8b925eea6df0411ee09baef5801d807cfec8cfa4', + 'wiki.mi.zip': 'd57831e8d7cb2ec260fc9d83d4281f0bacfb29a5', + 'wiki.mk.zip': 'b1fc2d85527e99530a93e3bbc5fa9fcde89910f3', + 'wiki.ml.zip': 'b9d53b8e76a05f5e959afd190da3015b36793297', + 'wiki.mn.zip': '715bf0ee67b48ec872659380fcf63ad006ddcc7e', + 'wiki.mo.zip': 'fb273fe373eb61310051d94ad6911320f573d0ec', + 'wiki.mrj.zip': 'b0d1e43e37e1718c8e05fd81a511095636def361', + 'wiki.mr.zip': '67e942a7742cc957298c8cd0cd0af0531dc936d7', + 'wiki.ms.zip': 'e218f113702b039fc8e80a77b894cd9fa4eff77d', + 'wiki.mt.zip': 'd68d5b636eac07b2e1307186c2c05b9a80e39658', + 'wiki.multi.ar.zip': '31c7b742c63c3367e9bce5c4dca37d5ceb33f1a6', + 'wiki.multi.bg.zip': '8991e8123bce7fd6c8e4510c71ede5715ae36f01', + 'wiki.multi.ca.zip': '0786e071438150485d394a4bf2e976d3a1b313ff', + 'wiki.multi.cs.zip': '7237f291146e69f0fc7002a0e175c7fd003d44e8', + 'wiki.multi.da.zip': '5591c20015191101aee190c02738c99073a8fe76', + 'wiki.multi.de.zip': '986160e51a08f4a93f1573d17352e375cbaedd6d', + 'wiki.multi.el.zip': '570eb12811ce61f6176f263eff3e945be69e7da0', + 'wiki.multi.en.zip': '2c3ef35d8338d4a905e7d10645572ab7a6730d44', + 'wiki.multi.es.zip': 'c1db7c7175665a7230f92ed038b78de780e060e9', + 'wiki.multi.et.zip': '54d0515865c754331b445dd9ba0ae7ed79b770aa', + 'wiki.multi.fi.zip': 'c94abc803a42b89cd75b278114b1f2cf4e2f3ecd', + 'wiki.multi.fr.zip': 'd4904b79eaf8ae386a7011ad84afc9b4238c9928', + 'wiki.multi.he.zip': '370ec2a379eecc2d2e984cde3e0f6d0a027eade7', + 'wiki.multi.hr.zip': 'd3f25ae76b040ffa09e964f6edc55488f6086394', + 'wiki.multi.hu.zip': '4b64bcdf0fc1f01bbd8427bd7bf6b46319308e7a', + 'wiki.multi.id.zip': '3ad5f590d5c847b35a334f1bdb48b9c466f5de68', + 'wiki.multi.it.zip': '18746450e665e96c33f2e2026986f643a27e0945', + 'wiki.multi.mk.zip': '1d899f1449d8729b7dbae226f05151a656694626', + 'wiki.multi.nl.zip': 'ff0a04dbb07c2cdbc61d5a241175e30ed46b48d4', + 'wiki.multi.no.zip': 'd1af729024181e64f58ae37ab233fc53811e2601', + 'wiki.multi.pl.zip': '91c3984c4f3158b1cb1ff11d8cc4f9240631266e', + 'wiki.multi.pt.zip': 'a1782c4fa4337008f82c0e2bf78e4323d145be29', + 'wiki.multi.ro.zip': 'b1a0840d084009ce00c47a3c24c984648dbe8785', + 'wiki.multi.ru.zip': '540607ba4334dab6089de463f974861aac8a35ae', + 'wiki.multi.sk.zip': '2a2bb39e011cf2bf6dcb8cb6c482b8eb9764eea3', + 'wiki.multi.sl.zip': '99442dab442dc196c107868db9174c78e270db1e', + 'wiki.multi.sv.zip': 'b40be83d2d7c27633c712aea62ceec0d409cc03a', + 'wiki.multi.tr.zip': 'e2bffab1616f54d180ba3d8bfe5e94ec9a489184', + 'wiki.multi.uk.zip': 'e97f64d9ba2b58a5e80c9b896b87340aba1e0eb0', + 
'wiki.multi.vi.zip': '532fa24d8787a8906fb04a88e74a713b00cb33ec', + 'wiki.mus.zip': '1bb0cad10889b8a3bfa36c36c7da1f2fb2237bb8', + 'wiki.mwl.zip': 'e3d1fd1fa6290521d403e84eba577e552e330844', + 'wiki.myv.zip': '64a6505691441778766b7941b5e7f45a624a64a5', + 'wiki.my.zip': '491ce8dbf174d4abff758db4950f49eda90883d9', + 'wiki.mzn.zip': '76abf410749fd4516ead20ced891b54245fcd4a3', + 'wiki.nah.zip': '0496592cdd70eaf61b257fb5345843d38f425592', + 'wiki.nap.zip': 'f0df66cdbef5734f0afeb806cda631722fb426d8', + 'wiki.na.zip': '2456e4776b5e985cfaedfac244e0b40cff4e613c', + 'wiki.nds_nl.zip': 'ffd10e05b749281634eb7a758102d8d6ff42760e', + 'wiki.nds.zip': '2455e9fa4294828b25b32bdad7307a105f9fbe1d', + 'wiki-news-300d-1M-subword.zip': '697f4c8f37443be3aee7b96abe28fd7ebec95ef3', + 'wiki-news-300d-1M.zip': '567ef9c2e207be25da23e61312e6ba620da30466', + 'wiki.new.zip': 'a781885678cc1079d4be221c414339eb9bee8d19', + 'wiki.ne.zip': '180b068343288cda40d012aaa99d29459d341eb4', + 'wiki.ng.zip': '6db8111ab700f7b0841af87f1f1453341048014e', + 'wiki.nl.zip': '582420f290947cf38503b7f4b8ea9bb21918005e', + 'wiki.nn.zip': '4a0e30376b361ee19800e6d897a865572e330f84', + 'wiki.nov.zip': 'ac98c0300302019ff855698561708abd81730db3', + 'wiki.no.zip': '6893a7912ab3756e31d09ef1f9023c27c0b047f8', + 'wiki.nrm.zip': 'bd27aadf25a165ebbac486437ea6a06b710fdda6', + 'wiki.nso.zip': 'c55dfebb83351c952831db34e779e0a380212f05', + 'wiki.nv.zip': 'cf122e5ee041287917c594a2cb6cd247978f1ec0', + 'wiki.ny.zip': '9086021a60babd7e87afa469dbadb004523f5fd2', + 'wiki.oc.zip': '15075544cf837135127d8688cd06fb8e4c8b7f3d', + 'wiki.olo.zip': '523628bb652e1563b4dd5a94b518addf10699f74', + 'wiki.om.zip': 'a29360ab3930d889c4eb5b385589f84c1ff9f06e', + 'wiki.or.zip': 'a782e649ae5307dece445b0c11b15ffb9ce88297', + 'wiki.os.zip': '0d76ca005afd48b87dea5c9784c4c48bb51d3e3e', + 'wiki.pag.zip': 'b046ef71badc9d7eec161e3aec2ffc3abb7bad20', + 'wiki.pam.zip': 'abed25ef407e05209f2653d571bba5bc7c66e7b3', + 'wiki.pap.zip': '5d099bfc65c85f824634a191ce33e8e42f947ded', + 'wiki.pa.zip': '2066ed0016720b9f8779f55f2cc2de08511025f6', + 'wiki.pcd.zip': '66914c99e5531c0484448b84568971362cdad0f6', + 'wiki.pdc.zip': '6ed181fa1f8782917ae7849490c0a5cb0b0b9b29', + 'wiki.pfl.zip': '8d271226af8509962b15a96c4d6e41d9aabd972c', + 'wiki.pih.zip': '365955dbecb17027435fe487ab92a7a267fa25bd', + 'wiki.pi.zip': 'eeb863545392c92cff0f3e3d9c3f61539d3fa1dd', + 'wiki.pl.zip': '2b0cae8af2637bc24b958e6757149d1b9f8c8fea', + 'wiki.pms.zip': '9eff2e96e1cb9bf02adf816c4feb5aa3cd1a384f', + 'wiki.pnb.zip': '23f77d1d9469f5b2c342984288cb3092d53d8dee', + 'wiki.pnt.zip': '84cc9532d2fd7b322bcba91e01ac36c9a719e23a', + 'wiki.ps.zip': '18c9ffb2a81cbc25299b26e35170a29b7de9309c', + 'wiki.pt.zip': '37752109a44829de5ea10b173d7c0cecc0b1a0d7', + 'wiki.qu.zip': '5582c07eeeaec10d9382b3ab90d2921fc97fa2e0', + 'wiki.rmy.zip': 'a106ab536001e92e7a9708417faee9418f4058d0', + 'wiki.rm.zip': '67a324941f2b895a418fbd89314a18bfda19b1de', + 'wiki.rn.zip': 'ce17294909c046e90bb0131632e1d795d1771816', + 'wiki.roa_rup.zip': 'a9a378e90cd46353283c92cfb7d34dd485a018d2', + 'wiki.roa_tara.zip': '953fe4cf1667cbb9b3b8e11666885bfedf74b411', + 'wiki.ro.zip': '6bbb0f9452398416d9183e00e6cd091a02fb351f', + 'wiki.rue.zip': 'e9f9b8ab63c7722b4b68e8c465b1c69436132553', + 'wiki.ru.zip': 'f8f68aa5792941d7750b545e56f1ff5127e88cc2', + 'wiki.rw.zip': '018b9fb76fca5ce7a3e1f266df33fcc1bbc50493', + 'wiki.sah.zip': 'f6c94dbd3b719b154217388310fab72e5a69f823', + 'wiki.sa.zip': '4dc78b48d651056546d14b659c6598770c6bce77', + 'wiki.scn.zip': '218ba35c042cb3e179988bac9acf51cccf37422b', + 
'wiki.sco.zip': 'daa8cedbb223e87d48f720aed9ce63dd0c81c632', + 'wiki.sc.zip': '909cc5160cad60fda34ab89c2b87ae4229402eeb', + 'wiki.sd.zip': '5468ed141bf2f1d9b1f8d7b31fee926b496ea9db', + 'wiki.se.zip': '0eb962f8768d88ffcbde3aac833e134a263c2055', + 'wiki.sg.zip': '651035aa74dc2f515253444f48aa9911094f9d27', + 'wiki.sh.zip': 'cf3057b61bd5bca6f47640801681d451aee210cf', + 'wiki.simple.zip': '367737535e39defb0e713a7ff2374cb932c5a9bc', + 'wiki.si.zip': 'cebb2f4011b0d679fe856c5950076e3c48496ecc', + 'wiki.sk.zip': '6c43758d0c0f52351210c558cc33266a65709068', + 'wiki.sl.zip': 'd0239eefc830e5919bef8d9173a884e9e7371e7a', + 'wiki.sm.zip': '2e3cf33f17b449c8f81cc9ea4c84d542cfd23a14', + 'wiki.sn.zip': '4d3844ee350ee0065e5fe910a3f669ef863a2fc9', + 'wiki.so.zip': '9da45db9b21d1f27c4f73152539c1e4fc9b1c49c', + 'wiki.sq.zip': '0db976ec147df49e648cf8256562371d0ae6f2f0', + 'wiki.srn.zip': '120e229d522cc22008c50e0eb74b23d9f6eca51d', + 'wiki.sr.zip': '63b67391158bdd7a642f7d8412771c22e1041744', + 'wiki.ss.zip': '4368f7931f6730a6e8cb9b5794906f2d827582a8', + 'wiki.stq.zip': 'fb1ba577bf6fb7f7fcdc52bf392e63ed8492465d', + 'wiki.st.zip': 'b7e96392b3880c19e210fd42bc72e3f76c07a4c3', + 'wiki.su.zip': '4c4880cfca1ff954c88e44a32f201218eb2be146', + 'wiki.sv.zip': 'e2b10091585f795dd18289c4a65a1da591a78196', + 'wiki.sw.zip': '726631d8998ba1647d040e6b70f4bad7b8d8c367', + 'wiki.szl.zip': 'a70de974cff95cad0443f5faa6c8412c92998100', + 'wiki.ta.zip': '6bafd0bb523f654038393ba191012527745b940b', + 'wiki.tcy.zip': 'b4bd573eaf9fd87300a25648b38a053161d12c39', + 'wiki.tet.zip': '7e5608958977164e544850a5a169f5d55cd47a20', + 'wiki.te.zip': '948e5a6ec13ac95b595c3f52a6e7b9642a56c530', + 'wiki.tg.zip': '5b46429024d6819f6b511a4924b90c958615d40e', + 'wiki.th.zip': 'b8ee0878cec41b4ab1055a17d0ed669de1ed9afd', + 'wiki.ti.zip': 'd55abb74bb3ff195d2293ee9e77886111ee50e52', + 'wiki.tk.zip': '20263f39a31a1d55343f9dea7aecaa2860aefde8', + 'wiki.tl.zip': '2f2b809017249f8c4f8d5eb62979b58f16e8732b', + 'wiki.tn.zip': '0aa11b07b1ad6437bc1e9b6476d51ddd35dad994', + 'wiki.to.zip': '6b90b32ae258a56e67b42736675236b91163b3ad', + 'wiki.tpi.zip': 'ca9591e621ae667a1521d0bb5275435d45e974cc', + 'wiki.tr.zip': '3b6f86c2a115c7adec1b073b1f5624890e680148', + 'wiki.ts.zip': '8a00b16f2881977ad6f8c8665316c27fcab9b842', + 'wiki.tt.zip': '8d2f559bf1e09180d6dc4b127d61815a27670a20', + 'wiki.tum.zip': '5b3f6f3d8cae4d9534cd1fd3afc2f64ec8342b8d', + 'wiki.tw.zip': '7c189fabfcdb2973178c25d35fd10e46ee7148aa', + 'wiki.tyv.zip': '5e3811a19bbf961a5361ac37ff3502287c9ab022', + 'wiki.ty.zip': 'a7f31f8cabf4282533773aa7e63f294315cc85ea', + 'wiki.udm.zip': '643df5ab0914535e46e6839845d0ab585c81a119', + 'wiki.ug.zip': 'a5388269893ac4c7da28b2284f3536ca0f3c9341', + 'wiki.uk.zip': 'fdc9b0a0ab806e5845e9d89b8887ec9d555a0547', + 'wiki.ur.zip': '75579eb5609ea31d79bc2d1bd81d01f48e01bc7c', + 'wiki.uz.zip': 'aa149200f8c6e3e8bb5aa3c67112675d136900b8', + 'wiki.vec.zip': '58c4c9528154e256fbefeb97b8c1675356079f74', + 'wiki.vep.zip': '966b371afcc383058a5fbc6ee8f822620f03feac', + 'wiki.ve.zip': '6450e3ec2c78980c5a41d71ff159aa27918dda75', + 'wiki.vi.zip': 'bfa287fbb358a66b4f9576585df3e46607e1595c', + 'wiki.vls.zip': '7335bfda43890f42e045b8a5de25d1a8629fe012', + 'wiki.vo.zip': 'c2ca18bea165cb1253c1d88fa9958a25088fc84b', + 'wiki.war.zip': '5cda8fdd64e3acf5488ad361b68a63fb23747559', + 'wiki.wa.zip': '2e538c10a0e9f43ea5875c90a8ce01a07c4695a7', + 'wiki.wo.zip': 'f54c65ab63f98ffec7b3fb5bdd51a814034bd673', + 'wiki.wuu.zip': '68d9ad802836737392d62056231bf1b7a58594c9', + 'wiki.xal.zip': 
'fb39fed41ccba2e4e58ab7714a53aae3695dbe04', + 'wiki.xh.zip': 'd37caa4d94e66588879231d0826798d8aa4b0a44', + 'wiki.xmf.zip': '956c43bca0d88e9348099cde43d58898e43d9f27', + 'wiki.yi.zip': '151c1670c48e976e4202272b066d7080a8c83615', + 'wiki.yo.zip': 'fdbd0fc6e35bb04c3aef1fa6f0262ba261b11199', + 'wiki.za.zip': '11f6a5dcb49c4d0571d5ac4fb3d7dda1d378fc06', + 'wiki.zea.zip': '22159a722c5c0390bad9206eb75e6e166efe38e9', + 'wiki.zh_classical.zip': 'c689d61d2254caf1ecec0909249523b09a737717', + 'wiki.zh_min_nan.zip': '0516a413565484d924a4c8b50c690d39344cdb64', + 'wiki.zh_yue.zip': '464f4c1c2039194cbae7502ed3a2eeff4df9e34f', + 'wiki.zh.zip': '2374ec566f6411b9bb570077636695fe9768a5ba', + 'wiki.zu.zip': 'a6d0325dab37cd551e6d7f6c783dd13f4c71db2f'} + +FAST_TEXT_FILE_SHA1 = \ + {'crawl-300d-2M.vec': '9b556504d099a6c01f3dd76b88775d02cb2f1946', + 'wiki.aa.vec': '5cce30fc85471572c498f278bbe495184577363e', + 'wiki.ab.vec': '9d89a403a9a866d3da8dd8cfab849f59ee499343', + 'wiki.ace.vec': '85d00074f7a08626f39da6a0c8a5cfa250096ab9', + 'wiki.ady.vec': '9d17d74f0348224cdebf8a831e61af0825f8952d', + 'wiki.af.vec': '999e64bcd8dab8de42cb1feceeca360def35324d', + 'wiki.ak.vec': '6092b8af335c2dc93e8df2bbf1d715f01e637bb4', + 'wiki.als.vec': '96052e96870695cca50857b5fde5f9f42219139a', + 'wiki.am.vec': 'dff7fcdd8f5ba0638ab9e1758a89800766156d72', + 'wiki.ang.vec': 'a7c30e02422d97d23a0701279c5c1c03159130a5', + 'wiki.an.vec': '5b4c2b1de5c04e4e0be83841410ca84c47305d21', + 'wiki.arc.vec': 'fd3ad743103f80cde9cfc048d7ca509e50efb35a', + 'wiki.ar.vec': 'c46e2142f799cc385bd25f0c0a8943ca565505a4', + 'wiki.arz.vec': '5e904087043b91f4945dd708f4230fdf51360132', + 'wiki.ast.vec': '89a90357101953b7c292697fd050c00fe5c38ac5', + 'wiki.as.vec': 'cad5883b5147cbe6cdbf604f65cabdb675a59258', + 'wiki.av.vec': '99976a63ca8c4231f808fd4314f0433db35e290d', + 'wiki.ay.vec': 'be359dad25b2c742d3abfa94c5f5db13f86c730e', + 'wiki.azb.vec': 'e23af0a436b97434813c3cb14ed114cc5b352faa', + 'wiki.az.vec': '9581d55d9056ad398a153c37b502f3a07867d091', + 'wiki.bar.vec': '96130f1f2e5bffdd06c202ad4472e5234020980a', + 'wiki.bat_smg.vec': 'cb3aef58da2011183b39fca64cabf3d9d7a62f4b', + 'wiki.ba.vec': '22147ee16b2d163cc88d09a035264fd0c10dab68', + 'wiki.bcl.vec': 'd4117b5c443438ddfa608b10a5be2c2501817e7e', + 'wiki.be.vec': '6cf81322cd7b046a7f02ec4c4960ad27045383fa', + 'wiki.bg.vec': '7c1cc6d0c52b038e4b7173259b0c009f242cf486', + 'wiki.bh.vec': 'ab2d29017afa015c49566a6d9bf75393c23ac4c0', + 'wiki.bi.vec': '15785220cd6e6c86cc87e7d3f3322a5541a4fe5d', + 'wiki.bjn.vec': '5f134cf288e8042dcd048a3ee76159aab42c7288', + 'wiki.bm.vec': 'f36a19c95e90865f6518d4487e59f363b47bd865', + 'wiki.bn.vec': '6fc3bfd9af455719f55bee0bea31b11afc70cf06', + 'wiki.bo.vec': '2e9358e03dcfa09da23d2e1499d84b10348fd8a9', + 'wiki.bpy.vec': 'c2bb15487c4bdb8fa869772694300ae1fee73896', + 'wiki.br.vec': 'df44e16abd2017e2a1b6c6588ee02779b19907f6', + 'wiki.bs.vec': 'c4943a290819ceae1611dd11179b40aab0df0471', + 'wiki.bug.vec': '942d8f7dadde5faa33aa72862501434f48e29f60', + 'wiki.bxr.vec': 'eaf767690c6b194605ae778719212e3874873d4c', + 'wiki.ca.vec': 'f5971edee11c939f6a7accfd33a9a45caa54141a', + 'wiki.cbk_zam.vec': '6fef47b4559eec402ce371de20dfb018acd6347d', + 'wiki.cdo.vec': '95e8196bf76323dbabab1b8a49ba4d677af3ccea', + 'wiki.ceb.vec': 'b8516a55537b8f80c927d77d95cdf7e4ff849a05', + 'wiki.ce.vec': '1d94b0168a773895b23889f7f07d7cf56c11a360', + 'wiki.cho.vec': 'cec6778f025fa9ae4134046c6c3a6291bd9c63f9', + 'wiki.chr.vec': '8501bf86b41074ed6c8d15b9209ef7ce83122e70', + 'wiki.ch.vec': '46803f3a1734f6a7b0d8cb053bbb86a6915d02e9', + 
'wiki.chy.vec': '26c87688551ffe3a0c7a5952e894306651e62131', + 'wiki.ckb.vec': 'adb2fef309f1d93f429442b9c16c1564192c58f3', + 'wiki.co.vec': 'af876a918594e5541207bc12f17bfc4268df7b93', + 'wiki.crh.vec': 'c0d2310a1207fcacc94b25b149420b33bf835015', + 'wiki.cr.vec': '61dd9f044b7dfa56dcf1c3c07c7504c569420528', + 'wiki.csb.vec': '649cb2692f08414987c875dc331022567d367497', + 'wiki.cs.vec': 'f3ec1502aeee6a550d8cf784273fa62f61419a4e', + 'wiki.cu.vec': 'ddadb14ea00ea1dda716ee33732497ec049b526f', + 'wiki.cv.vec': '9cdb0bee5a0fea030def85597dba7108f21b0424', + 'wiki.cy.vec': '32d976a9bfc4dd6e39328c906eead0f597bd9e25', + 'wiki.da.vec': '526947dab1ffbc1465c7a766f2bca4de50676b08', + 'wiki.de.vec': '2ed2696afe55f023b0040b238d9a47e5fedfe48b', + 'wiki.diq.vec': '77f3c370d1d77806fafe368cf788af550ff607dd', + 'wiki.dsb.vec': 'e49a647a441fbf011ac5411dd6005e8725b9a65d', + 'wiki.dv.vec': 'e135ba97c711a021bc3317db2b95db5212c17658', + 'wiki.dz.vec': '24888f0b2cd156360bfb5e9e905240163ba798d8', + 'wiki.ee.vec': 'afd1670655daa7ffba51187a415fdd0b43f1d487', + 'wiki.el.vec': '6f034271390feaa6f9d7d16f933ddef637755979', + 'wiki.eml.vec': 'de6be7a2ffdda226eec730dd54b4c614bd7f5dca', + 'wiki.en.vec': 'c1e418f144ceb332b4328d27addf508731fa87df', + 'wiki.eo.vec': 'b56998fd69f66755b722a9481a9bdaf10f62c9aa', + 'wiki.es.vec': '2f41401aa0925167176bcd7a6770423d891dfef5', + 'wiki.et.vec': '64d56b66c02d5e49b1b66a85854d67d2dd9ebd41', + 'wiki.eu.vec': '5e72f4ef93666971fea5d2180b354e0a0821ba91', + 'wiki.ext.vec': '456c5632b13a0f136cd180ebe2dda67b83f78397', + 'wiki.fa.vec': '09b6cc685c895c66b853af9617787d3ab0891e2c', + 'wiki.ff.vec': '12b09d695f5fb8de4b5da9d36a73eb178b293a04', + 'wiki.fiu_vro.vec': '168a71a2b1c478e6810fa5dce9612d8bf8a273dc', + 'wiki.fi.vec': '91d19baae994d7e556b5b5938be2dc6013f9c706', + 'wiki.fj.vec': '36d36dc14001a109926bfc633594f6a2f7401697', + 'wiki.fo.vec': 'eead8ddc7bb74b12b16784723abf802bb51f844d', + 'wiki.frp.vec': '0eb70a613ccf807c7308c1f62535f0606465029d', + 'wiki.frr.vec': 'cde62af939cb2de35e341cef2c74813802a58ed4', + 'wiki.fr.vec': 'b092229005a65d8683a4112852fe6eb8161a6917', + 'wiki.fur.vec': 'd4a595cffa1abcdcf4229ba15277179ce5d20bc6', + 'wiki.fy.vec': 'd4beef537b7ff142a3986513879ff51a9ec14a7b', + 'wiki.gag.vec': 'c82ec7a5d081f0673661824f4fc34345dee255f0', + 'wiki.gan.vec': '7e53a33b7bd5b0360ea4cb452145616c09445029', + 'wiki.ga.vec': 'caaa5b2167a499893313ac1aa38416a6a0fe9a24', + 'wiki.gd.vec': 'f4b513598a1bf0f0d5b6521ea8ce363e9596cb97', + 'wiki.glk.vec': '20a7759075916e10531f5b3577302353cef565cd', + 'wiki.gl.vec': '8888bb8f3d70b36729b9ae479fe3765e0c083862', + 'wiki.gn.vec': '98594af7897c5a1f35885ddecc77556a7e7ae981', + 'wiki.gom.vec': '5a1193d9e5d49d06354c14e2b7c01bea176e13f1', + 'wiki.got.vec': 'dfa06de83a0e3099027c57b84561d7d990ea8310', + 'wiki.gu.vec': 'f9e13452eb63d92bea44c7c3db8fba9945c7000e', + 'wiki.gv.vec': '993a7ee31bdacc91763dad656aa6c2947b873473', + 'wiki.hak.vec': '9e83512d34c7f81739492bf0abbb25ff1ef88573', + 'wiki.ha.vec': '677a24efeeb1bcb8c0a931407775f18b18e875ae', + 'wiki.haw.vec': '58fea5aa1b37723797d26fb3d050ce6176757240', + 'wiki.he.vec': '55534560247394669e3f5c169136770c93bc2708', + 'wiki.hif.vec': '49697cf784814d3f1a47559724028e0fc0940d36', + 'wiki.hi.vec': '8049bb8604bc049d48bd934e27b0e184c480a413', + 'wiki.ho.vec': '9c75a09e099213aa8cd1f1020b223427537cbdd8', + 'wiki.hr.vec': '0c96f9af092cf8a84b03aec1426cd23921671489', + 'wiki.hsb.vec': '3dc7830544c58535bed308c552d609e13b973502', + 'wiki.ht.vec': '5039dfb58a074ac046813f2dae81159be8c5213f', + 'wiki.hu.vec': 
'cd777e9efca3d4bd97c89f01690cfa4840d9c46f', + 'wiki.hy.vec': '21f9259d04cfd22db446a45d3622af225f00cf20', + 'wiki.hz.vec': '2a94b1390d68027748a05169fbc0c11a9a183456', + 'wiki.ia.vec': '2a348dc924638efc20c34785852b0837364aed76', + 'wiki.id.vec': 'c49d5c9bec89114599427f6c12a5bda2e5523dfd', + 'wiki.ie.vec': '01b0d11c0e7397418e73853d220e97bdcf7a8961', + 'wiki.ig.vec': 'd2d1643b4fb1a18a4d002cf2969073f7f201b3b2', + 'wiki.ii.vec': '41c6cd68b3ebe4ece2a06c37b06dca5d07c9fb3a', + 'wiki.ik.vec': 'af31cbec7b839f50fa70553ec63c58f7067d3ea8', + 'wiki.ilo.vec': 'c0e43835a3f4e0033ea5d7c6ff189982b2f26a05', + 'wiki.io.vec': 'af0c480c5872bff31d82e767c1116da2a6be0c00', + 'wiki.is.vec': 'ae0b018f92b3e218f2dacb2045a8f0a0446788a5', + 'wiki.it.vec': 'ac4a985e85ffae48047034e2603d804bf126caa9', + 'wiki.iu.vec': '5d51b2ba215005216ae003f4a6d6ef39fb30ca2e', + 'wiki.jam.vec': '6d51e384c56330097c2531fdbf4e74418909e388', + 'wiki.ja.vec': '7a2b1af1e46d795410692a002e40fa3085135f69', + 'wiki.jbo.vec': 'c90481946aa4b6b304528292612ae620f6549f3e', + 'wiki.jv.vec': '2ff7927d3ff04b8208133497b3778ede00ea463f', + 'wiki.kaa.vec': 'd990d3b9bd511d2d630f923099a6b9110231b2ed', + 'wiki.kab.vec': 'e3b73d41267d8d4cd42f6cc5a0c05dc4e021bf74', + 'wiki.ka.vec': '8b92b73f27f9b77818211e053a33985589de7c62', + 'wiki.kbd.vec': 'f5b8dbe47a7fae702232b5680b070ef6e865539e', + 'wiki.kg.vec': '1550647b6059e6eb649b100e31c53bd0661117b2', + 'wiki.ki.vec': 'c4e373e2ea13f7fa1e95b0733365e4b3fc8b2cc8', + 'wiki.kj.vec': 'c27e563683f9c96ff6f680a6d6bb9e9e2f9960d0', + 'wiki.kk.vec': '6343b2b31bad2e13d03a110b91c38fab4adc01cd', + 'wiki.kl.vec': 'e5def7fb1b56c5956b6e951e912d53ba0ff089f8', + 'wiki.km.vec': '64f7fff1df90b1f7241b232e901f76223a3719e0', + 'wiki.kn.vec': '32763f4f860f0d081f3aabf3e7d17b7858e7d877', + 'wiki.koi.vec': '4001f0617fe0fdd3b22116b304f497b7b16c6e4c', + 'wiki.ko.vec': '042c85a788c2778cca538cf716b8a78f0d7fa823', + 'wiki.krc.vec': '0c6ef043d51e5f337a309804f1db180fa0bb2cb8', + 'wiki.kr.vec': '25d5b4d5911a819c48328c48fb346417d07d4070', + 'wiki.ksh.vec': '4c3bb4f12073532b6fb7cc6c2be5e53319ef5b65', + 'wiki.ks.vec': '5056a87c4ee2d8bf0792436fc6b2b61648014de9', + 'wiki.ku.vec': '4d3a2401527dd9ba6be2b0cd31f6cd3edebadce9', + 'wiki.kv.vec': '164dc44d701b9d606a45f0b0446076adc3858dca', + 'wiki.kw.vec': 'f9eaa35a7e4f077f6de85c7801f74582f91b52c1', + 'wiki.ky.vec': '13b0ae3f23822317a0243bd9182105c631c834b3', + 'wiki.lad.vec': 'c510e520cde97050bf1cbeb36f2b90e6348ceed4', + 'wiki.la.vec': '9ea6286a0581084533db8d6ee96e0b7d15166543', + 'wiki.lbe.vec': '283619d93255571f14fd4545bb0577979171b990', + 'wiki.lb.vec': 'b146f23628c84e64314a35a5b6cc65a33777e22d', + 'wiki.lez.vec': '8e579b984a500ad89fc66767bfd7319766bd669b', + 'wiki.lg.vec': 'b096f5248dfbb343dc4696c97ea253510e1c4ef9', + 'wiki.lij.vec': '4ff5bb405c820e4119f0636efc301da15a08c00a', + 'wiki.li.vec': '0fb9ec4ac93676d8ef651692062bc3d7f6ae0843', + 'wiki.lmo.vec': 'a89414d9ceee4823622258f18936f67faf7e06e7', + 'wiki.ln.vec': '70b6a286b42958e25cb80824e0d8f1aee2de6dde', + 'wiki.lo.vec': '7c83f82b80c49b8eab21f62ecdb3681b8bda40a6', + 'wiki.lrc.vec': 'c1ae4fb79a19d44bfe8f601f0a30fbec841fa612', + 'wiki.ltg.vec': 'ec2f13d1290bd54afcaa74569e66e43e9bfef264', + 'wiki.lt.vec': '58d3ebef24e5e31be1a8318b45c08ebb16ad775a', + 'wiki.lv.vec': 'ef6b549f96e22718f513d47a611d3d6bc001a164', + 'wiki.mai.vec': '7f513ff36e485b19f91f83b30c32dd82e9e497f6', + 'wiki.map_bms.vec': 'e7deab5fdd38fa3331b1bcb4a16432b38c512e21', + 'wiki.mdf.vec': 'b16099ce0283a241339716eac41cfd99fdea7f36', + 'wiki.mg.vec': '0808252740909d6129f672584311263e7b2adadc', + 
'wiki.mhr.vec': '39f62e292336cabc364f0d1913540b881b406393', + 'wiki.mh.vec': '7d2d8bff722fe0a5d869d9da11792a406aff3dc3', + 'wiki.min.vec': '3bb0fa596cf27a1d165c55684bebdc8d40cb8ad7', + 'wiki.mi.vec': 'e8acf9c7c2ab840a192c563aa776201a88e4ca89', + 'wiki.mk.vec': '85a3d3f13fa88ffde023d2326c65bdded4983dff', + 'wiki.ml.vec': '2b70fe76e8cf199a18551de782784a21e8db0b66', + 'wiki.mn.vec': '7cef7ecdf9d98484d9b598b25d0e717dba6acfd9', + 'wiki.mo.vec': 'cc54b661aefabdf516b49d24acb51273b3acf210', + 'wiki.mrj.vec': 'aa1c1ecba1ffd6b42c8d9659a8a04ab328ae1650', + 'wiki.mr.vec': '2cd6cf88bfdfb24850d345749ce0cfea8d65829e', + 'wiki.ms.vec': '458e1a079799a54cdc0a7b78c7fa1729d2683a6d', + 'wiki.mt.vec': '81f4c1d84dd4cc4276d59cb903fcc9aba46be981', + 'wiki.multi.ar.vec': 'f1f12cc9d629382af574a3db74fe49c2fd615c8f', + 'wiki.multi.bg.vec': '22470e664e4b35761a33c64433ea2f0c12140673', + 'wiki.multi.ca.vec': 'bc8d98b4d86d740d1985d73d211d887d561bcdd7', + 'wiki.multi.cs.vec': '17358b62e63f96b0479d6a70e9235a0421493884', + 'wiki.multi.da.vec': 'ebc75f428714d26fb1fa31accce49ad3b31e273b', + 'wiki.multi.de.vec': 'b9a63406aedf4446b467b94d12674bfe4723b52d', + 'wiki.multi.el.vec': '03d33db85bf83f35b943ce93b18c02fa98a0bc05', + 'wiki.multi.en.vec': '696719afdbe470ee4a2eb668229486dba1df19cc', + 'wiki.multi.es.vec': '98c9e35564ec57fee5dbc6155890150452f45d3f', + 'wiki.multi.et.vec': 'db10189093387e853f2fd3978770e1cc7bc07820', + 'wiki.multi.fi.vec': '746916885a1c7d4ec3f139a32cf267f9e15f5363', + 'wiki.multi.fr.vec': 'fe1535827b631d934beb02f8d36ba901b2c94a46', + 'wiki.multi.he.vec': '6dd112f018165317da22971a2b6fdb2a15dafa91', + 'wiki.multi.hr.vec': 'ff9f23cf595ec8dd93cd93c6b48049730c34253b', + 'wiki.multi.hu.vec': '6da405c9b048f3cbb990bfb29ef149f0430aa2e7', + 'wiki.multi.id.vec': '34edadab182682198c37ade8538530c545635742', + 'wiki.multi.it.vec': 'c55802bd73d46a6fc86771097670e02a70b5d46d', + 'wiki.multi.mk.vec': 'cec8550503ebca0bdc7ad11f2c15085b7072a990', + 'wiki.multi.nl.vec': 'c3f45a5fe8a8bc213cdf35dce51651b752ca60c4', + 'wiki.multi.no.vec': '105236df530c8fc2ce5b1e2550a2059bbc46fc28', + 'wiki.multi.pl.vec': '676eb5acb22982c0c9a7d6e4c90d26730c6d120e', + 'wiki.multi.pt.vec': '625b0a5384873c79a5dcfff5ee3fde49a3a65013', + 'wiki.multi.ro.vec': '82bd59674509b69f988f9870e3a291836ba43e84', + 'wiki.multi.ru.vec': 'a7d9c5f2ab2abb448a5111d352caa921adabe830', + 'wiki.multi.sk.vec': '98d849ee77f0320472cc5afa002bfde129be7089', + 'wiki.multi.sl.vec': 'fb5cfb8a9c44380d74fb21ddd204e820c4e05c31', + 'wiki.multi.sv.vec': '95d6cc3ba23dffff9be6adb467b617dd57780cb2', + 'wiki.multi.tr.vec': 'ecb0e353eaccba3fcacc6994d93065934ef429e9', + 'wiki.multi.uk.vec': '35f4f5a1ead8bd66bcaf865021fc3aae94456ab6', + 'wiki.multi.vi.vec': 'b1abe06360e1d65a0db65dd41ead7b2f9d651ea0', + 'wiki.mus.vec': 'fa1066f7bd09df4589993ca498c19aeb6cf986fd', + 'wiki.mwl.vec': '3d10a218242b94fcc3981aa3beb012b701827a55', + 'wiki.my.vec': 'e7c7989e32b23ca1a9caf534cc65ecaf9e1b9112', + 'wiki.myv.vec': '7de0927fd3d65677de7f770b3bd57c73b58df85d', + 'wiki.mzn.vec': 'aefad49237808acab99e1ca8eeaaf531666f261d', + 'wiki.nah.vec': 'c52e01cf4479fb7ec91ef39f298e8f97aeb6496e', + 'wiki.nap.vec': '6c9bd8ce1e85ee679b25189fd6f6d36afb119b6c', + 'wiki.na.vec': '8a592eb3dbe5693372714dff495d01cabc3ea215', + 'wiki.nds_nl.vec': '1cd96d12e78e5cd3f65ca2773a17696bda387b9f', + 'wiki.nds.vec': '7bf293149c08226e05bcf0442ac6e601162b9ffd', + 'wiki.ne.vec': '1045d7876f947cd4602d9ca79f7c4323a5d3a52d', + 'wiki-news-300d-1M-subword.vec': '717a3058e0ba5ef3cde52c3df0d4f0f60b0a113a', + 'wiki-news-300d-1M.vec': 
'11cac9efe6f599e659be182f5766d6fbd5b1cab9', + 'wiki.new.vec': '51f6c0b4ef1aee9fad4ab1cb69a7479db35e39a5', + 'wiki.ng.vec': 'c3016cc07d40bd43bea84b7c600244ff3d2a928e', + 'wiki.nl.vec': 'd796ee27e37b7d1d464e03c265c31ab62b52533e', + 'wiki.nn.vec': '35aeab89ffeca0377accbbd3bf18b81913c75448', + 'wiki.no.vec': 'd52e8019d7cc48569c8c3b514d2b1bd10261b5c0', + 'wiki.nov.vec': '5455c6e8463b1c43dd073e3e177702fb9a1dd834', + 'wiki.nrm.vec': 'b4cb941b126b26fa045c5fc75a490a31a969101c', + 'wiki.nso.vec': 'a906271509c2b343df35d1471509492bbfa883aa', + 'wiki.nv.vec': 'f5a6ea213bfe95c82cb22b53b4965df8b67ffeab', + 'wiki.ny.vec': '3aec3dcaea6c35f8254c407621644f87df37e411', + 'wiki.oc.vec': 'cc1833492899d75571148c2c305591f53d63f0b1', + 'wiki.olo.vec': 'cbadb4cada4dc579d0becdac93dfb479d76bf6c8', + 'wiki.om.vec': '91789a8d9f9284f7e71e4bb8d9a60eae4af4adca', + 'wiki.or.vec': 'a6b120fe536b6c0133b077dca0043c3bc97eef0b', + 'wiki.os.vec': '791b26cc300e9a1f0a08c7b2213a264e41ce30d6', + 'wiki.pag.vec': '03f71faf060c4eb33802275279967349c0337553', + 'wiki.pam.vec': '8fbd31e70d0ca0c61eb1a152efaa8ecb29180967', + 'wiki.pap.vec': '8cd98267cc55a4f9de80212e29651ddf7a9e83fd', + 'wiki.pa.vec': '4939d0db77a5b28d7d5aab0fab4f999d93b2053e', + 'wiki.pcd.vec': 'd2e8e7321b6f1bce94c563cb8ef8af2b45cc3e48', + 'wiki.pdc.vec': '401e24d0fb9b0ae9e06a5c700684361f58727fcf', + 'wiki.pfl.vec': '0ad9b7f3ae13f909f12835107432fee4c4ed3031', + 'wiki.pih.vec': '4ae6ef2a9c6c88e9322eda900e0f58be5592a29b', + 'wiki.pi.vec': 'd388db284357042f4260e1a567cb489b05bb8e0b', + 'wiki.pl.vec': 'd031adb6f83eda0364a861dcbf5ef779b5951c0b', + 'wiki.pms.vec': 'e30bda8d33d61db43243c157b9ac2feeaff316c8', + 'wiki.pnb.vec': '35f38862d3d83012d6db7baa8a4105e3e0a416e7', + 'wiki.pnt.vec': '38134772012d68f247e34daf220d9d4ed3e7f489', + 'wiki.ps.vec': '64f1bec5d5b937289199ceae2e1da6557ce48852', + 'wiki.pt.vec': '7f11ebdb0cbf5929b38319f1e977d2c13bcd741b', + 'wiki.qu.vec': '58de8c8290e8bc8f2a6a677312e28457113437b2', + 'wiki.rm.vec': '5d3144b47a0dd98648a6df0636384ab2a010ad7b', + 'wiki.rmy.vec': '3d36d3485961900c23355a0f7c2ba656a8558c29', + 'wiki.rn.vec': '80b6171b78dd932f59f70dbef074abb906af4eee', + 'wiki.roa_rup.vec': 'e31a44353cd84b976586c8df35a2ab58318120f0', + 'wiki.roa_tara.vec': 'b3fcb01ff0bac53a0ba08c5c0c411f26ee83a95a', + 'wiki.ro.vec': 'c088ea2752d5ec8b42e32410c191a14839ae8a1f', + 'wiki.rue.vec': 'fe539e0ea0bbbfd3ee06bd0c5521a035c7361ec5', + 'wiki.ru.vec': '7514a2c60ee4118abb451ed32a0d61cb52dec384', + 'wiki.rw.vec': 'af2ec410da6519a86ba21004c8b4c7fde768a91c', + 'wiki.sah.vec': '202470467194a1cbdcd571b14ef68371a29b38d9', + 'wiki.sa.vec': '7fed78d1d7674453b9876ee99aeeeba85ea46699', + 'wiki.scn.vec': 'bde043a235551e1643506774c5d9b61ecf2fc424', + 'wiki.sco.vec': '4625a5ad90a57f994be9b3aa4f8f3ecda941a821', + 'wiki.sc.vec': 'dba8dc7754ef04b1ba0cd702d94eea9575cde91c', + 'wiki.sd.vec': '36852d1253496e598fbd9b9009f07f454a6bea5b', + 'wiki.se.vec': 'f46b35ee6b893c2f12dd1b929bbc2b8120cbcd8d', + 'wiki.sg.vec': '90ece136bef7ad6e4e97776a1c7238499544405d', + 'wiki.sh.vec': '016691ecb26ace442731d92b1265e5c6c3d8ca5f', + 'wiki.simple.vec': '55267c50fbdf4e4ae0fbbda5c73830a379d68795', + 'wiki.si.vec': 'd05ed6a0bc1ee56e5d2e5f881d47372095f6eb0c', + 'wiki.sk.vec': '98759aacf7352d49a51390fae02030776510ae13', + 'wiki.sl.vec': 'b26997c0ed1de26a47b11efdc26ac1e7f189fa54', + 'wiki.sm.vec': '88c2c57ca483626b052403418cb4372d72352bc9', + 'wiki.sn.vec': '8dbb1019dcc8f842a8c0f550295ae697f8e1b7e0', + 'wiki.so.vec': '294756b60b03fe57cb08abd8d677d6a717b40bc8', + 'wiki.sq.vec': 
'd07ffed553f5eb4756d0a1548a7ba9a51a52f7c6', + 'wiki.srn.vec': 'faee05e550f5b08809a9ae5586ac4b08c9a1c359', + 'wiki.sr.vec': '3cf09f476f55a92fdd2880f7ba336656ab232736', + 'wiki.ss.vec': '488546a3b2f88f549c50ae9f32f1997cc441b039', + 'wiki.stq.vec': '1bf88af29f1d86cac16042a5bea6b1651c96a8c1', + 'wiki.st.vec': '963646055d12873b1c83b0eef8649ecaf473d42e', + 'wiki.su.vec': '25e864495acb6d280bab0e62480f68550c9ceed4', + 'wiki.sv.vec': 'eab83ae36701139696477b91b6e8d292ef175053', + 'wiki.sw.vec': '8e70d207dbbd14e60a48e260a23fbf284a8e9f06', + 'wiki.szl.vec': '0573cf888ec70b459b0596d34814fe60fd69f190', + 'wiki.ta.vec': 'b66b5358527b1f3a6a421ab26464a3c1e75e18af', + 'wiki.tcy.vec': '388b1d89642fcc790b688e9643b3d19e14d66f40', + 'wiki.tet.vec': 'f38fe0e76b9b08ff652689eeee42c4fdadd9a47e', + 'wiki.te.vec': 'e71dcf3cc45da1bcdae5e431324025bd2026d0c8', + 'wiki.tg.vec': '6a5cd5bfe571ca0359b66d21bf6950553213f42d', + 'wiki.th.vec': '1d6e0d525392a1042d017534f6c320c5a0afd345', + 'wiki.ti.vec': 'c769fbc99bbb4138a40231e573685c7948d4a4c4', + 'wiki.tk.vec': '33ae577f77d339ab7a0dff88855b8d5c974d0aef', + 'wiki.tl.vec': 'd508e229ced7201510999e76d583de3ff2339d8b', + 'wiki.tn.vec': '39f45f3fa86645bb25c54150204abcd51cc1048c', + 'wiki.to.vec': '64d512665b55e9ef9a3915e8167347be79310fa0', + 'wiki.tpi.vec': '407b96d235f54f3e0be9dc23a3bab89c6593a621', + 'wiki.tr.vec': '13234aa1bf5f99e81d933482b3b83c3e4bf6c85e', + 'wiki.ts.vec': '00f8229e2f230afd388221c0f823a1de9fc0e443', + 'wiki.tt.vec': '913bb3a11da6f8142b3bbec3ef065162d9350f1d', + 'wiki.tum.vec': 'bfbe43364724af882a520d2edcc2ce049c7357cd', + 'wiki.tw.vec': 'f329b667d70d9f0b753e55e1b1579b5a5191d3bd', + 'wiki.ty.vec': 'b881f60b8c75a71864d9847a17961d368f3058fc', + 'wiki.tyv.vec': 'e8f9a36dc58e4108c553f96e247a877a099ab5ba', + 'wiki.udm.vec': '336a8526f22e177faac69573661dc9c3ce36591f', + 'wiki.ug.vec': '586d2febafaf17c9187c599ffd7b96e559103c34', + 'wiki.uk.vec': '77f7737b9f88eac2b3e130ea8abb8886336fd0c6', + 'wiki.ur.vec': 'cb8132102152a958df72bd3e25f1a72abb4c9c76', + 'wiki.uz.vec': '11c3a76dae12b454f693811e33ae2e60015743e2', + 'wiki.vec.vec': 'ae4b055fba21974e56beecab3a95f9dc24a62fd0', + 'wiki.vep.vec': 'a38a781fde24f4d7b52aa8bc450b9949dd4e1808', + 'wiki.ve.vec': 'b7d2947501de1c30a9f8496d5efae20c051104e1', + 'wiki.vi.vec': 'bc84245b52b2e212e28dc6856c0693ce9845a9c5', + 'wiki.vls.vec': '07e8636908c057b9870ce4b98c7130d460cf882a', + 'wiki.vo.vec': 'c830988b6965bfce2f932b1be193f7d1f755f411', + 'wiki.war.vec': '1f5d443d6f612b59a53820dd6f39fd886a6ad30f', + 'wiki.wa.vec': '18f9ca1a585e1d18c3630029141a2e19d7d34a8e', + 'wiki.wo.vec': '2ad96a7a9e640bc0dbcf316b1f414b92802dcb8e', + 'wiki.wuu.vec': 'e1cbae1d3ad52329d0f36ada764016fbacf07049', + 'wiki.xal.vec': 'b738222d84cb8c8fdb2b30a7219aa5d3bdc2f61c', + 'wiki.xh.vec': 'bf37f741b0b75953281d11df2b4d80100df9e666', + 'wiki.xmf.vec': 'dc1923cfd1a7002d5d60426b60e6756854ab4a14', + 'wiki.yi.vec': '299d61958b7dcc38774768f1489121384726d860', + 'wiki.yo.vec': 'e35c8aff2924ba07936be9d0d94bd298f09702a4', + 'wiki.za.vec': 'e3a0e58bd2e5b1891c71f1f7e37ff71997a20361', + 'wiki.zea.vec': 'ee12db26aab3f2b3b2745a298ef414e7aeb5a058', + 'wiki.zh_classical.vec': '840981c83dd8e5cb02d1cd695e2fe0870941316c', + 'wiki.zh_min_nan.vec': 'f91ccb013e200bb7ed560082ddf4bdd9c2f315bb', + 'wiki.zh.vec': '117ab34faa80e381641fbabf3a24bc8cfba44050', + 'wiki.zh_yue.vec': 'd2ac1ab9eb1a908797644f83f259c90cb3c1a350', + 'wiki.zu.vec': '4b244b9697a8280e6646842c5fc81bb3a6bc8ec7'} diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py new file mode 100644 index 
000000000000..d40554a97e06 --- /dev/null +++ b/python/mxnet/text/embedding.py @@ -0,0 +1,553 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=consider-iterating-dictionary +# pylint: disable=super-init-not-called + +"""Text token embedding.""" +from __future__ import absolute_import +from __future__ import print_function + +import io +import logging +import os +import tarfile +import warnings +import zipfile + +from . import _constants as C +from .. import ndarray as nd +from .. import registry +from ..gluon.utils import check_sha1, download, _get_repo_file_url + + +def register(embedding_cls): + """Registers a new token embedding. + Once an embedding is registered, we can create an instance of this embedding with + :func:`~mxnet.contrib.text.embedding.create`. + Examples + -------- + >>> @mxnet.contrib.text.embedding.register + ... class MyTextEmbed(mxnet.contrib.text.embedding.TokenEmbedding): + ... def __init__(self, pretrained_file_name='my_pretrain_file'): + ... pass + >>> embed = mxnet.contrib.text.embedding.create('MyTokenEmbed') + >>> print(type(embed)) + + """ + + register_text_embedding = registry.get_register_func(TokenEmbedding, 'token embedding') + return register_text_embedding(embedding_cls) + + +def create(embedding_name, **kwargs): + """Creates an instance of token embedding. + Creates a token embedding instance by loading embedding vectors from an externally hosted + pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid + `embedding_name` and `pretrained_file_name`, use + `mxnet.contrib.text.embedding.get_pretrained_file_names()`. + Parameters + ---------- + embedding_name : str + The token embedding name (case-insensitive). + Returns + ------- + An instance of `mxnet.contrib.text.glossary.TokenEmbedding`: + A token embedding instance that loads embedding vectors from an externally hosted + pre-trained token embedding file. + """ + + create_text_embedding = registry.get_create_func(TokenEmbedding, 'token embedding') + return create_text_embedding(embedding_name, **kwargs) + + +def get_pretrained_file_names(embedding_name=None): + """Get valid token embedding names and their pre-trained file names. + To load token embedding vectors from an externally hosted pre-trained token embedding file, + such as those of GloVe and FastText, one should use + `mxnet.contrib.text.embedding.create(embedding_name, pretrained_file_name)`. + This method returns all the valid names of `pretrained_file_name` for the specified + `embedding_name`. If `embedding_name` is set to None, this method returns all the valid + names of `embedding_name` with their associated `pretrained_file_name`. 
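For example, a typical pairing of these helpers looks like the short sketch below. It is illustrative only: it follows the `from mxnet import text` import used by the tests added in this patch, and `wiki.simple.vec` is one of the fastText file names listed in the constants above; creating the embedding downloads the hosted file.

from mxnet import text

# Map every registered embedding name to its valid pretrained file names.
all_files = text.embedding.get_pretrained_file_names()

# Valid pretrained file names for one registered embedding, e.g. fastText.
fasttext_files = text.embedding.get_pretrained_file_names('fasttext')

# Create an embedding instance from one of those files (downloads the hosted file).
embed = text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec')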
+ Parameters + ---------- + embedding_name : str or None, default None + The pre-trained token embedding name. + Returns + ------- + dict or list: + A list of all the valid pre-trained token embedding file names (`pretrained_file_name`) + for the specified token embedding name (`embedding_name`). If the text embeding name is + set to None, returns a dict mapping each valid token embedding name to a list of valid + pre-trained files (`pretrained_file_name`). They can be plugged into + `mxnet.contrib.text.embedding.create(embedding_name, + pretrained_file_name)`. + """ + + text_embedding_reg = registry.get_registry(TokenEmbedding) + + if embedding_name is not None: + if embedding_name not in text_embedding_reg: + raise KeyError('Cannot find `embedding_name` %s. Use ' + '`get_pretrained_file_names(' + 'embedding_name=None).keys()` to get all the valid embedding ' + 'names.' % embedding_name) + return list(text_embedding_reg[embedding_name].pretrained_file_name_sha1.keys()) + else: + return {embedding_name: list(embedding_cls.pretrained_file_name_sha1.keys()) + for embedding_name, embedding_cls in registry.get_registry(TokenEmbedding).items()} + + +class TokenEmbedding(object): + """Token embedding base class. + + + To load token embedding from an externally hosted pre-trained token embedding file, such as + those of GloVe and FastText, use + :func:`~mxnet.contrib.text.embedding.create(embedding_name, pretrained_file_name)`. + To get all the available `embedding_name` and `pretrained_file_name`, use + :func:`~mxnet.contrib.text.embedding.get_pretrained_file_names()`. + Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use + :class:`~mxnet.contrib.text.embedding.CustomEmbedding`. + For every unknown token, if its representation `self.unknown_token` is encountered in the + pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token + embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the + token embedding vector initialized by `init_unknown_vec`. + If a token is encountered multiple times in the pre-trained token embedding file, only the + first-encountered token embedding vector will be loaded and the rest will be skipped. + Parameters + ---------- + unknown_token : hashable object, default '' + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + reserved_tokens : list of strs or None, default None + A list of reserved tokens that will always be indexed. + Properties + ---------- + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. + idx_to_token : list of strs + A list of indexed tokens where the list indices and the token indices are aligned. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. The largest valid index maps to the initialized embedding vector for every + reserved token, such as an unknown_token token and a padding token. + """ + def __init__(self, unknown_token='', reserved_tokens=None): + self._unknown_token = unknown_token + # Thus, constants.UNKNOWN_IDX must be 0. 
+ self._idx_to_token = [unknown_token] + + if reserved_tokens is None: + self._reserved_tokens = None + else: + self._reserved_tokens = reserved_tokens[:] + self._idx_to_token.extend(reserved_tokens) + + self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)} + self._idx_to_vec = None + + @classmethod + def _get_download_file_name(cls, pretrained_file_name): + return pretrained_file_name + + @classmethod + def _get_pretrained_file_url(cls, pretrained_file_name): + cls_name = cls.__name__.lower() + + namespace = 'gluon/embedding/{}'.format(cls_name) + return _get_repo_file_url(namespace, cls._get_download_file_name(pretrained_file_name)) + + @classmethod + def _get_pretrained_file(cls, embedding_root, pretrained_file_name): + cls_name = cls.__name__.lower() + embedding_root = os.path.expanduser(embedding_root) + url = cls._get_pretrained_file_url(pretrained_file_name) + + embedding_dir = os.path.join(embedding_root, cls_name) + pretrained_file_path = os.path.join(embedding_dir, pretrained_file_name) + downloaded_file = os.path.basename(url) + downloaded_file_path = os.path.join(embedding_dir, downloaded_file) + + expected_file_hash = cls.pretrained_file_name_sha1[pretrained_file_name] + + if hasattr(cls, 'pretrained_archive_name_sha1'): + expected_downloaded_hash = \ + cls.pretrained_archive_name_sha1[downloaded_file] + else: + expected_downloaded_hash = expected_file_hash + + if not os.path.exists(pretrained_file_path) \ + or not check_sha1(pretrained_file_path, expected_file_hash): + download(url, downloaded_file_path, sha1_hash=expected_downloaded_hash) + + ext = os.path.splitext(downloaded_file)[1] + if ext == '.zip': + with zipfile.ZipFile(downloaded_file_path, 'r') as zf: + zf.extractall(embedding_dir) + elif ext == '.gz': + with tarfile.open(downloaded_file_path, 'r:gz') as tar: + tar.extractall(path=embedding_dir) + return pretrained_file_path + + def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, encoding='utf8'): + """Load embedding vectors from the pre-trained token embedding file. + For every unknown token, if its representation `self.unknown_token` is encountered in the + pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token + embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the + text embedding vector initialized by `init_unknown_vec`. + If a token is encountered multiple times in the pre-trained text embedding file, only the + first-encountered token embedding vector will be loaded and the rest will be skipped. + """ + + pretrained_file_path = os.path.expanduser(pretrained_file_path) + + if not os.path.isfile(pretrained_file_path): + raise ValueError('`pretrained_file_path` must be a valid path to ' + 'the pre-trained token embedding file.') + + logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path) + vec_len = None + all_elems = [] + tokens = set() + loaded_unknown_vec = None + line_num = 0 + with io.open(pretrained_file_path, 'r', encoding=encoding) as f: + for line in f: + line_num += 1 + elems = line.rstrip().split(elem_delim) + + assert len(elems) > 1, 'At line %d of the pre-trained text embedding file: the ' \ + 'data format of the pre-trained token embedding file %s ' \ + 'is unexpected.' 
% (line_num, pretrained_file_path) + + token, elems = elems[0], [float(i) for i in elems[1:]] + + if token == self.unknown_token and loaded_unknown_vec is None: + loaded_unknown_vec = elems + tokens.add(self.unknown_token) + elif token in tokens: + warnings.warn('At line %d of the pre-trained token embedding file: the ' + 'embedding vector for token %s has been loaded and a duplicate ' + 'embedding for the same token is seen and skipped.' % + (line_num, token)) + elif len(elems) == 1: + warnings.warn('At line %d of the pre-trained text embedding file: token %s ' + 'with 1-dimensional vector %s is likely a header and is ' + 'skipped.' % (line_num, token, elems)) + else: + if vec_len is None: + vec_len = len(elems) + # Reserve a vector slot for the unknown token at the very beggining because + # the unknown index is 0. + all_elems.extend([0] * vec_len) + else: + assert len(elems) == vec_len, \ + 'At line %d of the pre-trained token embedding file: the dimension ' \ + 'of token %s is %d but the dimension of previous tokens is %d. ' \ + 'Dimensions of all the tokens must be the same.' \ + % (line_num, token, len(elems), vec_len) + all_elems.extend(elems) + self._idx_to_token.append(token) + self._token_to_idx[token] = len(self._idx_to_token) - 1 + tokens.add(token) + + self._idx_to_vec = nd.array(all_elems).reshape((-1, vec_len)) + + if loaded_unknown_vec is None: + self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(shape=vec_len) + else: + self._idx_to_vec[C.UNKNOWN_IDX] = nd.array(loaded_unknown_vec) + + @property + def idx_to_vec(self): + return self._idx_to_vec + + @property + def unknown_token(self): + return self._unknown_token + + @property + def reserved_tokens(self): + return self._reserved_tokens + + def __getitem__(self, tokens): + """Look up embedding vectors of tokens. + Parameters + ---------- + tokens : str or list of strs + A token or a list of tokens. + Returns + ------- + mxnet.ndarray.NDArray: + The embedding vector(s) of the token(s). According to numpy conventions, if `tokens` is + a string, returns a 1-D NDArray (vector); if `tokens` is a list of + strings, returns a 2-D NDArray (matrix) of shape=(len(tokens), vec_len). + """ + + to_reduce = not isinstance(tokens, (list, tuple)) + if to_reduce: + tokens = [tokens] + + indices = [self._token_to_idx.get(token, C.UNKNOWN_IDX) for token in tokens] + + vecs = nd.Embedding(nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0], + self.idx_to_vec.shape[1]) + + return vecs[0] if to_reduce else vecs + + def __contains__(self, x): + return x in self._token_to_idx + + def __setitem__(self, tokens, new_vectors): + """Updates embedding vectors for tokens. + Parameters + ---------- + tokens : str or a list of strs + A token or a list of tokens whose embedding vector are to be updated. + new_vectors : mxnet.ndarray.NDArray + An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal + to the number of `tokens` and its width must be equal to the dimension of embedding of + the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list + of multiple strings, it must be 2-D. + """ + + assert self._idx_to_vec is not None, '`idx_to_vec` has not been initialized.' + + if not isinstance(tokens, list) or len(tokens) == 1: + assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) in [1, 2], \ + '`new_vectors` must be a 1-D or 2-D NDArray if `tokens` is a singleton.' 
+ if not isinstance(tokens, list): + tokens = [tokens] + if len(new_vectors.shape) == 1: + new_vectors = new_vectors.expand_dims(0) + + else: + assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) == 2, \ + '`new_vectors` must be a 2-D NDArray if `tokens` is a list of multiple strings.' + assert new_vectors.shape == (len(tokens), self._idx_to_vec.shape[1]), \ + 'The length of new_vectors must be equal to the number of tokens and the width of' \ + 'new_vectors must be equal to the dimension of embedding of the glossary.' + + indices = [] + for token in tokens: + if token in self._token_to_idx: + indices.append(self._token_to_idx[token]) + else: + raise ValueError('Token %s is unknown. To update the embedding vector for an ' + 'unknown token, please specify it explicitly as the ' + '`unknown_token` %s in `tokens`. This is to avoid unintended ' + 'updates.' % (token, self._idx_to_token[C.UNKNOWN_IDX])) + + self._idx_to_vec[nd.array(indices)] = new_vectors + + @classmethod + def _check_pretrained_file_names(cls, pretrained_file_name): + """Checks if a pre-trained token embedding file name is valid. + Parameters + ---------- + pretrained_file_name : str + The pre-trained token embedding file. + """ + + embedding_name = cls.__name__.lower() + if pretrained_file_name not in cls.pretrained_file_name_sha1: + raise KeyError('Cannot find pretrained file %s for token embedding %s. Valid ' + 'pretrained files for embedding %s: %s' % + (pretrained_file_name, embedding_name, embedding_name, + ', '.join(cls.pretrained_file_name_sha1.keys()))) + + @staticmethod + def from_file(pretrained_file_path, elem_delim=' ', encoding='utf8', + init_unknown_vec=nd.zeros, **kwargs): + """Load user-defined token embedding. + This is to load embedding vectors from a user-defined pre-trained text embedding file. + Denote by '[ed]' the argument `elem_delim`. Denote by [v_ij] the j-th element of the token + embedding vector for [token_i], the expected format of a custom pre-trained token embedding file + is: + '[token_1][ed][v_11][ed][v_12][ed]...[ed][v_1k]\\\\n[token_2][ed][v_21][ed][v_22][ed]...[ed] + [v_2k]\\\\n...' + where k is the length of the embedding vector `vec_len`. + Parameters + ---------- + pretrained_file_path : str + The path to the custom pre-trained token embedding file. + elem_delim : str, default ' ' + The delimiter for splitting a token and every embedding vector element value on the same + line of the custom pre-trained token embedding file. + encoding : str, default 'utf8' + The encoding scheme for reading the custom pre-trained token embedding file. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + """ + embedding = TokenEmbedding(**kwargs) + embedding._load_embedding(pretrained_file_path, elem_delim, init_unknown_vec, encoding) + + return embedding + + +@register +class GloVe(TokenEmbedding): + """The GloVe word embedding. + GloVe is an unsupervised learning algorithm for obtaining vector representations for words. + Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and + the resulting representations showcase interesting linear substructures of the word vector + space. (Source from https://nlp.stanford.edu/projects/glove/) + Reference: + GloVe: Global Vectors for Word Representation. + Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 
+ https://nlp.stanford.edu/pubs/glove.pdf + Website: + https://nlp.stanford.edu/projects/glove/ + To get the updated URLs to the externally hosted pre-trained token embedding + files, visit https://nlp.stanford.edu/projects/glove/ + License for pre-trained embedding: + https://opendatacommons.org/licenses/pddl/ + Parameters + ---------- + pretrained_file_name : str, default 'glove.840B.300d.txt' + The name of the pre-trained token embedding file. + embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') + The root directory for storing embedding-related files. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + vocabulary : :class:`~mxnet.contrib.text.vocab.Vocabulary`, default None + It contains the tokens to index. Each indexed token will be associated with the loaded + embedding vectors, such as loaded from a pre-trained token embedding file. If None, all the + tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding + file, will be indexed. + Properties + ---------- + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. + idx_to_token : list of strs + A list of indexed tokens where the list indices and the token indices are aligned. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. The largest valid index maps to the initialized embedding vector for every + reserved token, such as an unknown_token token and a padding token. + """ + + # Map a pre-trained token embedding archive file and its SHA-1 hash. + pretrained_archive_name_sha1 = C.GLOVE_PRETRAINED_FILE_SHA1 + + # Map a pre-trained token embedding file and its SHA-1 hash. + pretrained_file_name_sha1 = C.GLOVE_PRETRAINED_ARCHIVE_SHA1 + + @classmethod + def _get_download_file_name(cls, pretrained_file_name): + # Map a pre-trained embedding file to its archive to download. + src_archive = {archive.split('.')[1]: archive for archive in + GloVe.pretrained_archive_name_sha1.keys()} + archive = src_archive[pretrained_file_name.split('.')[1]] + return archive + + def __init__(self, pretrained_file_name='glove.840B.300d.txt', + embedding_root=os.path.join('~', '.mxnet', 'embedding'), + init_unknown_vec=nd.zeros, **kwargs): + GloVe._check_pretrained_file_names(pretrained_file_name) + + super(GloVe, self).__init__(**kwargs) + pretrained_file_path = GloVe._get_pretrained_file(embedding_root, pretrained_file_name) + + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + + +@register +class FastText(TokenEmbedding): + """The fastText word embedding. + FastText is an open-source, free, lightweight library that allows users to learn text + representations and text classifiers. It works on standard, generic hardware. Models can later + be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/) + References: + Enriching Word Vectors with Subword Information. + Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. + https://arxiv.org/abs/1607.04606 + Bag of Tricks for Efficient Text Classification. + Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. 
+ https://arxiv.org/abs/1607.01759 + FastText.zip: Compressing text classification models. + Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, + and Tomas Mikolov. + https://arxiv.org/abs/1612.03651 + For 'wiki.multi' embedding: + Word Translation Without Parallel Data + Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, + and Herve Jegou. + https://arxiv.org/abs/1710.04087 + Website: + https://fasttext.cc/ + To get the updated URLs to the externally hosted pre-trained token embedding files, visit + https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md + License for pre-trained embedding: + https://creativecommons.org/licenses/by-sa/3.0/ + Parameters + ---------- + pretrained_file_name : str, default 'wiki.en.vec' + The name of the pre-trained token embedding file. + embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') + The root directory for storing embedding-related files. + init_unknown_vec : callback + The callback used to initialize the embedding vector for the unknown token. + Properties + ---------- + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. + idx_to_token : list of strs + A list of indexed tokens where the list indices and the token indices are aligned. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. The largest valid index maps to the initialized embedding vector for every + reserved token, such as an unknown_token token and a padding token. + """ + + # Map a pre-trained token embedding archive file and its SHA-1 hash. + pretrained_archive_name_sha1 = C.FAST_TEXT_ARCHIVE_SHA1 + + # Map a pre-trained token embedding file and its SHA-1 hash. + pretrained_file_name_sha1 = C.FAST_TEXT_FILE_SHA1 + + @classmethod + def _get_download_file_name(cls, pretrained_file_name): + # Map a pre-trained embedding file to its archive to download. + return '.'.join(pretrained_file_name.split('.')[:-1])+'.zip' + + def __init__(self, pretrained_file_name='wiki.simple.vec', + embedding_root=os.path.join('~', '.mxnet', 'embedding'), + init_unknown_vec=nd.zeros, **kwargs): + FastText._check_pretrained_file_names(pretrained_file_name) + + super(FastText, self).__init__(**kwargs) + pretrained_file_path = FastText._get_pretrained_file(embedding_root, pretrained_file_name) + + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) diff --git a/python/mxnet/text/utils.py b/python/mxnet/text/utils.py new file mode 100644 index 000000000000..d167310bdf40 --- /dev/null +++ b/python/mxnet/text/utils.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 + +"""Provide utilities for text data processing.""" +from __future__ import absolute_import +from __future__ import print_function + +import collections +import re + + +def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n', to_lower=False, + counter_to_update=None): + """Counts tokens in the specified string. + + + For token_delim='' and seq_delim='', a specified string of two sequences of tokens may + look like:: + + token1token2token3token4token5 + + + Parameters + ---------- + source_str : str + A source string of tokens. + token_delim : str, default ' ' + A token delimiter. + seq_delim : str, default '\\\\n' + A sequence delimiter. + to_lower : bool, default False + Whether to convert the source source_str to the lower case. + counter_to_update : collections.Counter or None, default None + The collections.Counter instance to be updated with the token counts of `source_str`. If + None, return a new collections.Counter instance counting tokens from `source_str`. + + + Returns + ------- + collections.Counter + The `counter_to_update` collections.Counter instance after being updated with the token + counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter + instance counting tokens from `source_str`. + + + Examples + -------- + >>> source_str = ' Life is great ! \\n life is good . \\n' + >>> count_tokens_from_str(token_line, ' ', '\\n', True) + Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2}) + """ + + source_str = filter(None, re.split(token_delim + '|' + seq_delim, source_str)) + if to_lower: + source_str = [t.lower() for t in source_str] + + if counter_to_update is None: + return collections.Counter(source_str) + else: + counter_to_update.update(source_str) + return counter_to_update diff --git a/python/mxnet/text/vocab.py b/python/mxnet/text/vocab.py new file mode 100644 index 000000000000..6c3306300c12 --- /dev/null +++ b/python/mxnet/text/vocab.py @@ -0,0 +1,254 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=consider-iterating-dictionary + +"""Vocabulary.""" +from __future__ import absolute_import +from __future__ import print_function + +import collections + +from . import _constants as C +from .embedding import TokenEmbedding +from .. 
import nd + + +class Vocabulary(object): + """Vocabulary for indexing text tokens and access embedding. + + + Parameters + ---------- + counter : collections.Counter or None, default None + Counts text token frequencies in the text data. Its keys will be indexed according to + frequency thresholds such as `max_size` and `min_freq`. Keys of `counter`, + `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. + Examples: str, int, and tuple. + max_size : None or int, default None + The maximum possible number of the most frequent tokens in the keys of `counter` that can be + indexed. Note that this argument does not count any token from `reserved_tokens`. Suppose + that there are different keys of `counter` whose frequency are the same, if indexing all of + them will exceed this argument value, such keys will be indexed one by one according to + their __cmp__() order until the frequency threshold is met. If this argument is None or + larger than its largest possible value restricted by `counter` and `reserved_tokens`, this + argument has no effect. + min_freq : int, default 1 + The minimum frequency required for a token in the keys of `counter` to be indexed. + unknown_token : hashable object, default '<unk>' + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. Keys of `counter`, `unknown_token`, and values of + `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. + reserved_tokens : list of hashable objects or None, default None + A list of reserved tokens that will always be indexed, such as special symbols representing + padding, beginning of sentence, and end of sentence. It cannot contain `unknown_token`, or + duplicate reserved tokens. Keys of `counter`, `unknown_token`, and values of + `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. + + + Properties + ---------- + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. + idx_to_token : list of strs + A list of indexed tokens where the list indices and the token indices are aligned. + unknown_token : hashable object + The representation for any unknown token. In other words, any unknown token will be indexed + as the same representation. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + """ + + def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None, embedding=None): + + # Sanity checks. + assert min_freq > 0, '`min_freq` must be set to a positive value.' + + if reserved_tokens is not None: + reserved_token_set = set(reserved_tokens) + assert unknown_token not in reserved_token_set, \ + '`reserved_token` cannot contain `unknown_token`.' + assert len(reserved_token_set) == len(reserved_tokens), \ + '`reserved_tokens` cannot contain duplicate reserved tokens.' 
+ + self._index_unknown_and_reserved_tokens(unknown_token, reserved_tokens) + + if counter is not None: + self._index_counter_keys(counter, unknown_token, reserved_tokens, max_size, min_freq) + + if embedding is None: + self._embedding = None + else: + self.set_embedding(embedding) + + def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): + """Indexes unknown and reserved tokens.""" + + self._unknown_token = unknown_token + self._idx_to_token = [unknown_token] + + if reserved_tokens is None: + self._reserved_tokens = None + else: + self._reserved_tokens = reserved_tokens[:] + self._idx_to_token.extend(reserved_tokens) + + self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)} + + def _index_counter_keys(self, counter, unknown_token, reserved_tokens, max_size, + min_freq): + """Indexes keys of `counter`. + Indexes keys of `counter` according to frequency thresholds such as `max_size` and + `min_freq`. + """ + + assert isinstance(counter, collections.Counter), \ + '`counter` must be an instance of collections.Counter.' + + unknown_and_reserved_tokens = set(reserved_tokens) if reserved_tokens is not None else set() + unknown_and_reserved_tokens.add(unknown_token) + + token_freqs = sorted(counter.items(), key=lambda x: x[0]) + token_freqs.sort(key=lambda x: x[1], reverse=True) + + token_cap = len(unknown_and_reserved_tokens) + ( + len(counter) if max_size is None else max_size) + + for token, freq in token_freqs: + if freq < min_freq or len(self._idx_to_token) == token_cap: + break + if token not in unknown_and_reserved_tokens: + self._idx_to_token.append(token) + self._token_to_idx[token] = len(self._idx_to_token) - 1 + + def __len__(self): + return len(self._idx_to_token) + + @property + def token_to_idx(self): + return self._token_to_idx + + @property + def idx_to_token(self): + return self._idx_to_token + + @property + def unknown_token(self): + return self._unknown_token + + @property + def reserved_tokens(self): + return self._reserved_tokens + + @property + def embedding(self): + return self._embedding + + def set_embedding(self, *embeddings): + + if not isinstance(embeddings, (list, tuple)): + embeddings = [embeddings] + + for embedding in embeddings: + assert isinstance(embedding, TokenEmbedding), \ + 'The argument `embedding` must be an instance or a list of instances ' \ + 'of `mxnet.contrib.text.embedding.TokenEmbedding` whose embedding vectors will be ' \ + 'loaded or concatenated-then-loaded to map to the indexed tokens.' + + new_embedding = TokenEmbedding(self._unknown_token, self._reserved_tokens) + new_embedding._token_to_idx = self._token_to_idx + new_embedding._idx_to_token = self._idx_to_token + + new_vec_len = sum(embedding.idx_to_vec.shape[1] for embedding in embeddings + if embedding and embedding.idx_to_vec is not None) + new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len)) + + col_start = 0 + # Concatenate all the embedding vectors in embedding. + for embedding in embeddings: + if embedding and embedding.idx_to_vec is not None: + col_end = col_start + embedding.idx_to_vec.shape[1] + # Concatenate vectors of the unknown token. + new_idx_to_vec[0, col_start:col_end] = embedding[0] + new_idx_to_vec[1:, col_start:col_end] = embedding[self._idx_to_token[1:]] + col_start = col_end + + new_embedding._idx_to_vec = new_idx_to_vec + self._embedding = new_embedding + + def to_tokens(self, indices): + """Converts token indices to tokens according to the vocabulary. 
+ Parameters + ---------- + indices : int or list of ints + A source token index or token indices to be converted. + Returns + ------- + str or list of strs + A token or a list of tokens according to the vocabulary. + """ + + to_reduce = False + if not isinstance(indices, (list, tuple)): + indices = [indices] + to_reduce = True + + max_idx = len(self._idx_to_token) - 1 + + tokens = [] + for idx in indices: + if not isinstance(idx, int) or idx > max_idx: + raise ValueError('Token index %d in the provided `indices` is invalid.' % idx) + else: + tokens.append(self._idx_to_token[idx]) + + return tokens[0] if to_reduce else tokens + + def __getitem__(self, s): + """Converts token/tokens to indices according to the vocabulary. + Parameters + ---------- + s : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. + """ + + if not isinstance(s, (list, tuple)): + return self._token_to_idx[s] if s in self._token_to_idx \ + else C.UNKNOWN_IDX + else: + return [self._token_to_idx[token] if token in self._token_to_idx + else C.UNKNOWN_IDX for token in s] + + def __contains__(self, s): + """Check whether a token exists in the vocabulary. + Parameters + ---------- + s : str + A token. + Returns + ------- + bool + Whether the queried token exists in the vocabulary. + """ + + return s in self._token_to_idx diff --git a/tests/python/unittest/test_text.py b/tests/python/unittest/test_text.py new file mode 100644 index 000000000000..64fc311d4961 --- /dev/null +++ b/tests/python/unittest/test_text.py @@ -0,0 +1,648 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# coding: utf-8 + +from __future__ import absolute_import +from __future__ import print_function + +from collections import Counter + +from common import assertRaises +from mxnet import ndarray as nd +from mxnet.test_utils import * +from mxnet import text + + +def _get_test_str_of_tokens(token_delim, seq_delim): + seq1 = token_delim + token_delim.join(['Life', 'is', 'great', '!']) + token_delim + seq_delim + seq2 = token_delim + token_delim.join(['life', 'is', 'good', '.']) + token_delim + seq_delim + seq3 = token_delim + token_delim.join(['life', "isn't", 'bad', '.']) + token_delim + seq_delim + seqs = seq1 + seq2 + seq3 + return seqs + + +def _test_count_tokens_from_str_with_delims(token_delim, seq_delim): + source_str = _get_test_str_of_tokens(token_delim, seq_delim) + + cnt1 = text.utils.count_tokens_from_str( + source_str, token_delim, seq_delim, to_lower=False) + assert cnt1 == Counter( + {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, + 'bad': 1}) + + cnt2 = text.utils.count_tokens_from_str( + source_str, token_delim, seq_delim, to_lower=True) + assert cnt2 == Counter( + {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) + + counter_to_update = Counter({'life': 2}) + + cnt3 = text.utils.count_tokens_from_str( + source_str, token_delim, seq_delim, to_lower=False, + counter_to_update=counter_to_update.copy()) + assert cnt3 == Counter( + {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, + 'bad': 1}) + + cnt4 = text.utils.count_tokens_from_str( + source_str, token_delim, seq_delim, to_lower=True, + counter_to_update=counter_to_update.copy()) + assert cnt4 == Counter( + {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) + + +def test_count_tokens_from_str(): + _test_count_tokens_from_str_with_delims(' ', '\n') + _test_count_tokens_from_str_with_delims('IS', 'LIFE') + + +def test_tokens_to_indices(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + vocab = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + + i1 = vocab['c'] + assert i1 == 1 + + i2 = vocab[['c']] + assert i2 == [1] + + i3 = vocab[['', 'non-exist']] + assert i3 == [0, 0] + + i4 = vocab[['a', 'non-exist', 'a', 'b']] + assert i4 == [3, 0, 3, 2] + + +def test_indices_to_tokens(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + vocab = text.Vocabulary(counter, max_size=None, min_freq=1, + unknown_token='', reserved_tokens=None) + i1 = vocab.to_tokens(1) + assert i1 == 'c' + + i2 = vocab.to_tokens([1]) + assert i2 == ['c'] + + i3 = vocab.to_tokens([0, 0]) + assert i3 == ['', ''] + + i4 = vocab.to_tokens([3, 0, 3, 2]) + assert i4 == ['a', '', 'a', 'b'] + + assertRaises(ValueError, vocab.to_tokens, 100) + + +def test_download_embed(): + @text.embedding.register + class Test(text.TokenEmbedding): + # 33 bytes. 
+ pretrained_file_name_sha1 = \ + {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} + namespace = 'test' + + def __init__(self, embedding_root='embeddings', init_unknown_vec=nd.zeros, **kwargs): + pretrained_file_name = 'embedding_test.vec' + Test._check_pretrained_file_names(pretrained_file_name) + + super(Test, self).__init__(**kwargs) + + pretrained_file_path = Test._get_pretrained_file(embedding_root, pretrained_file_name) + + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + + test_embed = text.embedding.create('test') + assert_almost_equal(test_embed['hello'].asnumpy(), (nd.arange(5) + 1).asnumpy()) + assert_almost_equal(test_embed['world'].asnumpy(), (nd.arange(5) + 6).asnumpy()) + assert_almost_equal(test_embed[''].asnumpy(), nd.zeros((5,)).asnumpy()) + + +def _mk_my_pretrain_file(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seqs = seq1 + seq2 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file2(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n' + seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n' + seqs = seq1 + seq2 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file3(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['', '1.1', '1.2', '1.3', '1.4', + '1.5']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_pretrain_file4(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n' + seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n' + seq3 = token_delim.join(['', '0.11', '0.12', '0.13', '0.14', '0.15']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_invalid_pretrain_file(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['c']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def _mk_my_invalid_pretrain_file2(path, token_delim, pretrain_file): + path = os.path.expanduser(path) + if not os.path.exists(path): + os.makedirs(path) + seq1 = token_delim.join(['a', '0.1', '0.2', '0.3', '0.4', '0.5']) + '\n' + seq2 = token_delim.join(['b', '0.6', '0.7', '0.8', '0.9', '1.0']) + '\n' + seq3 = token_delim.join(['c', '0.6', '0.7', '0.8']) + '\n' + seqs = seq1 + seq2 + seq3 + with open(os.path.join(path, pretrain_file), 'w') as fout: + fout.write(seqs) + + +def 
test_custom_embed(): + embed_root = 'embedding' + embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file = 'my_pretrain_file.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file) + + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) + + my_embed = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim) + + assert len(my_embed._idx_to_token) == 3 + assert my_embed._token_to_idx['a'] == 1 + assert my_embed._idx_to_token[1] == 'a' + + first_vec = my_embed.idx_to_vec[0] + assert_almost_equal(first_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) + + unk_vec = my_embed['A'] + assert_almost_equal(unk_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) + + a_vec = my_embed['a'] + assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5])) + + unk_vecs = my_embed['', ''] + assert_almost_equal(unk_vecs.asnumpy(), np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) + + # Test loaded unknown vectors. + pretrain_file2 = 'my_pretrain_file2.txt' + _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2) + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2) + my_embed2 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones, unknown_token='') + unk_vec2 = my_embed2[''] + assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) + unk_vec2 = my_embed2[''] + assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) + + my_embed3 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones, unknown_token='') + unk_vec3 = my_embed3[''] + assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) + unk_vec3 = my_embed3[''] + assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) + + # Test error handling. 
+ invalid_pretrain_file = 'invalid_pretrain_file.txt' + _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, + invalid_pretrain_file) + pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, elem_delim) + + invalid_pretrain_file2 = 'invalid_pretrain_file2.txt' + _mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, + invalid_pretrain_file2) + pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file2) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, elem_delim) + + +def test_vocabulary(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v1) == 5 + assert v1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} + assert v1.idx_to_token[1] == 'c' + assert v1.unknown_token == '' + assert v1.reserved_tokens is None + + v2 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v2) == 3 + assert v2.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v2.idx_to_token[1] == 'c' + assert v2.unknown_token == '' + assert v2.reserved_tokens is None + + v3 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=None) + assert len(v3) == 1 + assert v3.token_to_idx == {'': 0} + assert v3.idx_to_token[0] == '' + assert v3.unknown_token == '' + assert v3.reserved_tokens is None + + v4 = text.Vocabulary(counter, max_size=2, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v4) == 3 + assert v4.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v4.idx_to_token[1] == 'c' + assert v4.unknown_token == '' + assert v4.reserved_tokens is None + + v5 = text.Vocabulary(counter, max_size=3, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v5) == 4 + assert v5.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3} + assert v5.idx_to_token[1] == 'c' + assert v5.unknown_token == '' + assert v5.reserved_tokens is None + + v6 = text.Vocabulary(counter, max_size=100, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v6) == 5 + assert v6.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, + 'some_word$': 4} + assert v6.idx_to_token[1] == 'c' + assert v6.unknown_token == '' + assert v6.reserved_tokens is None + + v7 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v7) == 2 + assert v7.token_to_idx == {'': 0, 'c': 1} + assert v7.idx_to_token[1] == 'c' + assert v7.unknown_token == '' + assert v7.reserved_tokens is None + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=0, unknown_token='', reserved_tokens=['b']) + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', 'b']) + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', '']) + + v8 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=['b']) + assert len(v8) == 5 + assert v8.token_to_idx == {'': 0, 'b': 1, 'c': 2, 'a': 3, 'some_word$': 4} + assert v8.idx_to_token[1] == 'b' + assert v8.unknown_token == '' + assert v8.reserved_tokens == ['b'] + + v9 = text.Vocabulary(counter, max_size=None, min_freq=2, 
unknown_token='', + reserved_tokens=['b', 'a']) + assert len(v9) == 4 + assert v9.token_to_idx == {'': 0, 'b': 1, 'a': 2, 'c': 3} + assert v9.idx_to_token[1] == 'b' + assert v9.unknown_token == '' + assert v9.reserved_tokens == ['b', 'a'] + + v10 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=['b', 'c']) + assert len(v10) == 3 + assert v10.token_to_idx == {'': 0, 'b': 1, 'c': 2} + assert v10.idx_to_token[1] == 'b' + assert v10.unknown_token == '' + assert v10.reserved_tokens == ['b', 'c'] + + v11 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=['', 'b']) + assert len(v11) == 4 + assert v11.token_to_idx == {'': 0, '': 1, 'b': 2, 'c': 3} + assert v11.idx_to_token[1] == '' + assert v11.unknown_token == '' + assert v11.reserved_tokens == ['', 'b'] + + v12 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='b', + reserved_tokens=['']) + assert len(v12) == 3 + assert v12.token_to_idx == {'b': 0, '': 1, 'c': 2} + assert v12.idx_to_token[1] == '' + assert v12.unknown_token == 'b' + assert v12.reserved_tokens == [''] + + v13 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='a', + reserved_tokens=['']) + assert len(v13) == 4 + assert v13.token_to_idx == {'a': 0, '': 1, 'c': 2, 'b': 3} + assert v13.idx_to_token[1] == '' + assert v13.unknown_token == 'a' + assert v13.reserved_tokens == [''] + + counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), ('c', 'c'), ('c', 'c'), ('c', 'c'), + ('some_word$', 'some_word$')]) + + v14 = text.Vocabulary(counter_tuple, max_size=None, min_freq=1, + unknown_token=('', ''), reserved_tokens=None) + assert len(v14) == 5 + assert v14.token_to_idx == {('', ''): 0, ('c', 'c'): 1, ('b', 'b'): 2, ('a', 'a'): 3, + ('some_word$', 'some_word$'): 4} + assert v14.idx_to_token[1] == ('c', 'c') + assert v14.unknown_token == ('', '') + assert v14.reserved_tokens is None + + +def test_custom_embedding_with_vocabulary(): + embed_root = 'embedding' + embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file = 'my_pretrain_file1.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file) + + pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file) + + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=['']) + + e1 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, + init_unknown_vec=nd.ones) + + v1.set_embedding(e1) + + assert v1.embedding._token_to_idx == {'': 0, '': 1, 'c': 2, 'b': 3, 'a': 4, 'some_word$': 5} + assert v1.embedding._idx_to_token == ['', '', 'c', 'b', 'a', 'some_word$'] + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.2, 0.3, 0.4, 0.5], + [1, 1, 1, 1, 1]]) + ) + + assert v1.embedding.reserved_tokens == [''] + + assert_almost_equal(v1.embedding['c'].asnumpy(), + np.array([1, 1, 1, 1, 1]) + ) + + assert_almost_equal(v1.embedding[['c']].asnumpy(), + np.array([[1, 1, 1, 1, 1]]) + ) + + assert_almost_equal(v1.embedding[['a', 'not_exist']].asnumpy(), + np.array([[0.1, 0.2, 0.3, 0.4, 0.5], + [1, 1, 1, 1, 1]]) + ) + + assert_almost_equal(v1.embedding[['a', 'b']].asnumpy(), + np.array([[0.1, 0.2, 0.3, 0.4, 0.5], + [0.6, 0.7, 0.8, 0.9, 1]]) + ) + + assert_almost_equal(v1.embedding[['A', 'b']].asnumpy(), + np.array([[1, 1, 1, 1, 1], + [0.6, 0.7, 0.8, 0.9, 1]]) + 
) + + v1.embedding['a'] = nd.array([2, 2, 2, 2, 2]) + v1.embedding['b'] = nd.array([3, 3, 3, 3, 3]) + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + + assertRaises(ValueError, e1.__setitem__, 'unknown$$$', nd.array([0, 0, 0, 0, 0])) + + assertRaises(AssertionError, e1.__setitem__, '', + nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) + + assertRaises(AssertionError, e1.__setitem__, '', nd.array([0])) + + v1.embedding[''] = nd.array([0, 0, 0, 0, 0]) + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + v1.embedding[''] = nd.array([10, 10, 10, 10, 10]) + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[10, 10, 10, 10, 10], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [3, 3, 3, 3, 3], + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1]]) + ) + + +def test_composite_embedding_with_two_embeddings(): + embed_root = '.' + embed_name = 'my_embed' + elem_delim = '\t' + pretrain_file1 = 'my_pretrain_file1.txt' + pretrain_file2 = 'my_pretrain_file2.txt' + + _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file1) + _mk_my_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2) + + pretrain_file_path1 = os.path.join(embed_root, embed_name, pretrain_file1) + pretrain_file_path2 = os.path.join(embed_root, embed_name, pretrain_file2) + + my_embed1 = text.embedding.TokenEmbedding.from_file(pretrain_file_path1, elem_delim, + init_unknown_vec=nd.ones) + my_embed2 = text.embedding.TokenEmbedding.from_file(pretrain_file_path2, elem_delim) + + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) + + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v1.set_embedding(my_embed1, my_embed2) + + assert v1._token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} + assert v1._idx_to_token == ['', 'c', 'b', 'a', 'some_word$'] + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + assert v1.embedding.reserved_tokens is None + assert_almost_equal(v1.embedding['c'].asnumpy(), + np.array([1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1]) + ) + + assert_almost_equal(v1.embedding[['b', 'not_exist']].asnumpy(), + np.array([[0.6, 0.7, 0.8, 0.9, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + v1.embedding['a'] = nd.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) + v1.embedding['b'] = nd.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3]) + + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), + np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], + [3, 3, 3, 3, 3, 3, 3, 3, 3, 3], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) + ) + + # Test loaded unknown tokens + pretrain_file3 = 'my_pretrain_file3.txt' + pretrain_file4 = 'my_pretrain_file4.txt' + + _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file3) + _mk_my_pretrain_file4(os.path.join(embed_root, embed_name), elem_delim, pretrain_file4) + + pretrain_file_path3 = os.path.join(embed_root, embed_name, pretrain_file3) + pretrain_file_path4 = os.path.join(embed_root, 
embed_name, pretrain_file4) + + my_embed3 = text.embedding.TokenEmbedding.from_file(pretrain_file_path3, elem_delim, + init_unknown_vec=nd.ones, unknown_token='') + my_embed4 = text.embedding.TokenEmbedding.from_file(pretrain_file_path4, elem_delim, + unknown_token='') + + v2 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v2.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v2.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + v3 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v3.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + v4 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + v4.set_embedding(my_embed3, my_embed4) + assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [0.1, 0.2, 0.3, 0.4, 0.5, + 0.01, 0.02, 0.03, 0.04, 0.05], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + counter2 = Counter(['b', 'b', 'c', 'c', 'c', 'some_word$']) + + v5 = text.Vocabulary(counter2, max_size=None, min_freq=1, unknown_token='a', + reserved_tokens=None) + v5.set_embedding(my_embed3, my_embed4) + assert v5.embedding._token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3} + assert v5.embedding._idx_to_token == ['a', 'c', 'b', 'some_word$'] + assert_almost_equal(v5.embedding.idx_to_vec.asnumpy(), + np.array([[1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.06, 0.07, 0.08, 0.09, 0.1], + [0.6, 0.7, 0.8, 0.9, 1, + 0.11, 0.12, 0.13, 0.14, 0.15], + [1.1, 1.2, 1.3, 1.4, 1.5, + 0.11, 0.12, 0.13, 0.14, 0.15]]) + ) + + +def test_get_and_pretrain_file_names(): + assert len(text.embedding.get_pretrained_file_names( + embedding_name='fasttext')) == 327 + + assert len(text.embedding.get_pretrained_file_names(embedding_name='glove')) == 10 + + reg = text.embedding.get_pretrained_file_names(embedding_name=None) + + assert len(reg['glove']) == 10 + assert len(reg['fasttext']) == 327 + + assertRaises(KeyError, text.embedding.get_pretrained_file_names, 'unknown$$') + + +if __name__ == '__main__': + import nose + nose.runmodule() From 9c806f5d4c46cd8163f110910cbd2c9a0669c36b Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Sun, 11 Mar 2018 19:20:45 -0700 Subject: [PATCH 06/20] Code and test revised --- python/mxnet/text/embedding.py | 8 +- python/mxnet/text/vocab.py | 96 +++---- tests/python/unittest/test_text.py | 399 ++++++++++++++++------------- 3 files changed, 269 insertions(+), 234 deletions(-) diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py index d40554a97e06..12d46b5c18c4 100644 --- a/python/mxnet/text/embedding.py +++ 
b/python/mxnet/text/embedding.py @@ -175,7 +175,7 @@ def _get_download_file_name(cls, pretrained_file_name): def _get_pretrained_file_url(cls, pretrained_file_name): cls_name = cls.__name__.lower() - namespace = 'gluon/embedding/{}'.format(cls_name) + namespace = 'gluon/embeddings/{}'.format(cls_name) return _get_repo_file_url(namespace, cls._get_download_file_name(pretrained_file_name)) @classmethod @@ -291,6 +291,9 @@ def unknown_token(self): def reserved_tokens(self): return self._reserved_tokens + def __contains__(self, x): + return x in self._token_to_idx + def __getitem__(self, tokens): """Look up embedding vectors of tokens. Parameters @@ -316,9 +319,6 @@ def __getitem__(self, tokens): return vecs[0] if to_reduce else vecs - def __contains__(self, x): - return x in self._token_to_idx - def __setitem__(self, tokens, new_vectors): """Updates embedding vectors for tokens. Parameters diff --git a/python/mxnet/text/vocab.py b/python/mxnet/text/vocab.py index 6c3306300c12..d31e2640abbc 100644 --- a/python/mxnet/text/vocab.py +++ b/python/mxnet/text/vocab.py @@ -75,7 +75,7 @@ class Vocabulary(object): """ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None, embedding=None): + reserved_tokens=None, embeddings=None): # Sanity checks. assert min_freq > 0, '`min_freq` must be set to a positive value.' @@ -92,10 +92,10 @@ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token=' if counter is not None: self._index_counter_keys(counter, unknown_token, reserved_tokens, max_size, min_freq) - if embedding is None: + if embeddings is None: self._embedding = None else: - self.set_embedding(embedding) + self.set_embedding(embeddings) def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): """Indexes unknown and reserved tokens.""" @@ -137,30 +137,63 @@ def _index_counter_keys(self, counter, unknown_token, reserved_tokens, max_size, self._idx_to_token.append(token) self._token_to_idx[token] = len(self._idx_to_token) - 1 - def __len__(self): - return len(self._idx_to_token) - @property - def token_to_idx(self): - return self._token_to_idx + def embedding(self): + return self._embedding @property def idx_to_token(self): return self._idx_to_token - @property - def unknown_token(self): - return self._unknown_token - @property def reserved_tokens(self): return self._reserved_tokens @property - def embedding(self): - return self._embedding + def token_to_idx(self): + return self._token_to_idx + + @property + def unknown_token(self): + return self._unknown_token + + def __contains__(self, s): + """Check whether token exists in the vocabulary. + Parameters + ---------- + s : str + A token. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. + """ + + return s in self._token_to_idx + + def __getitem__(self, s): + """Converts token/tokens to indices according to the vocabulary. + Parameters + ---------- + s : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. 
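        A minimal lookup sketch, assuming a vocabulary built from
        Counter(['a', 'b', 'b', 'c', 'c', 'c']) with the default unknown token:

        >>> vocab['c']                  # known token -> its index
        1
        >>> vocab[['c', 'never_seen']]  # unknown tokens map to index 0
        [1, 0]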
+ """ + + if not isinstance(s, (list, tuple)): + return self._token_to_idx[s] if s in self._token_to_idx \ + else C.UNKNOWN_IDX + else: + return [self._token_to_idx[token] if token in self._token_to_idx + else C.UNKNOWN_IDX for token in s] + + def __len__(self): + return len(self._idx_to_token) - def set_embedding(self, *embeddings): + def set_embedding(self, embeddings): if not isinstance(embeddings, (list, tuple)): embeddings = [embeddings] @@ -219,36 +252,3 @@ def to_tokens(self, indices): tokens.append(self._idx_to_token[idx]) return tokens[0] if to_reduce else tokens - - def __getitem__(self, s): - """Converts token/tokens to indices according to the vocabulary. - Parameters - ---------- - s : str or list of strs - A source token or tokens to be converted. - Returns - ------- - int or list of ints - A token index or a list of token indices according to the vocabulary. - """ - - if not isinstance(s, (list, tuple)): - return self._token_to_idx[s] if s in self._token_to_idx \ - else C.UNKNOWN_IDX - else: - return [self._token_to_idx[token] if token in self._token_to_idx - else C.UNKNOWN_IDX for token in s] - - def __contains__(self, s): - """Check whether token exists in the vocabulary. - Parameters - ---------- - s : str - A token. - Returns - ------- - int or list of ints - A token index or a list of token indices according to the vocabulary. - """ - - return s in self._token_to_idx diff --git a/tests/python/unittest/test_text.py b/tests/python/unittest/test_text.py index 64fc311d4961..f4adff75ee9b 100644 --- a/tests/python/unittest/test_text.py +++ b/tests/python/unittest/test_text.py @@ -71,7 +71,7 @@ def test_count_tokens_from_str(): _test_count_tokens_from_str_with_delims('IS', 'LIFE') -def test_tokens_to_indices(): +def test_vocabulary_getitem(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) vocab = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', @@ -90,7 +90,7 @@ def test_tokens_to_indices(): assert i4 == [3, 0, 3, 2] -def test_indices_to_tokens(): +def test_vocabulary_to_tokens(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) vocab = text.Vocabulary(counter, max_size=None, min_freq=1, @@ -107,31 +107,169 @@ def test_indices_to_tokens(): i4 = vocab.to_tokens([3, 0, 3, 2]) assert i4 == ['a', '', 'a', 'b'] - assertRaises(ValueError, vocab.to_tokens, 100) + assertRaises(ValueError, vocab.to_tokens, 5) + assertRaises(ValueError, vocab.to_tokens, [5, 6]) -def test_download_embed(): - @text.embedding.register - class Test(text.TokenEmbedding): - # 33 bytes. 
- pretrained_file_name_sha1 = \ - {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} - namespace = 'test' +def test_vocabulary(): + counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - def __init__(self, embedding_root='embeddings', init_unknown_vec=nd.zeros, **kwargs): - pretrained_file_name = 'embedding_test.vec' - Test._check_pretrained_file_names(pretrained_file_name) + v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v1) == 5 + assert v1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} + assert v1.idx_to_token[1] == 'c' + assert v1.unknown_token == '' + assert v1.reserved_tokens is None + assert v1.embedding is None + assert 'a' in v1 + assert v1.unknown_token in v1 - super(Test, self).__init__(**kwargs) + v2 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v2) == 3 + assert v2.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v2.idx_to_token[1] == 'c' + assert v2.unknown_token == '' + assert v2.reserved_tokens is None + assert v2.embedding is None + assert 'a' not in v2 + assert v2.unknown_token in v2 - pretrained_file_path = Test._get_pretrained_file(embedding_root, pretrained_file_name) + v3 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=None) + assert len(v3) == 1 + assert v3.token_to_idx == {'': 0} + assert v3.idx_to_token[0] == '' + assert v3.unknown_token == '' + assert v3.reserved_tokens is None + assert v3.embedding is None + assert 'a' not in v3 - self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + v4 = text.Vocabulary(counter, max_size=2, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v4) == 3 + assert v4.token_to_idx == {'': 0, 'c': 1, 'b': 2} + assert v4.idx_to_token[1] == 'c' + assert v4.unknown_token == '' + assert v4.reserved_tokens is None + assert v4.embedding is None + assert 'a' not in v4 - test_embed = text.embedding.create('test') - assert_almost_equal(test_embed['hello'].asnumpy(), (nd.arange(5) + 1).asnumpy()) - assert_almost_equal(test_embed['world'].asnumpy(), (nd.arange(5) + 6).asnumpy()) - assert_almost_equal(test_embed[''].asnumpy(), nd.zeros((5,)).asnumpy()) + v5 = text.Vocabulary(counter, max_size=3, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v5) == 4 + assert v5.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3} + assert v5.idx_to_token[1] == 'c' + assert v5.unknown_token == '' + assert v5.reserved_tokens is None + assert v5.embedding is None + assert 'a' in v5 + + v6 = text.Vocabulary(counter, max_size=100, min_freq=1, unknown_token='', + reserved_tokens=None) + assert len(v6) == 5 + assert v6.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, + 'some_word$': 4} + assert v6.idx_to_token[1] == 'c' + assert v6.unknown_token == '' + assert v6.reserved_tokens is None + assert v6.embedding is None + assert 'a' in v6 + + v7 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=None) + assert len(v7) == 2 + assert v7.token_to_idx == {'': 0, 'c': 1} + assert v7.idx_to_token[1] == 'c' + assert v7.unknown_token == '' + assert v7.reserved_tokens is None + assert v7.embedding is None + assert 'a' not in v7 + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=0, unknown_token='', reserved_tokens=['b']) + + assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', 'b']) + + 
assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, + min_freq=1, unknown_token='', reserved_tokens=['b', '']) + + v8 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', + reserved_tokens=['b']) + assert len(v8) == 5 + assert v8.token_to_idx == {'': 0, 'b': 1, 'c': 2, 'a': 3, 'some_word$': 4} + assert v8.idx_to_token[1] == 'b' + assert v8.unknown_token == '' + assert v8.reserved_tokens == ['b'] + assert v8.embedding is None + assert 'a' in v8 + + v9 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', + reserved_tokens=['b', 'a']) + assert len(v9) == 4 + assert v9.token_to_idx == {'': 0, 'b': 1, 'a': 2, 'c': 3} + assert v9.idx_to_token[1] == 'b' + assert v9.unknown_token == '' + assert v9.reserved_tokens == ['b', 'a'] + assert v9.embedding is None + assert 'a' in v9 + + v10 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', + reserved_tokens=['b', 'c']) + assert len(v10) == 3 + assert v10.token_to_idx == {'': 0, 'b': 1, 'c': 2} + assert v10.idx_to_token[1] == 'b' + assert v10.unknown_token == '' + assert v10.reserved_tokens == ['b', 'c'] + assert v10.embedding is None + assert 'a' not in v10 + + v11 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', + reserved_tokens=['', 'b']) + assert len(v11) == 4 + assert v11.token_to_idx == {'': 0, '': 1, 'b': 2, 'c': 3} + assert v11.idx_to_token[1] == '' + assert v11.unknown_token == '' + assert v11.reserved_tokens == ['', 'b'] + assert v11.embedding is None + assert 'a' not in v11 + + v12 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='b', + reserved_tokens=['']) + assert len(v12) == 3 + assert v12.token_to_idx == {'b': 0, '': 1, 'c': 2} + assert v12.idx_to_token[1] == '' + assert v12.unknown_token == 'b' + assert v12.reserved_tokens == [''] + assert v12.embedding is None + assert 'a' not in v12 + + v13 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='a', + reserved_tokens=['']) + assert len(v13) == 4 + assert v13.token_to_idx == {'a': 0, '': 1, 'c': 2, 'b': 3} + assert v13.idx_to_token[1] == '' + assert v13.unknown_token == 'a' + assert v13.reserved_tokens == [''] + assert v13.embedding is None + assert 'a' in v13 + + counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), ('c', 'c'), ('c', 'c'), ('c', 'c'), + ('some_word$', 'some_word$')]) + + v14 = text.Vocabulary(counter_tuple, max_size=None, min_freq=1, + unknown_token=('', ''), reserved_tokens=None) + assert len(v14) == 5 + assert v14.token_to_idx == {('', ''): 0, ('c', 'c'): 1, ('b', 'b'): 2, ('a', 'a'): 3, + ('some_word$', 'some_word$'): 4} + assert v14.idx_to_token[1] == ('c', 'c') + assert v14.unknown_token == ('', '') + assert v14.reserved_tokens is None + assert v14.embedding is None + assert ('a', 'a') in v14 + assert ('', '') in v14 def _mk_my_pretrain_file(path, token_delim, pretrain_file): @@ -205,7 +343,7 @@ def _mk_my_invalid_pretrain_file2(path, token_delim, pretrain_file): fout.write(seqs) -def test_custom_embed(): +def test_token_embedding_from_file(): embed_root = 'embedding' embed_name = 'my_embed' elem_delim = '\t' @@ -217,19 +355,31 @@ def test_custom_embed(): my_embed = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim) - assert len(my_embed._idx_to_token) == 3 - assert my_embed._token_to_idx['a'] == 1 - assert my_embed._idx_to_token[1] == 'a' + assert 'a' in my_embed + assert my_embed.unknown_token == '' + assert my_embed.reserved_tokens is None + assert my_embed.unknown_token in my_embed first_vec = 
my_embed.idx_to_vec[0] assert_almost_equal(first_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) + # Test __getitem__. unk_vec = my_embed['A'] assert_almost_equal(unk_vec.asnumpy(), np.array([0, 0, 0, 0, 0])) a_vec = my_embed['a'] assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5])) + # Test __setitem__. + my_embed['a'] = nd.array([1, 2, 3, 4, 5]) + assert_almost_equal(my_embed['a'].asnumpy(), np.array([1, 2, 3, 4, 5])) + assertRaises(ValueError, my_embed.__setitem__, 'unknown$$$', nd.array([0, 0, 0, 0, 0])) + + assertRaises(AssertionError, my_embed.__setitem__, '', + nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) + + assertRaises(AssertionError, my_embed.__setitem__, '', nd.array([0])) + unk_vecs = my_embed['', ''] assert_almost_equal(unk_vecs.asnumpy(), np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) @@ -238,14 +388,16 @@ def test_custom_embed(): _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2) pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2) my_embed2 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, - init_unknown_vec=nd.ones, unknown_token='') + init_unknown_vec=nd.ones, + unknown_token='') unk_vec2 = my_embed2[''] assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) unk_vec2 = my_embed2[''] assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1])) my_embed3 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, - init_unknown_vec=nd.ones, unknown_token='') + init_unknown_vec=nd.ones, + unknown_token='') unk_vec3 = my_embed3[''] assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5])) unk_vec3 = my_embed3[''] @@ -256,146 +408,30 @@ def test_custom_embed(): _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, invalid_pretrain_file) pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file) - assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, elem_delim) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, + elem_delim) invalid_pretrain_file2 = 'invalid_pretrain_file2.txt' _mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, invalid_pretrain_file2) pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file2) - assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, elem_delim) + assertRaises(AssertionError, text.embedding.TokenEmbedding.from_file, pretrain_file_path, + elem_delim) -def test_vocabulary(): - counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - - v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None) - assert len(v1) == 5 - assert v1.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} - assert v1.idx_to_token[1] == 'c' - assert v1.unknown_token == '' - assert v1.reserved_tokens is None - - v2 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', - reserved_tokens=None) - assert len(v2) == 3 - assert v2.token_to_idx == {'': 0, 'c': 1, 'b': 2} - assert v2.idx_to_token[1] == 'c' - assert v2.unknown_token == '' - assert v2.reserved_tokens is None - - v3 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', - reserved_tokens=None) - assert len(v3) == 1 - assert v3.token_to_idx == {'': 0} - assert v3.idx_to_token[0] == '' - assert v3.unknown_token == '' - assert v3.reserved_tokens is None - - v4 = 
text.Vocabulary(counter, max_size=2, min_freq=1, unknown_token='', - reserved_tokens=None) - assert len(v4) == 3 - assert v4.token_to_idx == {'': 0, 'c': 1, 'b': 2} - assert v4.idx_to_token[1] == 'c' - assert v4.unknown_token == '' - assert v4.reserved_tokens is None - - v5 = text.Vocabulary(counter, max_size=3, min_freq=1, unknown_token='', - reserved_tokens=None) - assert len(v5) == 4 - assert v5.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3} - assert v5.idx_to_token[1] == 'c' - assert v5.unknown_token == '' - assert v5.reserved_tokens is None - - v6 = text.Vocabulary(counter, max_size=100, min_freq=1, unknown_token='', - reserved_tokens=None) - assert len(v6) == 5 - assert v6.token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, - 'some_word$': 4} - assert v6.idx_to_token[1] == 'c' - assert v6.unknown_token == '' - assert v6.reserved_tokens is None - - v7 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', - reserved_tokens=None) - assert len(v7) == 2 - assert v7.token_to_idx == {'': 0, 'c': 1} - assert v7.idx_to_token[1] == 'c' - assert v7.unknown_token == '' - assert v7.reserved_tokens is None - - assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, - min_freq=0, unknown_token='', reserved_tokens=['b']) - - assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, - min_freq=1, unknown_token='', reserved_tokens=['b', 'b']) - - assertRaises(AssertionError, text.Vocabulary, counter, max_size=None, - min_freq=1, unknown_token='', reserved_tokens=['b', '']) - - v8 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=['b']) - assert len(v8) == 5 - assert v8.token_to_idx == {'': 0, 'b': 1, 'c': 2, 'a': 3, 'some_word$': 4} - assert v8.idx_to_token[1] == 'b' - assert v8.unknown_token == '' - assert v8.reserved_tokens == ['b'] - - v9 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='', - reserved_tokens=['b', 'a']) - assert len(v9) == 4 - assert v9.token_to_idx == {'': 0, 'b': 1, 'a': 2, 'c': 3} - assert v9.idx_to_token[1] == 'b' - assert v9.unknown_token == '' - assert v9.reserved_tokens == ['b', 'a'] - - v10 = text.Vocabulary(counter, max_size=None, min_freq=100, unknown_token='', - reserved_tokens=['b', 'c']) - assert len(v10) == 3 - assert v10.token_to_idx == {'': 0, 'b': 1, 'c': 2} - assert v10.idx_to_token[1] == 'b' - assert v10.unknown_token == '' - assert v10.reserved_tokens == ['b', 'c'] - - v11 = text.Vocabulary(counter, max_size=1, min_freq=2, unknown_token='', - reserved_tokens=['', 'b']) - assert len(v11) == 4 - assert v11.token_to_idx == {'': 0, '': 1, 'b': 2, 'c': 3} - assert v11.idx_to_token[1] == '' - assert v11.unknown_token == '' - assert v11.reserved_tokens == ['', 'b'] - - v12 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='b', - reserved_tokens=['']) - assert len(v12) == 3 - assert v12.token_to_idx == {'b': 0, '': 1, 'c': 2} - assert v12.idx_to_token[1] == '' - assert v12.unknown_token == 'b' - assert v12.reserved_tokens == [''] +def test_embedding_get_and_pretrain_file_names(): + assert len(text.embedding.get_pretrained_file_names(embedding_name='fasttext')) == 327 + assert len(text.embedding.get_pretrained_file_names(embedding_name='glove')) == 10 - v13 = text.Vocabulary(counter, max_size=None, min_freq=2, unknown_token='a', - reserved_tokens=['']) - assert len(v13) == 4 - assert v13.token_to_idx == {'a': 0, '': 1, 'c': 2, 'b': 3} - assert v13.idx_to_token[1] == '' - assert v13.unknown_token == 'a' - assert v13.reserved_tokens == [''] + reg = 
text.embedding.get_pretrained_file_names(embedding_name=None) - counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), ('c', 'c'), ('c', 'c'), ('c', 'c'), - ('some_word$', 'some_word$')]) + assert len(reg['glove']) == 10 + assert len(reg['fasttext']) == 327 - v14 = text.Vocabulary(counter_tuple, max_size=None, min_freq=1, - unknown_token=('', ''), reserved_tokens=None) - assert len(v14) == 5 - assert v14.token_to_idx == {('', ''): 0, ('c', 'c'): 1, ('b', 'b'): 2, ('a', 'a'): 3, - ('some_word$', 'some_word$'): 4} - assert v14.idx_to_token[1] == ('c', 'c') - assert v14.unknown_token == ('', '') - assert v14.reserved_tokens is None + assertRaises(KeyError, text.embedding.get_pretrained_file_names, 'unknown$$') -def test_custom_embedding_with_vocabulary(): +def test_vocab_set_embedding_with_one_custom_embedding(): embed_root = 'embedding' embed_name = 'my_embed' elem_delim = '\t' @@ -413,10 +449,9 @@ def test_custom_embedding_with_vocabulary(): e1 = text.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim, init_unknown_vec=nd.ones) + assert v1.embedding is None v1.set_embedding(e1) - - assert v1.embedding._token_to_idx == {'': 0, '': 1, 'c': 2, 'b': 3, 'a': 4, 'some_word$': 5} - assert v1.embedding._idx_to_token == ['', '', 'c', 'b', 'a', 'some_word$'] + assert v1.embedding is not None assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), np.array([[1, 1, 1, 1, 1], @@ -464,13 +499,6 @@ def test_custom_embedding_with_vocabulary(): [1, 1, 1, 1, 1]]) ) - assertRaises(ValueError, e1.__setitem__, 'unknown$$$', nd.array([0, 0, 0, 0, 0])) - - assertRaises(AssertionError, e1.__setitem__, '', - nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])) - - assertRaises(AssertionError, e1.__setitem__, '', nd.array([0])) - v1.embedding[''] = nd.array([0, 0, 0, 0, 0]) assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), np.array([[0, 0, 0, 0, 0], @@ -491,7 +519,7 @@ def test_custom_embedding_with_vocabulary(): ) -def test_composite_embedding_with_two_embeddings(): +def test_vocabulary_with_two_custom_embeddings(): embed_root = '.' 
embed_name = 'my_embed' elem_delim = '\t' @@ -511,11 +539,8 @@ def test_composite_embedding_with_two_embeddings(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None) - v1.set_embedding(my_embed1, my_embed2) - - assert v1._token_to_idx == {'': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4} - assert v1._idx_to_token == ['', 'c', 'b', 'a', 'some_word$'] + reserved_tokens=None, embeddings=[my_embed1, my_embed2]) + assert v1.embedding is not None assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], @@ -564,7 +589,7 @@ def test_composite_embedding_with_two_embeddings(): v2 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v2.set_embedding(my_embed3, my_embed4) + v2.set_embedding([my_embed3, my_embed4]) assert_almost_equal(v2.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -580,7 +605,7 @@ def test_composite_embedding_with_two_embeddings(): v3 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v3.set_embedding(my_embed3, my_embed4) + v3.set_embedding([my_embed3, my_embed4]) assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -596,7 +621,7 @@ def test_composite_embedding_with_two_embeddings(): v4 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v4.set_embedding(my_embed3, my_embed4) + v4.set_embedding([my_embed3, my_embed4]) assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -614,7 +639,7 @@ def test_composite_embedding_with_two_embeddings(): v5 = text.Vocabulary(counter2, max_size=None, min_freq=1, unknown_token='a', reserved_tokens=None) - v5.set_embedding(my_embed3, my_embed4) + v5.set_embedding([my_embed3, my_embed4]) assert v5.embedding._token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3} assert v5.embedding._idx_to_token == ['a', 'c', 'b', 'some_word$'] assert_almost_equal(v5.embedding.idx_to_vec.asnumpy(), @@ -629,18 +654,28 @@ def test_composite_embedding_with_two_embeddings(): ) -def test_get_and_pretrain_file_names(): - assert len(text.embedding.get_pretrained_file_names( - embedding_name='fasttext')) == 327 +def test_download_embed(): + @text.embedding.register + class Test(text.TokenEmbedding): + # 33 bytes. 
+ pretrained_file_name_sha1 = \ + {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} + namespace = 'test' - assert len(text.embedding.get_pretrained_file_names(embedding_name='glove')) == 10 + def __init__(self, embedding_root='embedding', init_unknown_vec=nd.zeros, **kwargs): + pretrained_file_name = 'embedding_test.vec' + Test._check_pretrained_file_names(pretrained_file_name) - reg = text.embedding.get_pretrained_file_names(embedding_name=None) + super(Test, self).__init__(**kwargs) - assert len(reg['glove']) == 10 - assert len(reg['fasttext']) == 327 + pretrained_file_path = Test._get_pretrained_file(embedding_root, pretrained_file_name) - assertRaises(KeyError, text.embedding.get_pretrained_file_names, 'unknown$$') + self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + + test_embed = text.embedding.create('test') + assert_almost_equal(test_embed['hello'].asnumpy(), (nd.arange(5) + 1).asnumpy()) + assert_almost_equal(test_embed['world'].asnumpy(), (nd.arange(5) + 6).asnumpy()) + assert_almost_equal(test_embed[''].asnumpy(), nd.zeros((5,)).asnumpy()) if __name__ == '__main__': From 5797aabd5dc134d646fddb1f35c46952da2e4dd8 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 09:14:08 -0700 Subject: [PATCH 07/20] api implementation done --- python/mxnet/text/_constants.py | 2 +- python/mxnet/text/embedding.py | 257 ++++++++++++++++------------- python/mxnet/text/utils.py | 6 +- python/mxnet/text/vocab.py | 74 +++++---- tests/python/unittest/test_text.py | 38 ++--- 5 files changed, 208 insertions(+), 169 deletions(-) diff --git a/python/mxnet/text/_constants.py b/python/mxnet/text/_constants.py index 74530e05e779..3457a294e5da 100644 --- a/python/mxnet/text/_constants.py +++ b/python/mxnet/text/_constants.py @@ -17,7 +17,7 @@ # coding: utf-8 -"""Read text files and load embedding.""" +"""Constants.""" from __future__ import absolute_import from __future__ import print_function diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py index 12d46b5c18c4..482dbe3c2c5b 100644 --- a/python/mxnet/text/embedding.py +++ b/python/mxnet/text/embedding.py @@ -38,8 +38,12 @@ def register(embedding_cls): """Registers a new token embedding. + + Once an embedding is registered, we can create an instance of this embedding with - :func:`~mxnet.contrib.text.embedding.create`. + :func:`~mxnet.text.embedding.create`. + + Examples -------- >>> @mxnet.contrib.text.embedding.register @@ -57,17 +61,22 @@ def register(embedding_cls): def create(embedding_name, **kwargs): """Creates an instance of token embedding. + + Creates a token embedding instance by loading embedding vectors from an externally hosted pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid - `embedding_name` and `pretrained_file_name`, use - `mxnet.contrib.text.embedding.get_pretrained_file_names()`. + `embedding_name` and `file_name`, use `mxnet.text.embedding.get_file_names()`. + + Parameters ---------- embedding_name : str The token embedding name (case-insensitive). + + Returns ------- - An instance of `mxnet.contrib.text.glossary.TokenEmbedding`: + An instance of `mxnet.text.embedding.TokenEmbedding`: A token embedding instance that loads embedding vectors from an externally hosted pre-trained token embedding file. 
""" @@ -76,27 +85,31 @@ def create(embedding_name, **kwargs): return create_text_embedding(embedding_name, **kwargs) -def get_pretrained_file_names(embedding_name=None): +def get_file_names(embedding_name=None): """Get valid token embedding names and their pre-trained file names. + + To load token embedding vectors from an externally hosted pre-trained token embedding file, - such as those of GloVe and FastText, one should use - `mxnet.contrib.text.embedding.create(embedding_name, pretrained_file_name)`. - This method returns all the valid names of `pretrained_file_name` for the specified + such as those of GloVe and FastText, one should use `mxnet.text.embedding.create(embedding_name, + file_name)`. This method returns all the valid names of `file_name` for the specified `embedding_name`. If `embedding_name` is set to None, this method returns all the valid - names of `embedding_name` with their associated `pretrained_file_name`. + names of `embedding_name` with their associated `file_name`. + + Parameters ---------- embedding_name : str or None, default None The pre-trained token embedding name. + + Returns ------- dict or list: - A list of all the valid pre-trained token embedding file names (`pretrained_file_name`) + A list of all the valid pre-trained token embedding file names (`file_name`) for the specified token embedding name (`embedding_name`). If the text embeding name is set to None, returns a dict mapping each valid token embedding name to a list of valid - pre-trained files (`pretrained_file_name`). They can be plugged into - `mxnet.contrib.text.embedding.create(embedding_name, - pretrained_file_name)`. + pre-trained files (`file_name`). They can be plugged into + `mxnet.text.embedding.create(embedding_name, file_name)`. """ text_embedding_reg = registry.get_registry(TokenEmbedding) @@ -104,9 +117,8 @@ def get_pretrained_file_names(embedding_name=None): if embedding_name is not None: if embedding_name not in text_embedding_reg: raise KeyError('Cannot find `embedding_name` %s. Use ' - '`get_pretrained_file_names(' - 'embedding_name=None).keys()` to get all the valid embedding ' - 'names.' % embedding_name) + '`get_file_names(embedding_name=None).keys()` to get all the valid' + 'embedding names.' % embedding_name) return list(text_embedding_reg[embedding_name].pretrained_file_name_sha1.keys()) else: return {embedding_name: list(embedding_cls.pretrained_file_name_sha1.keys()) @@ -118,52 +130,42 @@ class TokenEmbedding(object): To load token embedding from an externally hosted pre-trained token embedding file, such as - those of GloVe and FastText, use - :func:`~mxnet.contrib.text.embedding.create(embedding_name, pretrained_file_name)`. - To get all the available `embedding_name` and `pretrained_file_name`, use - :func:`~mxnet.contrib.text.embedding.get_pretrained_file_names()`. + those of GloVe and FastText, use :func:`~mxnet.text.embedding.create(embedding_name, + file_name)`. To get all the available `embedding_name` and `file_name`, use + :func:`~mxnet.text.embedding.get_file_names()`. + Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use - :class:`~mxnet.contrib.text.embedding.CustomEmbedding`. + :func:`~mxnet.text.embedding.from_file()`. 
+ For every unknown token, if its representation `self.unknown_token` is encountered in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the token embedding vector initialized by `init_unknown_vec`. + If a token is encountered multiple times in the pre-trained token embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. + + Parameters ---------- unknown_token : hashable object, default '' The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None, default None - A list of reserved tokens that will always be indexed. + + Properties ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices are aligned. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each token's index to an - embedding vector. The largest valid index maps to the initialized embedding vector for every - reserved token, such as an unknown_token token and a padding token. """ - def __init__(self, unknown_token='', reserved_tokens=None): + + def __init__(self, unknown_token=''): self._unknown_token = unknown_token - # Thus, constants.UNKNOWN_IDX must be 0. self._idx_to_token = [unknown_token] - - if reserved_tokens is None: - self._reserved_tokens = None - else: - self._reserved_tokens = reserved_tokens[:] - self._idx_to_token.extend(reserved_tokens) - self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)} self._idx_to_vec = None @@ -211,11 +213,14 @@ def _get_pretrained_file(cls, embedding_root, pretrained_file_name): return pretrained_file_path def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, encoding='utf8'): - """Load embedding vectors from the pre-trained token embedding file. + """Load embedding vectors from a pre-trained token embedding file. + + For every unknown token, if its representation `self.unknown_token` is encountered in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text embedding vector initialized by `init_unknown_vec`. + If a token is encountered multiple times in the pre-trained text embedding file, only the first-encountered token embedding vector will be loaded and the rest will be skipped. 
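        As a sketch of the unknown-token fallback described above, assuming
        'my_vectors.txt' is a whitespace-delimited embedding file that does not
        contain the unknown token itself (the file name is illustrative):

        >>> from mxnet import ndarray as nd, text
        >>> embed = text.embedding.TokenEmbedding.from_file('my_vectors.txt',
        ...                                                 init_unknown_vec=nd.ones)
        >>> embed[embed.unknown_token]  # the all-ones vector from `init_unknown_vec`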
""" @@ -223,8 +228,8 @@ def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, en pretrained_file_path = os.path.expanduser(pretrained_file_path) if not os.path.isfile(pretrained_file_path): - raise ValueError('`pretrained_file_path` must be a valid path to ' - 'the pre-trained token embedding file.') + raise ValueError('`pretrained_file_path` must be a valid path to the pre-trained ' + 'token embedding file.') logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path) vec_len = None @@ -237,7 +242,7 @@ def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, en line_num += 1 elems = line.rstrip().split(elem_delim) - assert len(elems) > 1, 'At line %d of the pre-trained text embedding file: the ' \ + assert len(elems) > 1, 'At line %d of the pre-trained token embedding file: the ' \ 'data format of the pre-trained token embedding file %s ' \ 'is unexpected.' % (line_num, pretrained_file_path) @@ -252,14 +257,14 @@ def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, en 'embedding for the same token is seen and skipped.' % (line_num, token)) elif len(elems) == 1: - warnings.warn('At line %d of the pre-trained text embedding file: token %s ' + warnings.warn('At line %d of the pre-trained token embedding file: token %s ' 'with 1-dimensional vector %s is likely a header and is ' 'skipped.' % (line_num, token, elems)) else: if vec_len is None: vec_len = len(elems) # Reserve a vector slot for the unknown token at the very beggining because - # the unknown index is 0. + # the unknown token index is 0. all_elems.extend([0] * vec_len) else: assert len(elems) == vec_len, \ @@ -287,19 +292,19 @@ def idx_to_vec(self): def unknown_token(self): return self._unknown_token - @property - def reserved_tokens(self): - return self._reserved_tokens - def __contains__(self, x): return x in self._token_to_idx def __getitem__(self, tokens): - """Look up embedding vectors of tokens. + """Looks up embedding vectors of text tokens. + + Parameters ---------- tokens : str or list of strs A token or a list of tokens. + + Returns ------- mxnet.ndarray.NDArray: @@ -321,6 +326,8 @@ def __getitem__(self, tokens): def __setitem__(self, tokens, new_vectors): """Updates embedding vectors for tokens. + + Parameters ---------- tokens : str or a list of strs @@ -362,36 +369,43 @@ def __setitem__(self, tokens, new_vectors): self._idx_to_vec[nd.array(indices)] = new_vectors @classmethod - def _check_pretrained_file_names(cls, pretrained_file_name): + def _check_pretrained_file_names(cls, file_name): """Checks if a pre-trained token embedding file name is valid. + + Parameters ---------- - pretrained_file_name : str + file_name : str The pre-trained token embedding file. """ embedding_name = cls.__name__.lower() - if pretrained_file_name not in cls.pretrained_file_name_sha1: - raise KeyError('Cannot find pretrained file %s for token embedding %s. Valid ' - 'pretrained files for embedding %s: %s' % - (pretrained_file_name, embedding_name, embedding_name, + if file_name not in cls.pretrained_file_name_sha1: + raise KeyError('Cannot find pre-trained file %s for token embedding %s. Valid ' + 'pre-trained file names for embedding %s: %s' % + (file_name, embedding_name, embedding_name, ', '.join(cls.pretrained_file_name_sha1.keys()))) @staticmethod - def from_file(pretrained_file_path, elem_delim=' ', encoding='utf8', - init_unknown_vec=nd.zeros, **kwargs): - """Load user-defined token embedding. 
- This is to load embedding vectors from a user-defined pre-trained text embedding file. - Denote by '[ed]' the argument `elem_delim`. Denote by [v_ij] the j-th element of the token - embedding vector for [token_i], the expected format of a custom pre-trained token embedding file - is: - '[token_1][ed][v_11][ed][v_12][ed]...[ed][v_1k]\\\\n[token_2][ed][v_21][ed][v_22][ed]...[ed] - [v_2k]\\\\n...' + def from_file(file_path, elem_delim=' ', encoding='utf8', init_unknown_vec=nd.zeros, **kwargs): + """Creates a user-defined token embedding from a pre-trained embedding file. + + + This is to load embedding vectors from a user-defined pre-trained token embedding file. + Denote by '(ed)' the argument `elem_delim`. Denote by (v_ij) the j-th element of the token + embedding vector for (token_i), the expected format of a custom pre-trained token embedding + file is: + + '(token_1)(ed))v_11)(ed)(v_12)(ed)...(ed)(v_1k)\\\\n + (token_2)(ed)(v_21)(ed)(v_22)(ed)...(ed)(v_2k)\\\\n...' + where k is the length of the embedding vector `vec_len`. + + Parameters ---------- - pretrained_file_path : str - The path to the custom pre-trained token embedding file. + file_path : str + The path to the user-defined pre-trained token embedding file. elem_delim : str, default ' ' The delimiter for splitting a token and every embedding vector element value on the same line of the custom pre-trained token embedding file. @@ -399,9 +413,15 @@ def from_file(pretrained_file_path, elem_delim=' ', encoding='utf8', The encoding scheme for reading the custom pre-trained token embedding file. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. + + + Returns + ------- + instance of `~mxnet.text.embedding.TokenEmbedding` + The user-defined token embedding instance. """ embedding = TokenEmbedding(**kwargs) - embedding._load_embedding(pretrained_file_path, elem_delim, init_unknown_vec, encoding) + embedding._load_embedding(file_path, elem_delim, init_unknown_vec, encoding) return embedding @@ -409,48 +429,49 @@ def from_file(pretrained_file_path, elem_delim=' ', encoding='utf8', @register class GloVe(TokenEmbedding): """The GloVe word embedding. + + GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. (Source from https://nlp.stanford.edu/projects/glove/) + Reference: + GloVe: Global Vectors for Word Representation. Jeffrey Pennington, Richard Socher, and Christopher D. Manning. https://nlp.stanford.edu/pubs/glove.pdf + Website: + https://nlp.stanford.edu/projects/glove/ + To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://nlp.stanford.edu/projects/glove/ + License for pre-trained embedding: - https://opendatacommons.org/licenses/pddl/ + + https://opendatacommons.org/licenses/pddl/ + + Parameters ---------- - pretrained_file_name : str, default 'glove.840B.300d.txt' + file_name : str, default 'glove.6B.50d.txt' The name of the pre-trained token embedding file. embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. - vocabulary : :class:`~mxnet.contrib.text.vocab.Vocabulary`, default None - It contains the tokens to index. 
Each indexed token will be associated with the loaded - embedding vectors, such as loaded from a pre-trained token embedding file. If None, all the - tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding - file, will be indexed. + + Properties ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices are aligned. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each token's index to an - embedding vector. The largest valid index maps to the initialized embedding vector for every - reserved token, such as an unknown_token token and a padding token. """ # Map a pre-trained token embedding archive file and its SHA-1 hash. @@ -460,20 +481,20 @@ class GloVe(TokenEmbedding): pretrained_file_name_sha1 = C.GLOVE_PRETRAINED_ARCHIVE_SHA1 @classmethod - def _get_download_file_name(cls, pretrained_file_name): + def _get_download_file_name(cls, file_name): # Map a pre-trained embedding file to its archive to download. src_archive = {archive.split('.')[1]: archive for archive in GloVe.pretrained_archive_name_sha1.keys()} - archive = src_archive[pretrained_file_name.split('.')[1]] + archive = src_archive[file_name.split('.')[1]] return archive - def __init__(self, pretrained_file_name='glove.840B.300d.txt', + def __init__(self, file_name='glove.6B.50d.txt', embedding_root=os.path.join('~', '.mxnet', 'embedding'), init_unknown_vec=nd.zeros, **kwargs): - GloVe._check_pretrained_file_names(pretrained_file_name) + GloVe._check_pretrained_file_names(file_name) super(GloVe, self).__init__(**kwargs) - pretrained_file_path = GloVe._get_pretrained_file(embedding_root, pretrained_file_name) + pretrained_file_path = GloVe._get_pretrained_file(embedding_root, file_name) self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) @@ -481,54 +502,62 @@ def __init__(self, pretrained_file_name='glove.840B.300d.txt', @register class FastText(TokenEmbedding): """The fastText word embedding. + + FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/) + + References: + Enriching Word Vectors with Subword Information. Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. https://arxiv.org/abs/1607.04606 + Bag of Tricks for Efficient Text Classification. Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov. https://arxiv.org/abs/1607.01759 + FastText.zip: Compressing text classification models. - Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, - and Tomas Mikolov. + Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, and Tomas Mikolov. 
https://arxiv.org/abs/1612.03651 + For 'wiki.multi' embedding: Word Translation Without Parallel Data - Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, - and Herve Jegou. + Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou. https://arxiv.org/abs/1710.04087 + Website: + https://fasttext.cc/ + To get the updated URLs to the externally hosted pre-trained token embedding files, visit https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md + License for pre-trained embedding: - https://creativecommons.org/licenses/by-sa/3.0/ + + https://creativecommons.org/licenses/by-sa/3.0/ + + Parameters ---------- - pretrained_file_name : str, default 'wiki.en.vec' + file_name : str, default 'glove.6B.50d.txt' The name of the pre-trained token embedding file. embedding_root : str, default os.path.join('~', '.mxnet', 'embedding') The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. + + Properties ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. - idx_to_token : list of strs - A list of indexed tokens where the list indices and the token indices are aligned. + idx_to_vec : mxnet.ndarray.NDArray + For all the indexed tokens in this embedding, this NDArray maps each token's index to an + embedding vector. unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. - idx_to_vec : mxnet.ndarray.NDArray - For all the indexed tokens in this embedding, this NDArray maps each token's index to an - embedding vector. The largest valid index maps to the initialized embedding vector for every - reserved token, such as an unknown_token token and a padding token. """ # Map a pre-trained token embedding archive file and its SHA-1 hash. @@ -538,16 +567,16 @@ class FastText(TokenEmbedding): pretrained_file_name_sha1 = C.FAST_TEXT_FILE_SHA1 @classmethod - def _get_download_file_name(cls, pretrained_file_name): + def _get_download_file_name(cls, file_name): # Map a pre-trained embedding file to its archive to download. 
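        # For example, 'wiki.simple.vec' maps to the archive 'wiki.simple.zip'.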
- return '.'.join(pretrained_file_name.split('.')[:-1])+'.zip' + return '.'.join(file_name.split('.')[:-1]) + '.zip' - def __init__(self, pretrained_file_name='wiki.simple.vec', + def __init__(self, file_name='wiki.simple.vec', embedding_root=os.path.join('~', '.mxnet', 'embedding'), init_unknown_vec=nd.zeros, **kwargs): - FastText._check_pretrained_file_names(pretrained_file_name) + FastText._check_pretrained_file_names(file_name) super(FastText, self).__init__(**kwargs) - pretrained_file_path = FastText._get_pretrained_file(embedding_root, pretrained_file_name) + pretrained_file_path = FastText._get_pretrained_file(embedding_root, file_name) self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) diff --git a/python/mxnet/text/utils.py b/python/mxnet/text/utils.py index d167310bdf40..a812be8c2df2 100644 --- a/python/mxnet/text/utils.py +++ b/python/mxnet/text/utils.py @@ -17,7 +17,7 @@ # coding: utf-8 -"""Provide utilities for text data processing.""" +"""Utilities for text data processing.""" from __future__ import absolute_import from __future__ import print_function @@ -30,10 +30,10 @@ def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n', to_lower= """Counts tokens in the specified string. - For token_delim='' and seq_delim='', a specified string of two sequences of tokens may + For token_delim='(td)' and seq_delim='(sd)', a specified string of two sequences of tokens may look like:: - token1token2token3token4token5 + (td)token1(td)token2(td)token3(td)(sd)(td)token4(td)token5(td)(sd) Parameters diff --git a/python/mxnet/text/vocab.py b/python/mxnet/text/vocab.py index d31e2640abbc..8011521ff491 100644 --- a/python/mxnet/text/vocab.py +++ b/python/mxnet/text/vocab.py @@ -30,7 +30,7 @@ class Vocabulary(object): - """Vocabulary for indexing text tokens and access embedding. + """Indexing and embedding assignment for text tokens. Parameters @@ -50,7 +50,7 @@ class Vocabulary(object): argument has no effect. min_freq : int, default 1 The minimum frequency required for a token in the keys of `counter` to be indexed. - unknown_token : hashable object, default '<unk>' + unknown_token : hashable object, default '' The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. Keys of `counter`, `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. @@ -59,23 +59,28 @@ class Vocabulary(object): padding, beginning of sentence, and end of sentence. It cannot contain `unknown_token`, or duplicate reserved tokens. Keys of `counter`, `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. + embedding : instance or list of instances of `embedding.TokenEmbedding`, default None + The embedding to be assigned to the indexed tokens. If a list of multiple embeddings are + provided, their embedding vectors will be concatenated for the same token. Properties ---------- - token_to_idx : dict mapping str to int - A dict mapping each token to its index integer. + embedding : instance of `~mxnet.text.embedding.TokenEmbedding` + The embedding of the indexed tokens. idx_to_token : list of strs A list of indexed tokens where the list indices and the token indices are aligned. + reserved_tokens : list of strs or None + A list of reserved tokens that will always be indexed. + token_to_idx : dict mapping str to int + A dict mapping each token to its index integer. 
unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. - reserved_tokens : list of strs or None - A list of reserved tokens that will always be indexed. """ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None, embeddings=None): + reserved_tokens=None, embedding=None): # Sanity checks. assert min_freq > 0, '`min_freq` must be set to a positive value.' @@ -92,10 +97,10 @@ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token=' if counter is not None: self._index_counter_keys(counter, unknown_token, reserved_tokens, max_size, min_freq) - if embeddings is None: + if embedding is None: self._embedding = None else: - self.set_embedding(embeddings) + self.set_embedding(embedding) def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): """Indexes unknown and reserved tokens.""" @@ -114,6 +119,8 @@ def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): def _index_counter_keys(self, counter, unknown_token, reserved_tokens, max_size, min_freq): """Indexes keys of `counter`. + + Indexes keys of `counter` according to frequency thresholds such as `max_size` and `min_freq`. """ @@ -157,38 +164,46 @@ def token_to_idx(self): def unknown_token(self): return self._unknown_token - def __contains__(self, s): - """Check whether token exists in the vocabulary. + def __contains__(self, token): + """Checks whether a text token exists in the vocabulary. + + Parameters ---------- - s : str - A token. + token : str + A text token. + + Returns ------- - int or list of ints - A token index or a list of token indices according to the vocabulary. + bool + Whether the text token exists in the vocabulary (including `unknown_token`). """ - return s in self._token_to_idx + return token in self._token_to_idx + + def __getitem__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + - def __getitem__(self, s): - """Converts token/tokens to indices according to the vocabulary. Parameters ---------- - s : str or list of strs + tokens : str or list of strs A source token or tokens to be converted. + + Returns ------- int or list of ints A token index or a list of token indices according to the vocabulary. """ - if not isinstance(s, (list, tuple)): - return self._token_to_idx[s] if s in self._token_to_idx \ + if not isinstance(tokens, (list, tuple)): + return self._token_to_idx[tokens] if tokens in self._token_to_idx \ else C.UNKNOWN_IDX else: return [self._token_to_idx[token] if token in self._token_to_idx - else C.UNKNOWN_IDX for token in s] + else C.UNKNOWN_IDX for token in tokens] def __len__(self): return len(self._idx_to_token) @@ -200,13 +215,12 @@ def set_embedding(self, embeddings): for embedding in embeddings: assert isinstance(embedding, TokenEmbedding), \ - 'The argument `embedding` must be an instance or a list of instances ' \ - 'of `mxnet.contrib.text.embedding.TextEmbedding` whose embedding vectors will be' \ - 'loaded or concatenated-then-loaded to map to the indexed tokens.' + 'The argument `embeddings` must be an instance or a list of instances of ' \ + '`mxnet.text.embedding.TokenEmbedding`.' 
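+        # Build a new TokenEmbedding that shares this vocabulary's token index, then fill its
+        # vectors by concatenating the vectors of the given source embeddings.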
- new_embedding = TokenEmbedding(self._unknown_token, self._reserved_tokens) - new_embedding._token_to_idx = self._token_to_idx - new_embedding._idx_to_token = self._idx_to_token + new_embedding = TokenEmbedding(self.unknown_token) + new_embedding._token_to_idx = self.token_to_idx + new_embedding._idx_to_token = self.idx_to_token new_vec_len = sum(embedding.idx_to_vec.shape[1] for embedding in embeddings if embedding and embedding.idx_to_vec is not None) @@ -227,10 +241,14 @@ def set_embedding(self, embeddings): def to_tokens(self, indices): """Converts token indices to tokens according to the vocabulary. + + Parameters ---------- indices : int or list of ints A source token index or token indices to be converted. + + Returns ------- str or list of strs diff --git a/tests/python/unittest/test_text.py b/tests/python/unittest/test_text.py index f4adff75ee9b..00e5175488d5 100644 --- a/tests/python/unittest/test_text.py +++ b/tests/python/unittest/test_text.py @@ -39,29 +39,25 @@ def _get_test_str_of_tokens(token_delim, seq_delim): def _test_count_tokens_from_str_with_delims(token_delim, seq_delim): source_str = _get_test_str_of_tokens(token_delim, seq_delim) - cnt1 = text.utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=False) + cnt1 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=False) assert cnt1 == Counter( {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) - cnt2 = text.utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=True) + cnt2 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=True) assert cnt2 == Counter( {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) counter_to_update = Counter({'life': 2}) - cnt3 = text.utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=False, - counter_to_update=counter_to_update.copy()) + cnt3 = text.utils.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=False, + counter_to_update=counter_to_update.copy()) assert cnt3 == Counter( {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) - cnt4 = text.utils.count_tokens_from_str( - source_str, token_delim, seq_delim, to_lower=True, - counter_to_update=counter_to_update.copy()) + cnt4 = text.count_tokens_from_str(source_str, token_delim, seq_delim, to_lower=True, + counter_to_update=counter_to_update.copy()) assert cnt4 == Counter( {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) @@ -357,7 +353,6 @@ def test_token_embedding_from_file(): assert 'a' in my_embed assert my_embed.unknown_token == '' - assert my_embed.reserved_tokens is None assert my_embed.unknown_token in my_embed first_vec = my_embed.idx_to_vec[0] @@ -420,15 +415,15 @@ def test_token_embedding_from_file(): def test_embedding_get_and_pretrain_file_names(): - assert len(text.embedding.get_pretrained_file_names(embedding_name='fasttext')) == 327 - assert len(text.embedding.get_pretrained_file_names(embedding_name='glove')) == 10 + assert len(text.embedding.get_file_names(embedding_name='fasttext')) == 327 + assert len(text.embedding.get_file_names(embedding_name='glove')) == 10 - reg = text.embedding.get_pretrained_file_names(embedding_name=None) + reg = text.embedding.get_file_names(embedding_name=None) assert len(reg['glove']) == 10 assert len(reg['fasttext']) == 327 - assertRaises(KeyError, text.embedding.get_pretrained_file_names, 'unknown$$') + 
assertRaises(KeyError, text.embedding.get_file_names, 'unknown$$') def test_vocab_set_embedding_with_one_custom_embedding(): @@ -462,8 +457,6 @@ def test_vocab_set_embedding_with_one_custom_embedding(): [1, 1, 1, 1, 1]]) ) - assert v1.embedding.reserved_tokens == [''] - assert_almost_equal(v1.embedding['c'].asnumpy(), np.array([1, 1, 1, 1, 1]) ) @@ -539,7 +532,7 @@ def test_vocabulary_with_two_custom_embeddings(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None, embeddings=[my_embed1, my_embed2]) + reserved_tokens=None, embedding=[my_embed1, my_embed2]) assert v1.embedding is not None assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), @@ -551,7 +544,6 @@ def test_vocabulary_with_two_custom_embeddings(): [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]) ) - assert v1.embedding.reserved_tokens is None assert_almost_equal(v1.embedding['c'].asnumpy(), np.array([1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1]) ) @@ -663,14 +655,14 @@ class Test(text.TokenEmbedding): namespace = 'test' def __init__(self, embedding_root='embedding', init_unknown_vec=nd.zeros, **kwargs): - pretrained_file_name = 'embedding_test.vec' - Test._check_pretrained_file_names(pretrained_file_name) + file_name = 'embedding_test.vec' + Test._check_pretrained_file_names(file_name) super(Test, self).__init__(**kwargs) - pretrained_file_path = Test._get_pretrained_file(embedding_root, pretrained_file_name) + file_path = Test._get_pretrained_file(embedding_root, file_name) - self._load_embedding(pretrained_file_path, ' ', init_unknown_vec) + self._load_embedding(file_path, ' ', init_unknown_vec) test_embed = text.embedding.create('test') assert_almost_equal(test_embed['hello'].asnumpy(), (nd.arange(5) + 1).asnumpy()) From a2215ca30b8c220a0da4c011b7d1967032bcb7d4 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 09:22:00 -0700 Subject: [PATCH 08/20] license and news --- LICENSE | 312 ++++++-------------------------------------------------- NEWS.md | 41 -------- 2 files changed, 34 insertions(+), 319 deletions(-) diff --git a/LICENSE b/LICENSE index e7d50c377232..d3b3d6f9dd0f 100644 --- a/LICENSE +++ b/LICENSE @@ -201,145 +201,43 @@ See the License for the specific language governing permissions and limitations under the License. - ====================================================================================== + ======================================================================= Apache MXNET (incubating) Subcomponents: - The Apache MXNET (incubating) project contains subcomponents with separate copyright - notices and license terms. Your use of the source code for the these + The Apache MXNET (incubating) project contains subcomponents with separate + copyright notices and license terms. Your use of the source code for the these subcomponents is subject to the terms and conditions of the following - licenses. - - ======================================================================================= - Apache-2.0 licenses - ======================================================================================= - - The following components are provided under an Apache 2.0 license. - - 1. MXNet Cpp-package - For details, /cpp-package/LICENSE - 2. MXNet rcnn - For details, see, example/rcnn/LICENSE - 3. scala-package - For details, see, scala-package/LICENSE - 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE - 5. dlpack - For details, see, dlpack/LICENSE - 6. 
dmlc-core - For details, see, dmlc-core/LICENSE - 7. mshadow - For details, see, mshadow/LICENSE - 8. nnvm/dmlc-core - For details, see, nnvm/dmlc-core/LICENSE - 9. nnvm - For details, see, nnvm/LICENSE - 10. nnvm-fusion - For details, see, nnvm/plugin/nnvm-fusion/LICENSE - 11. ps-lite - For details, see, ps-lite/LICENSE - 12. nnvm/tvm - For details, see, nnvm/tvm/LICENSE - 13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE - - - ======================================================================================= - MIT licenses - ======================================================================================= - - 1. Fast R-CNN - For details, see example/rcnn/LICENSE - 2. Faster R-CNN - For details, see example/rcnn/LICENSE - 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE - 4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt - 5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE - - - ======================================================================================= - NVIDIA Licenses - ======================================================================================= - - 1. Moderngpu - For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE - - /****************************************************************************** - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - - 2. CUB Library - For details, see, 3rdparty/cub/LICENSE.TXT - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the NVIDIA CORPORATION nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - - ======================================================================================= - Other Licenses - ======================================================================================= - - 1. Caffe - For details, see, example/rcnn/LICENSE - - LICENSE - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - CONTRIBUTION AGREEMENT - - By contributing to the BVLC/caffe repository through pull-request, comment, - or otherwise, the contributor releases their content to the - license and copyright terms herein. - - ======================================================================================= - - 2. MS COCO API - For details, see, example/rcnn/LICENSE - + licenses - + + ======================================================================== + 1. Apache-2.0 license as above, wherever applicable + ======================================================================== + + ======================================================================== + 2. 
MIT license wherever applicable + ======================================================================== + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + + + ======================================================================== + 3. BSD License wherever applicable + ======================================================================== Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -364,148 +262,6 @@ of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. - ======================================================================================= - - 3. Sphinx JavaScript utilties for the full-text search - For details, see, docs/_static/searchtools_custom.js - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ======================================================================================= - - 4. FindCrypto.cmake - For details, see, dmlc-core/cmake/Modules/FindCrypto.cmake, - Redistribution and use is allowed according to the terms of the BSD license. - - ======================================================================================= - - 5. 
Googlemock - For details, see, 3rdparty/googletest/googlemock/LICENSE - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - * Neither the name of Google Inc. nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ======================================================================================= - - 6. Googletest - For details, see, 3rdparty/googletest/googletest/LICENSE - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - * Neither the name of Google Inc. nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ======================================================================================= - - 7. 
OpenMP Testsuite - For details, see, 3rdparty/openmp/testsuite/LICENSE - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - o Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - o Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - o Neither the name of the University of Houston System nor the names of its - contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ======================================================================================= - - 8. Semaphore implementation in blockingconcurrentqueue.h - This file uses a semaphore implementation under the terms of its separate zlib license. - For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h - - ======================================================================================= - - 9. blockingconcurrentqueue.h - This file is Distributed under the terms of the simplified BSD license. - For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h - - ======================================================================================= diff --git a/NEWS.md b/NEWS.md index a51b514c1a51..fc6b10188fc7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,46 +1,5 @@ MXNet Change Log ================ -## 1.1.0 -### Usability Improvements -- Improved the usability of examples and tutorials -### Bug-fixes -- Fixed I/O multiprocessing for too many open file handles (#8904), race condition (#8995), deadlock (#9126). -- Fixed image IO integration with OpenCV 3.3 (#8757). -- Fixed Gluon block printing (#8956). -- Fixed float16 argmax when there is negative input. (#9149) -- Fixed random number generator to ensure sufficient randomness. (#9119, #9256, #9300) -- Fixed custom op multi-GPU scaling (#9283) -- Fixed gradient of gather_nd when duplicate entries exist in index. (#9200) -- Fixed overriden contexts in Module `group2ctx` option when using multiple contexts (#8867) -- Fixed `swap_axes` operator with "add_to" gradient req (#9541) -### New Features -- Added experimental API in `contrib.text` for building vocabulary, and loading pre-trained word embeddings, with built-in support for 307 GloVe and FastText pre-trained embeddings. (#8763) -- Added experimental structural blocks in `gluon.contrib`: `Concurrent`, `HybridConcurrent`, `Identity`. 
(#9427) -- Added `sparse.dot(dense, csr)` operator (#8938) -- Added `Khatri-Rao` operator (#7781) -- Added `FTML` and `Signum` optimizer (#9220, #9262) -- Added `ENABLE_CUDA_RTC` build option (#9428) -### API Changes -- Added zero gradients to rounding operators including `rint`, `ceil`, `floor`, `trunc`, and `fix` (#9040) -- Added `use_global_stats` in `nn.BatchNorm` (#9420) -- Added `axis` argument to `SequenceLast`, `SequenceMask` and `SequenceReverse` operators (#9306) -- Added `lazy_update` option for standard `SGD` & `Adam` optimizer with `row_sparse` gradients (#9468, #9189) -- Added `select` option in `Block.collect_params` to support regex (#9348) -- Added support for (one-to-one and sequence-to-one) inference on explicit unrolled RNN models in R (#9022) -### Deprecations -- The Scala API name space is still called `ml.dmlc`. The name space is likely be changed in a future release to `org.apache` and might brake existing applications and scripts (#9579, #9324) -### Performance Improvements -- Improved GPU inference speed by 20% when batch size is 1 (#9055) -- Improved `SequenceLast` operator speed (#9306) -- Added multithreading for the class of broadcast_reduce operators on CPU (#9444) -- Improved batching for GEMM/TRSM operators with large matrices on GPU (#8846) -### Known Issues -- "Predict with pre-trained models" tutorial is broken -- "example/numpy-ops/ndarray_softmax.py" is broken - -For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.1.0+Release+Notes) - - ## 1.0.0 ### Performance - Enhanced the performance of `sparse.dot` operator. From c69cb07a1b0f5e9b5e7a499cb8dcb33498b57ce8 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 09:25:55 -0700 Subject: [PATCH 09/20] readme and cpp --- README.md | 1 - cpp-package/include/mxnet-cpp/ndarray.h | 12 +-- example/numpy-ops/custom_softmax_rtc.py | 131 +++++++++++------------- src/operator/roi_pooling.cc | 48 ++++----- 4 files changed, 85 insertions(+), 107 deletions(-) diff --git a/README.md b/README.md index dbec65b0f365..57f042d09841 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,6 @@ deep learning systems, and interesting insights of DL systems for hackers. What's New ---------- -* [Version 1.1.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.1.0) - MXNet 1.1.0 Release. * [Version 1.0.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.0.0) - MXNet 1.0.0 Release. * [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release. * [Version 0.12.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.0) - MXNet 0.12.0 Release. diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h index 1166643e4e8a..082c06981cf9 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.h +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -291,15 +291,15 @@ class NDArray { */ void SyncCopyToCPU(std::vector *data, size_t size = 0); /*! - * \brief copy the content of current array to a target array. - * \param other the target NDArray - * \return the target NDarray + * \brief Copy the content of current array to other. + * \param other the new context of this NDArray + * \return the new copy */ NDArray CopyTo(NDArray * other) const; /*! 
- * \brief return a new copy to this NDArray - * \param Context the new context of this NDArray - * \return the new copy + * \brief return a new copy this NDArray + * \param other the target NDArray + * \return the copy target NDarray */ NDArray Copy(const Context &) const; /*! diff --git a/example/numpy-ops/custom_softmax_rtc.py b/example/numpy-ops/custom_softmax_rtc.py index d07041b002d3..906cbbeac04c 100644 --- a/example/numpy-ops/custom_softmax_rtc.py +++ b/example/numpy-ops/custom_softmax_rtc.py @@ -23,77 +23,51 @@ class Softmax(mx.operator.CustomOp): def __init__(self): - super(Softmax,self).__init__() - # Each thread processes a row (a sample in the batch). - fwd_src = r""" - template - __global__ void fwd(const DType* x, DType* y, const int row_size, const int req) { - const int offset = row_size * threadIdx.x; - DType max = x[offset]; - for(int i = 1; i < row_size; ++i) { - if(max < x[offset + i]) { - max = x[offset + i]; - } - } - DType sum = 0; - for(int i = 0; i < row_size; ++i) { - sum += exp(x[offset + i] - max); - } - switch(req) { - case 1: - for(int i = 0; i < row_size; ++i) { - y[offset + i] = exp(x[offset + i] - max) / sum; - } - break; - case 2: - for(int i = 0; i < row_size; ++i) { - y[offset + i] += exp(x[offset + i] - max) / sum; - } - break; - } - } - """ - - # Each block processes a row and each thread in a block calculate an element of `dx`. - bwd_src = r""" - template - __global__ void bwd(const DType* l, const DType* y, DType* dx, const int req) { - const int z = static_cast(l[blockIdx.x]); - const int i = threadIdx.x + blockDim.x * blockIdx.x; - if(req == 1) { - dx[i] = threadIdx.x == z ? y[i] - 1 : y[i]; - } else { - dx[i] += threadIdx.x == z ? y[i] - 1 : y[i]; - } - } - """ - fwd_kernel_mod = mx.rtc.CudaModule(fwd_src, exports=["fwd", "fwd"]) - bwd_kernel_mod = mx.rtc.CudaModule(bwd_src, exports=["bwd", "bwd"]) - - fwd_kernel_float_signature = "const float*, const float*, const int, const int" - self.fwd_float_kernel = fwd_kernel_mod.get_kernel("fwd", fwd_kernel_float_signature) - - bwd_kernel_float_signature = "const float*, const float*, float*, const int" - self.bwd_float_kernel = bwd_kernel_mod.get_kernel("bwd", bwd_kernel_float_signature) - - fwd_kernel_double_signature = "const double*, const double*, const int, const int" - self.fwd_double_kernel = fwd_kernel_mod.get_kernel("fwd", fwd_kernel_double_signature) - - bwd_kernel_double_signature = "const double*, const double*, double*, const int" - self.bwd_double_kernel = bwd_kernel_mod.get_kernel("bwd", bwd_kernel_double_signature) + self.fwd_kernel_mod = None + self.bwd_kernel_mod = None + super().__init__() def forward(self, is_train, req, in_data, out_data, aux): if req[0] == "null": return x = in_data[0] # input y = out_data[0] # output - - if y.dtype == np.float64: - # args, ctx, grid_shape, block_shape, shared_mem = 0 - self.fwd_double_kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) - else: - # args, ctx, grid_shape, block_shape, shared_mem = 0 - self.fwd_float_kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) + if self.fwd_kernel_mod is None: + # Each thread processes a row (a sample in the batch). 
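+            # The CUDA source below is compiled only once, on the first forward call, and the
+            # resulting module is cached on the instance for later calls.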
+ src = r""" + template + __global__ void fwd(const DType* x, DType* y, const int row_size, const int req) { + const int offset = row_size * threadIdx.x; + DType max = x[offset]; + for(int i = 1; i < row_size; ++i) { + if(max < x[offset + i]) { + max = x[offset + i]; + } + } + DType sum = 0; + for(int i = 0; i < row_size; ++i) { + sum += exp(x[offset + i] - max); + } + switch(req) { + case 1: + for(int i = 0; i < row_size; ++i) { + y[offset + i] = exp(x[offset + i] - max) / sum; + } + break; + case 2: + for(int i = 0; i < row_size; ++i) { + y[offset + i] += exp(x[offset + i] - max) / sum; + } + break; + } + } + """ + self.fwd_kernel_mod = mx.rtc.CudaModule(src, exports=["fwd", "fwd"]) + dtype = "double" if y.dtype == np.float64 else "float" + kernel_signature = "const {0}*, const {0}*, const int, const int".format(dtype) + kernel = self.fwd_kernel_mod.get_kernel("fwd<{}>".format(dtype), kernel_signature) + # args, ctx, grid_shape, block_shape, shared_mem = 0 + kernel.launch((x, y, x.shape[1], self._reqCode(req[0])), mx.gpu(0), (1, 1, 1), (x.shape[0], 1, 1)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): if req[0] == "null": @@ -101,13 +75,26 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): l = in_data[1] # label y = out_data[0] # output from the forward pass dx = in_grad[0] # the storage for the gradient - - if dx.dtype == np.float64: - # args, ctx, grid_shape, block_shape, shared_mem = 0 - self.bwd_double_kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) - else: - # args, ctx, grid_shape, block_shape, shared_mem = 0 - self.bwd_float_kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) + if self.bwd_kernel_mod is None: + # Each block processes a row and each thread in a block calculate an element of `dx`. + src = r""" + template + __global__ void bwd(const DType* l, const DType* y, DType* dx, const int req) { + const int z = static_cast(l[blockIdx.x]); + const int i = threadIdx.x + blockDim.x * blockIdx.x; + if(req == 1) { + dx[i] = threadIdx.x == z ? y[i] - 1 : y[i]; + } else { + dx[i] += threadIdx.x == z ? 
y[i] - 1 : y[i]; + } + } + """ + self.bwd_kernel_mod = mx.rtc.CudaModule(src, exports=["bwd", "bwd"]) + dtype = "double" if dx.dtype == np.float64 else "float" + kernel_signature = "const {0}*, const {0}*, {0}*, const int".format(dtype) + kernel = self.bwd_kernel_mod.get_kernel("bwd<{}>".format(dtype), kernel_signature) + # args, ctx, grid_shape, block_shape, shared_mem = 0 + kernel.launch((l, y, dx, self._reqCode(req[0])), mx.gpu(0), (y.shape[0], 1, 1), (y.shape[1], 1, 1)) def _reqCode(self, req): if(req == "write"): diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index acff1f97dcce..10d1420950cc 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file roi_pooling.cc * \brief roi pooling operator - * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo, Xinyu Chen + * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo */ #include "./roi_pooling-inl.h" #include @@ -54,22 +54,13 @@ inline void ROIPoolForward(const Tensor &out, const int num_rois = bbox.size(0); const int data_size = data.size(1) * data.size(2) * data.size(3); - const int data_size_c = data.size(2) * data.size(3); - const int out_size_c = out.size(2) * out.size(3); - const int out_size = channels_ * out_size_c; - const int max_idx_size_c = max_idx.size(2) * max_idx.size(3); - const int max_idx_size = channels_ * max_idx_size_c; // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R for (int n = 0; n < num_rois; ++n) { - // Increment ROI data pointer - const Dtype *bottom_rois_n = bottom_rois + n * bbox.size(1); - Dtype *top_data_n = top_data + n * out_size; - Dtype *argmax_data_n = argmax_data + n * max_idx_size; - int roi_batch_ind = bottom_rois_n[0]; - int roi_start_w = round(bottom_rois_n[1] * spatial_scale_); - int roi_start_h = round(bottom_rois_n[2] * spatial_scale_); - int roi_end_w = round(bottom_rois_n[3] * spatial_scale_); - int roi_end_h = round(bottom_rois_n[4] * spatial_scale_); + int roi_batch_ind = bottom_rois[0]; + int roi_start_w = round(bottom_rois[1] * spatial_scale_); + int roi_start_h = round(bottom_rois[2] * spatial_scale_); + int roi_end_w = round(bottom_rois[3] * spatial_scale_); + int roi_end_h = round(bottom_rois[4] * spatial_scale_); assert(roi_batch_ind >= 0); assert(static_cast(roi_batch_ind) < data.size(0) /* batch size */); @@ -83,18 +74,12 @@ inline void ROIPoolForward(const Tensor &out, const Dtype* batch_data = bottom_data + data_size * roi_batch_ind; - #pragma omp parallel for for (int c = 0; c < channels_; ++c) { - // Increment all data pointers - const Dtype* batch_data_c = batch_data + c * data_size_c; - Dtype* top_data_c = top_data_n + c * out_size_c; - Dtype* argmax_data_c = argmax_data_n + c * max_idx_size_c; - for (int ph = 0; ph < pooled_height_; ++ph) { for (int pw = 0; pw < pooled_width_; ++pw) { // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) @@ -113,23 +98,30 @@ inline void ROIPoolForward(const Tensor &out, const int pool_index = ph * pooled_width_ + pw; if (is_empty) { - top_data_c[pool_index] = 0; - argmax_data_c[pool_index] = -1; + top_data[pool_index] = 0; + argmax_data[pool_index] = -1; } for (int h = hstart; 
h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width_ + w; - if (batch_data_c[index] > top_data_c[pool_index]) { - top_data_c[pool_index] = batch_data_c[index]; - argmax_data_c[pool_index] = index; + if (batch_data[index] > top_data[pool_index]) { + top_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; } } } } } + // Increment all data pointers by one channel + batch_data += data.size(2) * data.size(3); + top_data += out.size(2) * out.size(3); + argmax_data += max_idx.size(2) * max_idx.size(3); } + // Increment ROI data pointer + bottom_rois += bbox.size(1); } + return; } From 1863d91861fa435cc17d018adafcd14051ea78b8 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 09:46:31 -0700 Subject: [PATCH 10/20] pylint disable --- python/mxnet/text/__init__.py | 1 + python/mxnet/text/embedding.py | 1 + python/mxnet/text/vocab.py | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/mxnet/text/__init__.py b/python/mxnet/text/__init__.py index 44e1ad980bc5..3c33272af404 100644 --- a/python/mxnet/text/__init__.py +++ b/python/mxnet/text/__init__.py @@ -16,6 +16,7 @@ # under the License. # coding: utf-8 +# pylint: disable=wildcard-import """This module includes utilities for indexing and embedding text.""" from .vocab import * diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py index 482dbe3c2c5b..2c94712bbf11 100644 --- a/python/mxnet/text/embedding.py +++ b/python/mxnet/text/embedding.py @@ -18,6 +18,7 @@ # coding: utf-8 # pylint: disable=consider-iterating-dictionary # pylint: disable=super-init-not-called +# pylint: disable=arguments-differ """Text token embedding.""" from __future__ import absolute_import diff --git a/python/mxnet/text/vocab.py b/python/mxnet/text/vocab.py index 8011521ff491..04ae6a787666 100644 --- a/python/mxnet/text/vocab.py +++ b/python/mxnet/text/vocab.py @@ -66,7 +66,7 @@ class Vocabulary(object): Properties ---------- - embedding : instance of `~mxnet.text.embedding.TokenEmbedding` + embedding : instance of :class:`~mxnet.text.embedding.TokenEmbedding` The embedding of the indexed tokens. idx_to_token : list of strs A list of indexed tokens where the list indices and the token indices are aligned. @@ -209,6 +209,15 @@ def __len__(self): return len(self._idx_to_token) def set_embedding(self, embeddings): + """Assigns embeddings to the indexed text tokens. + + + Parameters + ---------- + embeddings : instance or list of instances of :class:`~mxnet.text.embedding.TokenEmbedding` + The embedding to be assigned to the indexed tokens. If a list of multiple embeddings are + provided, their embedding vectors will be concatenated for the same token. 
+ """ if not isinstance(embeddings, (list, tuple)): embeddings = [embeddings] From c378669084381e29bececbdac895df9a782f56a4 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 10:30:52 -0700 Subject: [PATCH 11/20] Add API doc --- docs/api/python/text/text.md | 397 +++++++++++++++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 docs/api/python/text/text.md diff --git a/docs/api/python/text/text.md b/docs/api/python/text/text.md new file mode 100644 index 000000000000..26a460260c09 --- /dev/null +++ b/docs/api/python/text/text.md @@ -0,0 +1,397 @@ +# Text API + +## Overview + +The `mxnet.text` APIs refer to classes and functions related to text data processing, such +as bulding indices and loading pre-trained embedding vectors for text tokens and storing them in the +`mxnet.ndarray.NDArray` format. + +This document lists the text APIs in mxnet: + +```eval_rst +.. autosummary:: + :nosignatures: + + mxnet.text.embedding + mxnet.text.vocab + mxnet.text.utils +``` + +All the code demonstrated in this document assumes that the following modules or packages are +imported. + +```python +>>> from mxnet import gluon +>>> from mxnet import nd +>>> from mxnet import text +>>> import collections + +``` + +### Access pre-trained word embeddings for indexed words + +As a common use case, let us access pre-trained word embedding vectors for indexed words in just a +few lines of code. + +To begin with, let us create a fastText word embedding instance by specifying the embedding name +`fasttext` and the pre-trained file `wiki.simple.vec`. + +```python +>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + +``` + +Now, suppose that we have a simple text data set in the string format. We can count +word frequency in the data set. + +```python +>>> text_data = " hello world \n hello nice world \n hi world \n" +>>> counter = text.utils.count_tokens_from_str(text_data) + +``` + +The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +Suppose that we want to build indices for all the keys in `counter` and load the defined fastText +word embedding for all such indexed words. We need a Vocabulary object with `counter` and +`embedding` as its arguments + +```python +>>> my_vocab = text.vocab.Vocabulary(counter, embedding=fasttext) + +``` + +Now we are ready to access the fastText word embedding vectors for indexed words, such as 'hello' +and 'world'. + +```python +>>> my_vocab.embedding[['hello', 'world']] + +[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... + -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + +``` + +### Using pre-trained word embeddings in `gluon` + +To demonstrate how to use pre-trained word embeddings in the `gluon` package, let us first obtain +indices of the words 'hello' and 'world'. + +```python +>>> my_vocab[['hello', 'world']] +[2, 1] + +``` + +We can obtain the vector representation for the words 'hello' and 'world' by specifying their +indices (2 and 1) and the `my_embedding.idx_to_vec` in `mxnet.gluon.nn.Embedding`. 
+ +```python +>>> layer = gluon.nn.Embedding(my_vocab.embedding.idx_to_vec.shape[0], my_vocab.embedding.idx_to_vec.shape[1]) +>>> layer.initialize() +>>> layer.weight.set_data(my_vocab.embedding.idx_to_vec) +>>> layer(nd.array([2, 1])) + +[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... + -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + +``` + +## Vocabulary + +The vocabulary builds indices for text tokens and can be assigned with token embeddings. The input +counter whose keys are candidate indices may be obtained via +[`count_tokens_from_str`](#mxnet.text.utils.count_tokens_from_str). + + +```eval_rst +.. currentmodule:: mxnet.text.vocab +.. autosummary:: + :nosignatures: + + Vocabulary +``` + +Suppose that we have a simple text data set in the string format. We can count word frequency in the +data set. + +```python +>>> text_data = " hello world \n hello nice world \n hi world \n" +>>> counter = text.utils.count_tokens_from_str(text_data) + +``` + +The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +Suppose that we want to build indices for the 2 most frequent keys in `counter` with the unknown +token representation '<unk>' and a reserved token '<pad>'. + +```python +>>> my_vocab = text.vocab.Vocabulary(counter, max_size=2, unknown_token='(unk)', +... reserved_tokens=['(pad)']) + +``` + +We can access properties such as `token_to_idx` (mapping tokens to indices), `idx_to_token` (mapping +indices to tokens), `unknown_token` (representation of any unknown token) and `reserved_tokens`. + + +```python +>>> my_vocab.token_to_idx +{'(unk)': 0, '(pad)': 1, 'world': 2, 'hello': 3} +>>> my_vocab.idx_to_token +['(unk)', '(pad)', 'world', 'hello'] +>>> my_vocab.unknown_token +'(unk)' +>>> my_vocab.reserved_tokens +['(pad)'] +>>> len(my_vocab) +4 +>>> my_vocab[['hello', 'world']] +[3, 2] +``` + +Besides the specified unknown token '(unk)' and reserved_token '(pad)' are indexed, the 2 most +frequent words 'world' and 'hello' are also indexed. + + +### Assign token embedding to vocabulary + +A vocabulary instance can be assigned with token embedding. + +To begin with, suppose that we have a simple text data set in the string format. We can count word +frequency in the data set. + +```python +>>> text_data = " hello world \n hello nice world \n hi world \n" +>>> counter = text.utils.count_tokens_from_str(text_data) + +``` + +Let us define the fastText word embedding with pre-trained file `wiki.simple.vec`. + +```python +>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + +``` + +The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +Suppose that we want to build indices for the most frequent 2 keys in `counter` and load the defined +fastText word embedding for all these 2 words. + +```python +>>> my_vocab = text.vocab.Vocabulary(counter, max_size=2, embedding=fasttext) + +``` + +Now we are ready to access the fastText word embedding vectors for indexed words. + +```python +>>> my_vocab.embedding[['hello', 'world']] + +[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... 
+ -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + +``` + +Let us define the GloVe word embedding with pre-trained file `glove.6B.50d.txt`. Then, +we can re-assign a GloVe text embedding to the vocabulary. + +```python +>>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt') +>>> my_vocab.set_embedding(glove) + +``` + +Now we are ready to access the GloVe word embedding vectors for indexed words. + +```python +>>> my_vocab.embedding[['hello', 'world']] + +[[ -0.38497001 0.80092001 + ... + 0.048833 0.67203999] + [ -0.41486001 0.71847999 + ... + -0.37639001 -0.67541999]] + + +``` + +If a token is unknown to `glossary`, its embedding vector is initialized according to the default +specification in `fasttext_simple` (all elements are 0). + +```python + +>>> my_embedding.get_vecs_by_tokens('nice') + +[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + ... + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] + + +``` + + + +## Text token embedding + +To load token embeddings from an externally hosted pre-trained token embedding file, such as those +of GloVe and FastText, use +[`embedding.create(embedding_name, file_name)`](#mxnet.text.embedding.create). + +To get all the available `embedding_name` and `pretrained_file_name`, use +[`embedding.get_file_names()`](#mxnet.text.embedding.get_file_names). + +```python +>>> text.embedding.get_file_names() +{'glove': ['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', ...], +'fasttext': ['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec', ...]} + +``` + +Alternatively, to load embedding vectors from a custom pre-trained text token +embedding file, use [`TokenEmbedding.from_file`](#mxnet.text.embedding.TokenEmbedding.from_file). + + +```eval_rst +.. currentmodule:: mxnet.text.embedding +.. autosummary:: + :nosignatures: + + register + create + get_file_names + TokenEmbedding + GloVe + FastText +``` + + + + + +### Indexed tokens are from the loaded embedding vectors + +One can also use all the tokens from the loaded embedding vectors, such as loaded from a pre-trained +token embedding file, as the indexed tokens of the embedding. + +To begin with, we can create a fastText word embedding object by specifying the embedding name +'fasttext' and the pre-trained file 'wiki.simple.vec'. The argument `init_unknown_vec` specifies +default vector representation for any unknown token. To index all the tokens from this pre-trained +word embedding file, we do not need to specify any vocabulary. + +```python +>>> my_embedding = text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec', +... init_unknown_vec=nd.zeros) + +``` + +We can access properties such as `token_to_idx` (mapping tokens to indices), `idx_to_token` (mapping +indices to tokens), `vec_len` (length of each embedding vector), and `unknown_token` (representation +of any unknown token, default value is '<unk>'). + +```python +>>> my_embedding.token_to_idx['nice'] +2586 +>>> my_embedding.idx_to_token[2586] +'nice' +>>> my_embedding.vec_len +300 +>>> my_embedding.unknown_token +'<unk>' + +``` + +For every unknown token, if its representation '<unk>' is encountered in the pre-trained token +embedding file, index 0 of property `idx_to_vec` maps to the pre-trained token embedding vector +loaded from the file; otherwise, index 0 of property `idx_to_vec` maps to the default token +embedding vector specified via `init_unknown_vec` (set to nd.zeros here). 
Since the pre-trained file +does not have a vector for the token '<unk>', index 0 has to map to an additional token '<unk>' and +the number of tokens in the embedding is 111,052. + + +```python +>>> len(my_embedding) +111052 +>>> my_embedding.idx_to_vec[0] + +[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + ... + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] + +>>> my_embedding.get_vecs_by_tokens('nice') + +[ 0.49397001 0.39996001 0.24000999 -0.15121 -0.087512 0.37114 + ... + 0.089521 0.29175001 -0.40917999 -0.089206 -0.1816 -0.36616999] + +>>> my_embedding.get_vecs_by_tokens(['unknownT0kEN', 'unknownT0kEN']) + +[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + ... + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] + [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. + ... + 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] + + +``` + + +### Implement a new text token embedding + +For ``optimizer``, create a subclass of `mxnet.text.embedding.TokenEmbedding`. +Also add ``@mxnet.text.embedding.TokenEmbedding.register`` before this class. See +[`embedding.py`](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/text/embedding.py) +for examples. + + +## Text utilities + +The following functions provide utilities for text data processing. + +```eval_rst +.. currentmodule:: mxnet.text.utils +.. autosummary:: + :nosignatures: + + count_tokens_from_str +``` + + +## API Reference + + + +```eval_rst + +.. automodule:: mxnet.text.embedding + :members: register, create, get_file_names +.. autoclass:: mxnet.text.embedding.TokenEmbedding + :members: from_file +.. autoclass:: mxnet.text.embedding.GloVe +.. autoclass:: mxnet.text.embedding.FastText + +.. automodule:: mxnet.text.vocab +.. autoclass:: mxnet.text.vocab.Vocabulary + :members: set_embedding, to_tokens + +.. automodule:: mxnet.text.utils + :members: count_tokens_from_str + +``` + \ No newline at end of file From 5edca9d2a8ea112cb600c8c4a28f751f9e38e420 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Mon, 12 Mar 2018 17:27:45 -0700 Subject: [PATCH 12/20] less pylint disable --- python/mxnet/text/embedding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py index 2c94712bbf11..244a82060909 100644 --- a/python/mxnet/text/embedding.py +++ b/python/mxnet/text/embedding.py @@ -17,8 +17,6 @@ # coding: utf-8 # pylint: disable=consider-iterating-dictionary -# pylint: disable=super-init-not-called -# pylint: disable=arguments-differ """Text token embedding.""" from __future__ import absolute_import @@ -171,8 +169,8 @@ def __init__(self, unknown_token=''): self._idx_to_vec = None @classmethod - def _get_download_file_name(cls, pretrained_file_name): - return pretrained_file_name + def _get_download_file_name(cls, file_name): + return file_name @classmethod def _get_pretrained_file_url(cls, pretrained_file_name): From c208477450e213b8921147d701aeca23f3b0ab21 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Tue, 13 Mar 2018 11:49:50 -0700 Subject: [PATCH 13/20] remove contrib --- python/mxnet/text/embedding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/text/embedding.py b/python/mxnet/text/embedding.py index 244a82060909..3ac038a663aa 100644 --- a/python/mxnet/text/embedding.py +++ b/python/mxnet/text/embedding.py @@ -45,11 +45,11 @@ def register(embedding_cls): Examples -------- - >>> @mxnet.contrib.text.embedding.register - ... 
class MyTextEmbed(mxnet.contrib.text.embedding.TokenEmbedding): + >>> @mxnet.text.embedding.register + ... class MyTextEmbed(mxnet.text.embedding.TokenEmbedding): ... def __init__(self, pretrained_file_name='my_pretrain_file'): ... pass - >>> embed = mxnet.contrib.text.embedding.create('MyTokenEmbed') + >>> embed = mxnet.text.embedding.create('MyTokenEmbed') >>> print(type(embed)) """ From 56d5307bb0e2bb486c3e96738b1991b38780dd15 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Tue, 13 Mar 2018 16:58:41 -0700 Subject: [PATCH 14/20] move to gluon, revise api doc --- docs/api/python/{text => gluon}/text.md | 169 ++++++------------ python/mxnet/{ => gluon}/text/__init__.py | 0 python/mxnet/{ => gluon}/text/_constants.py | 0 python/mxnet/{ => gluon}/text/embedding.py | 39 ++-- python/mxnet/{ => gluon}/text/utils.py | 0 python/mxnet/{ => gluon}/text/vocab.py | 52 +++++- .../{test_text.py => test_gluon_text.py} | 6 +- 7 files changed, 123 insertions(+), 143 deletions(-) rename docs/api/python/{text => gluon}/text.md (58%) rename python/mxnet/{ => gluon}/text/__init__.py (100%) rename python/mxnet/{ => gluon}/text/_constants.py (100%) rename python/mxnet/{ => gluon}/text/embedding.py (94%) rename python/mxnet/{ => gluon}/text/utils.py (100%) rename python/mxnet/{ => gluon}/text/vocab.py (84%) rename tests/python/unittest/{test_text.py => test_gluon_text.py} (99%) diff --git a/docs/api/python/text/text.md b/docs/api/python/gluon/text.md similarity index 58% rename from docs/api/python/text/text.md rename to docs/api/python/gluon/text.md index 26a460260c09..6268c049d141 100644 --- a/docs/api/python/text/text.md +++ b/docs/api/python/gluon/text.md @@ -1,20 +1,20 @@ -# Text API +# Gluon Text API ## Overview -The `mxnet.text` APIs refer to classes and functions related to text data processing, such +The `mxnet.gluon.text` APIs refer to classes and functions related to text data processing, such as bulding indices and loading pre-trained embedding vectors for text tokens and storing them in the `mxnet.ndarray.NDArray` format. -This document lists the text APIs in mxnet: +This document lists the text APIs in mxnet.gluon: ```eval_rst .. autosummary:: :nosignatures: - mxnet.text.embedding - mxnet.text.vocab - mxnet.text.utils + mxnet.gluon.text.embedding + mxnet.gluon.text.vocab + mxnet.gluon.text.utils ``` All the code demonstrated in this document assumes that the following modules or packages are @@ -23,7 +23,7 @@ imported. ```python >>> from mxnet import gluon >>> from mxnet import nd ->>> from mxnet import text +>>> from mxnet.gluon import text >>> import collections ``` @@ -34,29 +34,29 @@ As a common use case, let us access pre-trained word embedding vectors for index few lines of code. To begin with, let us create a fastText word embedding instance by specifying the embedding name -`fasttext` and the pre-trained file `wiki.simple.vec`. +`fasttext` and the pre-trained file name `wiki.simple.vec`. ```python >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') ``` -Now, suppose that we have a simple text data set in the string format. We can count -word frequency in the data set. +Now, suppose that we have a simple text data set in the string format. We can count word frequency +in the data set. 
```python >>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.utils.count_tokens_from_str(text_data) +>>> counter = text.count_tokens_from_str(text_data) ``` The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. Suppose that we want to build indices for all the keys in `counter` and load the defined fastText -word embedding for all such indexed words. We need a Vocabulary object with `counter` and -`embedding` as its arguments +word embedding for all such indexed words. We need a Vocabulary instance with `counter` and +`fasttext` as its arguments. ```python ->>> my_vocab = text.vocab.Vocabulary(counter, embedding=fasttext) +>>> my_vocab = text.Vocabulary(counter, embedding=fasttext) ``` @@ -88,10 +88,12 @@ indices of the words 'hello' and 'world'. ``` We can obtain the vector representation for the words 'hello' and 'world' by specifying their -indices (2 and 1) and the `my_embedding.idx_to_vec` in `mxnet.gluon.nn.Embedding`. +indices (2 and 1) and the weight matrix `my_vocab.embedding.idx_to_vec` in +`mxnet.gluon.nn.Embedding`. ```python ->>> layer = gluon.nn.Embedding(my_vocab.embedding.idx_to_vec.shape[0], my_vocab.embedding.idx_to_vec.shape[1]) +>>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape +>>> layer = gluon.nn.Embedding(input_dim, output_dim) >>> layer.initialize() >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec) >>> layer(nd.array([2, 1])) @@ -110,11 +112,11 @@ indices (2 and 1) and the `my_embedding.idx_to_vec` in `mxnet.gluon.nn.Embedding The vocabulary builds indices for text tokens and can be assigned with token embeddings. The input counter whose keys are candidate indices may be obtained via -[`count_tokens_from_str`](#mxnet.text.utils.count_tokens_from_str). +[`count_tokens_from_str`](#mxnet.gluon.text.utils.count_tokens_from_str). ```eval_rst -.. currentmodule:: mxnet.text.vocab +.. currentmodule:: mxnet.gluon.text.vocab .. autosummary:: :nosignatures: @@ -132,16 +134,17 @@ data set. The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. Suppose that we want to build indices for the 2 most frequent keys in `counter` with the unknown -token representation '<unk>' and a reserved token '<pad>'. +token representation '(unk)' and a reserved token '(pad)'. ```python ->>> my_vocab = text.vocab.Vocabulary(counter, max_size=2, unknown_token='(unk)', +>>> my_vocab = text.Vocabulary(counter, max_size=2, unknown_token='(unk)', ... reserved_tokens=['(pad)']) ``` We can access properties such as `token_to_idx` (mapping tokens to indices), `idx_to_token` (mapping -indices to tokens), `unknown_token` (representation of any unknown token) and `reserved_tokens`. +indices to tokens), `unknown_token` (representation of any unknown token) and `reserved_tokens` +(reserved tokens). ```python @@ -172,18 +175,18 @@ frequency in the data set. ```python >>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.utils.count_tokens_from_str(text_data) +>>> counter = text.count_tokens_from_str(text_data) ``` -Let us define the fastText word embedding with pre-trained file `wiki.simple.vec`. +The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +Let us define the fastText word embedding instance with the pre-trained file `wiki.simple.vec`. 
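(If you are not sure which pre-trained file names are valid for a given embedding, they can be listed
first. This is a small aside using the `get_file_names` helper documented later on this page; the
output shown here is abridged and illustrative.)

```python
>>> text.embedding.get_file_names('fasttext')
['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec', ...]

```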
```python >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') ``` -The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. Suppose that we want to build indices for the most frequent 2 keys in `counter` and load the defined fastText word embedding for all these 2 words. @@ -207,8 +210,8 @@ Now we are ready to access the fastText word embedding vectors for indexed words ``` -Let us define the GloVe word embedding with pre-trained file `glove.6B.50d.txt`. Then, -we can re-assign a GloVe text embedding to the vocabulary. +Let us define the GloVe word embedding with the pre-trained file `glove.6B.50d.txt`. Then, +we can re-assign a GloVe text embedding instance to the vocabulary. ```python >>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt') @@ -231,17 +234,17 @@ Now we are ready to access the GloVe word embedding vectors for indexed words. ``` -If a token is unknown to `glossary`, its embedding vector is initialized according to the default -specification in `fasttext_simple` (all elements are 0). +If a token is unknown to `my_vocab`, its embedding vector is initialized according to the default +specification in `glove` (all elements are 0). ```python ->>> my_embedding.get_vecs_by_tokens('nice') +>>> my_vocab.embedding['nice'] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - + ``` @@ -251,10 +254,10 @@ specification in `fasttext_simple` (all elements are 0). To load token embeddings from an externally hosted pre-trained token embedding file, such as those of GloVe and FastText, use -[`embedding.create(embedding_name, file_name)`](#mxnet.text.embedding.create). +[`embedding.create(embedding_name, file_name)`](#mxnet.gluon.text.embedding.create). -To get all the available `embedding_name` and `pretrained_file_name`, use -[`embedding.get_file_names()`](#mxnet.text.embedding.get_file_names). +To get all the available `embedding_name` and `file_name`, use +[`embedding.get_file_names()`](#mxnet.gluon.text.embedding.get_file_names). ```python >>> text.embedding.get_file_names() @@ -263,12 +266,12 @@ To get all the available `embedding_name` and `pretrained_file_name`, use ``` -Alternatively, to load embedding vectors from a custom pre-trained text token -embedding file, use [`TokenEmbedding.from_file`](#mxnet.text.embedding.TokenEmbedding.from_file). +Alternatively, to load embedding vectors from a custom pre-trained text token embedding file, use +[`TokenEmbedding.from_file`](#mxnet.gluon.text.embedding.TokenEmbedding.from_file). ```eval_rst -.. currentmodule:: mxnet.text.embedding +.. currentmodule:: mxnet.gluon.text.embedding .. autosummary:: :nosignatures: @@ -280,83 +283,15 @@ embedding file, use [`TokenEmbedding.from_file`](#mxnet.text.embedding.TokenEmbe FastText ``` - - - - -### Indexed tokens are from the loaded embedding vectors - -One can also use all the tokens from the loaded embedding vectors, such as loaded from a pre-trained -token embedding file, as the indexed tokens of the embedding. - -To begin with, we can create a fastText word embedding object by specifying the embedding name -'fasttext' and the pre-trained file 'wiki.simple.vec'. The argument `init_unknown_vec` specifies -default vector representation for any unknown token. To index all the tokens from this pre-trained -word embedding file, we do not need to specify any vocabulary. - -```python ->>> my_embedding = text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec', -... 
init_unknown_vec=nd.zeros) - -``` - -We can access properties such as `token_to_idx` (mapping tokens to indices), `idx_to_token` (mapping -indices to tokens), `vec_len` (length of each embedding vector), and `unknown_token` (representation -of any unknown token, default value is '<unk>'). - -```python ->>> my_embedding.token_to_idx['nice'] -2586 ->>> my_embedding.idx_to_token[2586] -'nice' ->>> my_embedding.vec_len -300 ->>> my_embedding.unknown_token -'<unk>' - -``` - -For every unknown token, if its representation '<unk>' is encountered in the pre-trained token -embedding file, index 0 of property `idx_to_vec` maps to the pre-trained token embedding vector -loaded from the file; otherwise, index 0 of property `idx_to_vec` maps to the default token -embedding vector specified via `init_unknown_vec` (set to nd.zeros here). Since the pre-trained file -does not have a vector for the token '<unk>', index 0 has to map to an additional token '<unk>' and -the number of tokens in the embedding is 111,052. - - -```python ->>> len(my_embedding) -111052 ->>> my_embedding.idx_to_vec[0] - -[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - ->>> my_embedding.get_vecs_by_tokens('nice') - -[ 0.49397001 0.39996001 0.24000999 -0.15121 -0.087512 0.37114 - ... - 0.089521 0.29175001 -0.40917999 -0.089206 -0.1816 -0.36616999] - ->>> my_embedding.get_vecs_by_tokens(['unknownT0kEN', 'unknownT0kEN']) - -[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] - [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. - ... - 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] - - -``` +See [Assign token embedding to vocabulary](#Assign token embedding to vocabulary) for how to assign +token embeddings to vocabulary and use token embeddings. ### Implement a new text token embedding -For ``optimizer``, create a subclass of `mxnet.text.embedding.TokenEmbedding`. -Also add ``@mxnet.text.embedding.TokenEmbedding.register`` before this class. See -[`embedding.py`](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/text/embedding.py) +For ``embedding``, create a subclass of `mxnet.gluon.text.embedding.TokenEmbedding`. +Also add ``@mxnet.gluon.text.embedding.TokenEmbedding.register`` before this class. See +[`embedding.py`](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/text/embedding.py) for examples. @@ -365,7 +300,7 @@ for examples. The following functions provide utilities for text data processing. ```eval_rst -.. currentmodule:: mxnet.text.utils +.. currentmodule:: mxnet.gluon.text.utils .. autosummary:: :nosignatures: @@ -379,18 +314,18 @@ The following functions provide utilities for text data processing. ```eval_rst -.. automodule:: mxnet.text.embedding +.. automodule:: mxnet.gluon.text.embedding :members: register, create, get_file_names -.. autoclass:: mxnet.text.embedding.TokenEmbedding +.. autoclass:: mxnet.gluon.text.embedding.TokenEmbedding :members: from_file -.. autoclass:: mxnet.text.embedding.GloVe -.. autoclass:: mxnet.text.embedding.FastText +.. autoclass:: mxnet.gluon.text.embedding.GloVe +.. autoclass:: mxnet.gluon.text.embedding.FastText -.. automodule:: mxnet.text.vocab -.. autoclass:: mxnet.text.vocab.Vocabulary +.. automodule:: mxnet.gluon.text.vocab +.. autoclass:: mxnet.gluon.text.vocab.Vocabulary :members: set_embedding, to_tokens -.. automodule:: mxnet.text.utils +.. 
automodule:: mxnet.gluon.text.utils :members: count_tokens_from_str ``` diff --git a/python/mxnet/text/__init__.py b/python/mxnet/gluon/text/__init__.py similarity index 100% rename from python/mxnet/text/__init__.py rename to python/mxnet/gluon/text/__init__.py diff --git a/python/mxnet/text/_constants.py b/python/mxnet/gluon/text/_constants.py similarity index 100% rename from python/mxnet/text/_constants.py rename to python/mxnet/gluon/text/_constants.py diff --git a/python/mxnet/text/embedding.py b/python/mxnet/gluon/text/embedding.py similarity index 94% rename from python/mxnet/text/embedding.py rename to python/mxnet/gluon/text/embedding.py index 3ac038a663aa..f7441ba2931b 100644 --- a/python/mxnet/text/embedding.py +++ b/python/mxnet/gluon/text/embedding.py @@ -30,9 +30,9 @@ import zipfile from . import _constants as C -from .. import ndarray as nd -from .. import registry -from ..gluon.utils import check_sha1, download, _get_repo_file_url +from mxnet import ndarray as nd +from mxnet import registry +from mxnet.gluon.utils import check_sha1, download, _get_repo_file_url def register(embedding_cls): @@ -40,16 +40,16 @@ def register(embedding_cls): Once an embedding is registered, we can create an instance of this embedding with - :func:`~mxnet.text.embedding.create`. + :func:`~mxnet.gluon.text.embedding.create`. Examples -------- - >>> @mxnet.text.embedding.register - ... class MyTextEmbed(mxnet.text.embedding.TokenEmbedding): - ... def __init__(self, pretrained_file_name='my_pretrain_file'): + >>> @mxnet.gluon.text.embedding.register + ... class MyTextEmbed(mxnet.gluon.text.embedding.TokenEmbedding): + ... def __init__(self, file_name='my_pretrain_file'): ... pass - >>> embed = mxnet.text.embedding.create('MyTokenEmbed') + >>> embed = mxnet.gluon.text.embedding.create('MyTokenEmbed') >>> print(type(embed)) """ @@ -64,7 +64,7 @@ def create(embedding_name, **kwargs): Creates a token embedding instance by loading embedding vectors from an externally hosted pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid - `embedding_name` and `file_name`, use `mxnet.text.embedding.get_file_names()`. + `embedding_name` and `file_name`, use `mxnet.gluon.text.embedding.get_file_names()`. Parameters @@ -75,7 +75,7 @@ def create(embedding_name, **kwargs): Returns ------- - An instance of `mxnet.text.embedding.TokenEmbedding`: + An instance of `mxnet.gluon.text.embedding.TokenEmbedding`: A token embedding instance that loads embedding vectors from an externally hosted pre-trained token embedding file. """ @@ -89,10 +89,11 @@ def get_file_names(embedding_name=None): To load token embedding vectors from an externally hosted pre-trained token embedding file, - such as those of GloVe and FastText, one should use `mxnet.text.embedding.create(embedding_name, - file_name)`. This method returns all the valid names of `file_name` for the specified - `embedding_name`. If `embedding_name` is set to None, this method returns all the valid - names of `embedding_name` with their associated `file_name`. + such as those of GloVe and FastText, one should use + `mxnet.gluon.text.embedding.create(embedding_name, file_name)`. This method returns all the + valid names of `file_name` for the specified `embedding_name`. If `embedding_name` is set to + None, this method returns all the valid names of `embedding_name` with their associated + `file_name`. Parameters @@ -108,7 +109,7 @@ def get_file_names(embedding_name=None): for the specified token embedding name (`embedding_name`). 
If the text embeding name is set to None, returns a dict mapping each valid token embedding name to a list of valid pre-trained files (`file_name`). They can be plugged into - `mxnet.text.embedding.create(embedding_name, file_name)`. + `mxnet.gluon.text.embedding.create(embedding_name, file_name)`. """ text_embedding_reg = registry.get_registry(TokenEmbedding) @@ -129,12 +130,12 @@ class TokenEmbedding(object): To load token embedding from an externally hosted pre-trained token embedding file, such as - those of GloVe and FastText, use :func:`~mxnet.text.embedding.create(embedding_name, + those of GloVe and FastText, use :func:`~mxnet.gluon.text.embedding.create(embedding_name, file_name)`. To get all the available `embedding_name` and `file_name`, use - :func:`~mxnet.text.embedding.get_file_names()`. + :func:`~mxnet.gluon.text.embedding.get_file_names()`. Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use - :func:`~mxnet.text.embedding.from_file()`. + :func:`~mxnet.gluon.text.embedding.from_file()`. For every unknown token, if its representation `self.unknown_token` is encountered in the pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token @@ -416,7 +417,7 @@ def from_file(file_path, elem_delim=' ', encoding='utf8', init_unknown_vec=nd.ze Returns ------- - instance of `~mxnet.text.embedding.TokenEmbedding` + instance of `~mxnet.gluon.text.embedding.TokenEmbedding` The user-defined token embedding instance. """ embedding = TokenEmbedding(**kwargs) diff --git a/python/mxnet/text/utils.py b/python/mxnet/gluon/text/utils.py similarity index 100% rename from python/mxnet/text/utils.py rename to python/mxnet/gluon/text/utils.py diff --git a/python/mxnet/text/vocab.py b/python/mxnet/gluon/text/vocab.py similarity index 84% rename from python/mxnet/text/vocab.py rename to python/mxnet/gluon/text/vocab.py index 04ae6a787666..485455f95324 100644 --- a/python/mxnet/text/vocab.py +++ b/python/mxnet/gluon/text/vocab.py @@ -26,7 +26,7 @@ from . import _constants as C from .embedding import TokenEmbedding -from .. import nd +from mxnet import nd class Vocabulary(object): @@ -66,7 +66,7 @@ class Vocabulary(object): Properties ---------- - embedding : instance of :class:`~mxnet.text.embedding.TokenEmbedding` + embedding : instance of :class:`~mxnet.gluon.text.embedding.TokenEmbedding` The embedding of the indexed tokens. idx_to_token : list of strs A list of indexed tokens where the list indices and the token indices are aligned. @@ -77,6 +77,50 @@ class Vocabulary(object): unknown_token : hashable object The representation for any unknown token. In other words, any unknown token will be indexed as the same representation. + + + Examples + -------- + >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + >>> text_data = " hello world \n hello nice world \n hi world \n" + >>> counter = text.count_tokens_from_str(text_data) + >>> my_vocab = text.Vocabulary(counter, embedding=fasttext) + >>> my_vocab.embedding[['hello', 'world']] + [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... 
+ -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + + >>> my_vocab[['hello', 'world']] + [2, 1] + + >>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape + >>> layer = gluon.nn.Embedding(input_dim, output_dim) + >>> layer.initialize() + >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec) + >>> layer(nd.array([2, 1])) + [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 + ... + -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02] + [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01 + ... + -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]] + + + >>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt') + >>> my_vocab.set_embedding(glove) + >>> my_vocab.embedding[['hello', 'world']] + [[ -0.38497001 0.80092001 + ... + 0.048833 0.67203999] + [ -0.41486001 0.71847999 + ... + -0.37639001 -0.67541999]] + + """ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', @@ -214,7 +258,7 @@ def set_embedding(self, embeddings): Parameters ---------- - embeddings : instance or list of instances of :class:`~mxnet.text.embedding.TokenEmbedding` + embeddings : :class:`~mxnet.gluon.text.embedding.TokenEmbedding` instance or instance list The embedding to be assigned to the indexed tokens. If a list of multiple embeddings are provided, their embedding vectors will be concatenated for the same token. """ @@ -225,7 +269,7 @@ def set_embedding(self, embeddings): for embedding in embeddings: assert isinstance(embedding, TokenEmbedding), \ 'The argument `embeddings` must be an instance or a list of instances of ' \ - '`mxnet.text.embedding.TokenEmbedding`.' + '`mxnet.gluon.text.embedding.TokenEmbedding`.' new_embedding = TokenEmbedding(self.unknown_token) new_embedding._token_to_idx = self.token_to_idx diff --git a/tests/python/unittest/test_text.py b/tests/python/unittest/test_gluon_text.py similarity index 99% rename from tests/python/unittest/test_text.py rename to tests/python/unittest/test_gluon_text.py index 00e5175488d5..3b59e5f425b6 100644 --- a/tests/python/unittest/test_text.py +++ b/tests/python/unittest/test_gluon_text.py @@ -24,8 +24,8 @@ from common import assertRaises from mxnet import ndarray as nd +from mxnet.gluon import text from mxnet.test_utils import * -from mxnet import text def _get_test_str_of_tokens(token_delim, seq_delim): @@ -89,8 +89,8 @@ def test_vocabulary_getitem(): def test_vocabulary_to_tokens(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) - vocab = text.Vocabulary(counter, max_size=None, min_freq=1, - unknown_token='', reserved_tokens=None) + vocab = text.Vocabulary(counter, max_size=None, min_freq=1,unknown_token='', + reserved_tokens=None) i1 = vocab.to_tokens(1) assert i1 == 'c' From 5ba222570f2d195efd3670002d88d001229d996e Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Tue, 13 Mar 2018 17:27:44 -0700 Subject: [PATCH 15/20] fix import order --- python/mxnet/gluon/text/embedding.py | 4 ++-- python/mxnet/gluon/text/vocab.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/gluon/text/embedding.py b/python/mxnet/gluon/text/embedding.py index f7441ba2931b..1879f6113c73 100644 --- a/python/mxnet/gluon/text/embedding.py +++ b/python/mxnet/gluon/text/embedding.py @@ -29,10 +29,10 @@ import warnings import zipfile -from . import _constants as C -from mxnet import ndarray as nd +from mxnet import nd from mxnet import registry from mxnet.gluon.utils import check_sha1, download, _get_repo_file_url +from . 
import _constants as C def register(embedding_cls): diff --git a/python/mxnet/gluon/text/vocab.py b/python/mxnet/gluon/text/vocab.py index 485455f95324..2e141172d363 100644 --- a/python/mxnet/gluon/text/vocab.py +++ b/python/mxnet/gluon/text/vocab.py @@ -23,10 +23,10 @@ from __future__ import print_function import collections +from mxnet import nd from . import _constants as C from .embedding import TokenEmbedding -from mxnet import nd class Vocabulary(object): From 47d7ed455997b5009234a0806da8e3286cfafea8 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Tue, 13 Mar 2018 18:01:38 -0700 Subject: [PATCH 16/20] re-test --- python/mxnet/gluon/text/embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mxnet/gluon/text/embedding.py b/python/mxnet/gluon/text/embedding.py index 1879f6113c73..fd5d6d741f1c 100644 --- a/python/mxnet/gluon/text/embedding.py +++ b/python/mxnet/gluon/text/embedding.py @@ -105,10 +105,10 @@ def get_file_names(embedding_name=None): Returns ------- dict or list: - A list of all the valid pre-trained token embedding file names (`file_name`) - for the specified token embedding name (`embedding_name`). If the text embeding name is - set to None, returns a dict mapping each valid token embedding name to a list of valid - pre-trained files (`file_name`). They can be plugged into + A list of all the valid pre-trained token embedding file names (`file_name`) for the + specified token embedding name (`embedding_name`). If the text embeding name is set to None, + returns a dict mapping each valid token embedding name to a list of valid pre-trained files + (`file_name`). They can be plugged into `mxnet.gluon.text.embedding.create(embedding_name, file_name)`. """ From 63923dbd7f3fc9d85116b7f87189720ea0bb1b5c Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Wed, 14 Mar 2018 07:13:55 -0700 Subject: [PATCH 17/20] relative imports --- python/mxnet/gluon/text/embedding.py | 6 +++--- python/mxnet/gluon/text/vocab.py | 8 ++++---- tests/python/unittest/test_gluon_text.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/mxnet/gluon/text/embedding.py b/python/mxnet/gluon/text/embedding.py index fd5d6d741f1c..4ff504ade6f7 100644 --- a/python/mxnet/gluon/text/embedding.py +++ b/python/mxnet/gluon/text/embedding.py @@ -29,10 +29,10 @@ import warnings import zipfile -from mxnet import nd -from mxnet import registry -from mxnet.gluon.utils import check_sha1, download, _get_repo_file_url from . import _constants as C +from ... import nd +from ... import registry +from ..utils import check_sha1, download, _get_repo_file_url def register(embedding_cls): diff --git a/python/mxnet/gluon/text/vocab.py b/python/mxnet/gluon/text/vocab.py index 2e141172d363..06124263aaee 100644 --- a/python/mxnet/gluon/text/vocab.py +++ b/python/mxnet/gluon/text/vocab.py @@ -23,10 +23,10 @@ from __future__ import print_function import collections -from mxnet import nd +from ... import nd from . import _constants as C -from .embedding import TokenEmbedding +from . import embedding as ebd class Vocabulary(object): @@ -267,11 +267,11 @@ def set_embedding(self, embeddings): embeddings = [embeddings] for embedding in embeddings: - assert isinstance(embedding, TokenEmbedding), \ + assert isinstance(embedding, ebd.TokenEmbedding), \ 'The argument `embeddings` must be an instance or a list of instances of ' \ '`mxnet.gluon.text.embedding.TokenEmbedding`.' 
- new_embedding = TokenEmbedding(self.unknown_token) + new_embedding = ebd.TokenEmbedding(self.unknown_token) new_embedding._token_to_idx = self.token_to_idx new_embedding._idx_to_token = self.idx_to_token diff --git a/tests/python/unittest/test_gluon_text.py b/tests/python/unittest/test_gluon_text.py index 3b59e5f425b6..a11f8377dd47 100644 --- a/tests/python/unittest/test_gluon_text.py +++ b/tests/python/unittest/test_gluon_text.py @@ -648,7 +648,7 @@ def test_vocabulary_with_two_custom_embeddings(): def test_download_embed(): @text.embedding.register - class Test(text.TokenEmbedding): + class Test(text.embedding.TokenEmbedding): # 33 bytes. pretrained_file_name_sha1 = \ {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} From 616cff9599ea965e31b02feb4d248ab61a772f0d Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Wed, 14 Mar 2018 09:37:31 -0700 Subject: [PATCH 18/20] re-run test --- docs/api/python/gluon/text.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/python/gluon/text.md b/docs/api/python/gluon/text.md index 6268c049d141..0ce2a7e6305c 100644 --- a/docs/api/python/gluon/text.md +++ b/docs/api/python/gluon/text.md @@ -6,7 +6,7 @@ The `mxnet.gluon.text` APIs refer to classes and functions related to text data as bulding indices and loading pre-trained embedding vectors for text tokens and storing them in the `mxnet.ndarray.NDArray` format. -This document lists the text APIs in mxnet.gluon: +This document lists the text APIs in `mxnet.gluon`: ```eval_rst .. autosummary:: From 14735e1ad2317951e898786f3738ef49dc388a02 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Wed, 14 Mar 2018 18:58:44 -0700 Subject: [PATCH 19/20] revise implementation, test case, and api doc --- docs/api/python/gluon/text.md | 73 ++++++++++++++---------- python/mxnet/gluon/text/embedding.py | 11 ++-- python/mxnet/gluon/text/vocab.py | 44 +++++++------- tests/python/unittest/test_gluon_text.py | 18 ++++-- 4 files changed, 82 insertions(+), 64 deletions(-) diff --git a/docs/api/python/gluon/text.md b/docs/api/python/gluon/text.md index 0ce2a7e6305c..f9fcf3353563 100644 --- a/docs/api/python/gluon/text.md +++ b/docs/api/python/gluon/text.md @@ -28,35 +28,44 @@ imported. ``` -### Access pre-trained word embeddings for indexed words +### Indexing words and using pre-trained word embeddings in `gluon` -As a common use case, let us access pre-trained word embedding vectors for indexed words in just a -few lines of code. +As a common use case, let us index words, attach pre-trained word embeddings for them, and use +such embeddings in `gluon` in just a few lines of code. -To begin with, let us create a fastText word embedding instance by specifying the embedding name -`fasttext` and the pre-trained file name `wiki.simple.vec`. +To begin with, suppose that we have a simple text data set in the string format. We can count word +frequency in the data set. ```python ->>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') +>>> text_data = " hello world \n hello nice world \n hi world \n" +>>> counter = text.count_tokens_from_str(text_data) ``` -Now, suppose that we have a simple text data set in the string format. We can count word frequency -in the data set. +The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +This allows us to filter out infrequent words (See details at +[Vocabulary API specifications](#mxnet.gluon.text.vocab.Vocabulary)). +Suppose that we want to build indices for all the keys in `counter`. 
We need a Vocabulary instance +with `counter` as its argument. ```python ->>> text_data = " hello world \n hello nice world \n hi world \n" ->>> counter = text.count_tokens_from_str(text_data) +>>> my_vocab = text.Vocabulary(counter) ``` -The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. -Suppose that we want to build indices for all the keys in `counter` and load the defined fastText -word embedding for all such indexed words. We need a Vocabulary instance with `counter` and -`fasttext` as its arguments. +To attach word embedding to indexed words in `my_vocab`, let us go on to create a fastText word +embedding instance by specifying the embedding name `fasttext` and the pre-trained file name +`wiki.simple.vec`. ```python ->>> my_vocab = text.Vocabulary(counter, embedding=fasttext) +>>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + +``` + +So we can attach word embedding `fasttext` to indexed words `my_vocab`. + +```python +>>> my_vocab.set_embedding(fasttext) ``` @@ -76,8 +85,6 @@ and 'world'. ``` -### Using pre-trained word embeddings in `gluon` - To demonstrate how to use pre-trained word embeddings in the `gluon` package, let us first obtain indices of the words 'hello' and 'world'. @@ -110,7 +117,7 @@ indices (2 and 1) and the weight matrix `my_vocab.embedding.idx_to_vec` in ## Vocabulary -The vocabulary builds indices for text tokens and can be assigned with token embeddings. The input +The vocabulary builds indices for text tokens and can be attached with token embeddings. The input counter whose keys are candidate indices may be obtained via [`count_tokens_from_str`](#mxnet.gluon.text.utils.count_tokens_from_str). @@ -133,8 +140,9 @@ data set. ``` The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. -Suppose that we want to build indices for the 2 most frequent keys in `counter` with the unknown -token representation '(unk)' and a reserved token '(pad)'. +This allows us to filter out infrequent words. Suppose that we want to build indices for the 2 most +frequent keys in `counter` with the unknown token representation '(unk)' and a reserved token +'(pad)'. ```python >>> my_vocab = text.Vocabulary(counter, max_size=2, unknown_token='(unk)', @@ -166,9 +174,9 @@ Besides the specified unknown token '(unk)' and reserved_token '(pad)' are index frequent words 'world' and 'hello' are also indexed. -### Assign token embedding to vocabulary +### Attach token embedding to vocabulary -A vocabulary instance can be assigned with token embedding. +A vocabulary instance can be attached with token embedding. To begin with, suppose that we have a simple text data set in the string format. We can count word frequency in the data set. @@ -180,6 +188,14 @@ frequency in the data set. ``` The obtained `counter` has key-value pairs whose keys are words and values are word frequencies. +This allows us to filter out infrequent words. +Suppose that we want to build indices for the most frequent 2 keys in `counter`. + +```python +>>> my_vocab = text.Vocabulary(counter, max_size=2) + +``` + Let us define the fastText word embedding instance with the pre-trained file `wiki.simple.vec`. ```python @@ -187,15 +203,14 @@ Let us define the fastText word embedding instance with the pre-trained file `wi ``` -Suppose that we want to build indices for the most frequent 2 keys in `counter` and load the defined -fastText word embedding for all these 2 words. 
+So we can attach word embedding `fasttext` to indexed words `my_vocab`. ```python ->>> my_vocab = text.vocab.Vocabulary(counter, max_size=2, embedding=fasttext) +>>> my_vocab.set_embedding(fasttext) ``` -Now we are ready to access the fastText word embedding vectors for indexed words. +Now we are ready to access the fastText word embedding vectors for the indexed words. ```python >>> my_vocab.embedding[['hello', 'world']] @@ -211,7 +226,7 @@ Now we are ready to access the fastText word embedding vectors for indexed words ``` Let us define the GloVe word embedding with the pre-trained file `glove.6B.50d.txt`. Then, -we can re-assign a GloVe text embedding instance to the vocabulary. +we can re-attach a GloVe text embedding instance to the vocabulary. ```python >>> glove = text.embedding.create('glove', file_name='glove.6B.50d.txt') @@ -219,7 +234,7 @@ we can re-assign a GloVe text embedding instance to the vocabulary. ``` -Now we are ready to access the GloVe word embedding vectors for indexed words. +Now we are ready to access the GloVe word embedding vectors for the indexed words. ```python >>> my_vocab.embedding[['hello', 'world']] @@ -283,7 +298,7 @@ Alternatively, to load embedding vectors from a custom pre-trained text token em FastText ``` -See [Assign token embedding to vocabulary](#Assign token embedding to vocabulary) for how to assign +See [Assign token embedding to vocabulary](#assign-token-embedding-to-vocabulary) for how to attach token embeddings to vocabulary and use token embeddings. diff --git a/python/mxnet/gluon/text/embedding.py b/python/mxnet/gluon/text/embedding.py index 4ff504ade6f7..1839212ee825 100644 --- a/python/mxnet/gluon/text/embedding.py +++ b/python/mxnet/gluon/text/embedding.py @@ -392,14 +392,13 @@ def from_file(file_path, elem_delim=' ', encoding='utf8', init_unknown_vec=nd.ze This is to load embedding vectors from a user-defined pre-trained token embedding file. - Denote by '(ed)' the argument `elem_delim`. Denote by (v_ij) the j-th element of the token - embedding vector for (token_i), the expected format of a custom pre-trained token embedding - file is: + For example, if `elem_delim` = ' ', the expected format of a custom pre-trained token + embedding file may look like: - '(token_1)(ed))v_11)(ed)(v_12)(ed)...(ed)(v_1k)\\\\n - (token_2)(ed)(v_21)(ed)(v_22)(ed)...(ed)(v_2k)\\\\n...' + 'hello 0.1 0.2 0.3 0.4 0.5\\\\nworld 1.1 1.2 1.3 1.4 1.5\\\\n' - where k is the length of the embedding vector `vec_len`. + where embedding vectors of words `hello` and `world` are [0.1, 0.2, 0.3, 0.4, 0.5] and + [1.1, 1.2, 1.3, 1.4, 1.5] respectively. Parameters diff --git a/python/mxnet/gluon/text/vocab.py b/python/mxnet/gluon/text/vocab.py index 06124263aaee..841d517c8d42 100644 --- a/python/mxnet/gluon/text/vocab.py +++ b/python/mxnet/gluon/text/vocab.py @@ -30,7 +30,7 @@ class Vocabulary(object): - """Indexing and embedding assignment for text tokens. + """Indexing and embedding attachment for text tokens. Parameters @@ -55,13 +55,11 @@ class Vocabulary(object): as the same representation. Keys of `counter`, `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. reserved_tokens : list of hashable objects or None, default None - A list of reserved tokens that will always be indexed, such as special symbols representing - padding, beginning of sentence, and end of sentence. It cannot contain `unknown_token`, or - duplicate reserved tokens. 
Keys of `counter`, `unknown_token`, and values of - `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple. - embedding : instance or list of instances of `embedding.TokenEmbedding`, default None - The embedding to be assigned to the indexed tokens. If a list of multiple embeddings are - provided, their embedding vectors will be concatenated for the same token. + A list of reserved tokens (excluding `unknown_token`) that will always be indexed, such as + special symbols representing padding, beginning of sentence, and end of sentence. It cannot + contain `unknown_token`, or duplicate reserved tokens. Keys of `counter`, `unknown_token`, + and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and + tuple. Properties @@ -81,10 +79,12 @@ class Vocabulary(object): Examples -------- - >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') - >>> text_data = " hello world \n hello nice world \n hi world \n" + + >>> text_data = " hello world \\\\n hello nice world \\\\n hi world \\\\n" >>> counter = text.count_tokens_from_str(text_data) - >>> my_vocab = text.Vocabulary(counter, embedding=fasttext) + >>> my_vocab = text.Vocabulary(counter) + >>> fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec') + >>> my_vocab.set_embedding(fasttext) >>> my_vocab.embedding[['hello', 'world']] [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01 ... @@ -124,7 +124,7 @@ class Vocabulary(object): """ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None, embedding=None): + reserved_tokens=None): # Sanity checks. assert min_freq > 0, '`min_freq` must be set to a positive value.' @@ -141,10 +141,7 @@ def __init__(self, counter=None, max_size=None, min_freq=1, unknown_token=' if counter is not None: self._index_counter_keys(counter, unknown_token, reserved_tokens, max_size, min_freq) - if embedding is None: - self._embedding = None - else: - self.set_embedding(embedding) + self._embedding = None def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens): """Indexes unknown and reserved tokens.""" @@ -252,19 +249,20 @@ def __getitem__(self, tokens): def __len__(self): return len(self._idx_to_token) - def set_embedding(self, embeddings): - """Assigns embeddings to the indexed text tokens. + def set_embedding(self, *embeddings): + """Attaches embeddings to the indexed text tokens. Parameters ---------- - embeddings : :class:`~mxnet.gluon.text.embedding.TokenEmbedding` instance or instance list - The embedding to be assigned to the indexed tokens. If a list of multiple embeddings are - provided, their embedding vectors will be concatenated for the same token. + embeddings : None or tuple of :class:`~mxnet.gluon.text.embedding.TokenEmbedding` instances + The embedding to be attached to the indexed tokens. If a tuple of multiple embeddings + are provided, their embedding vectors will be concatenated for the same token. 
""" - if not isinstance(embeddings, (list, tuple)): - embeddings = [embeddings] + if len(embeddings) == 1 and embeddings[0] is None: + self._embedding = None + return for embedding in embeddings: assert isinstance(embedding, ebd.TokenEmbedding), \ diff --git a/tests/python/unittest/test_gluon_text.py b/tests/python/unittest/test_gluon_text.py index a11f8377dd47..ad4f4a036e09 100644 --- a/tests/python/unittest/test_gluon_text.py +++ b/tests/python/unittest/test_gluon_text.py @@ -511,8 +511,11 @@ def test_vocab_set_embedding_with_one_custom_embedding(): [1, 1, 1, 1, 1]]) ) + v1.set_embedding(None) + assert v1.embedding is None + -def test_vocabulary_with_two_custom_embeddings(): +def test_vocab_set_embedding_with_two_custom_embeddings(): embed_root = '.' embed_name = 'my_embed' elem_delim = '\t' @@ -532,9 +535,12 @@ def test_vocabulary_with_two_custom_embeddings(): counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$']) v1 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', - reserved_tokens=None, embedding=[my_embed1, my_embed2]) + reserved_tokens=None) + v1.set_embedding(my_embed1, my_embed2) assert v1.embedding is not None + assertRaises(AssertionError, v1.set_embedding, my_embed1, None, my_embed2) + assert_almost_equal(v1.embedding.idx_to_vec.asnumpy(), np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0.06, 0.07, 0.08, 0.09, 0.1], @@ -581,7 +587,7 @@ def test_vocabulary_with_two_custom_embeddings(): v2 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v2.set_embedding([my_embed3, my_embed4]) + v2.set_embedding(my_embed3, my_embed4) assert_almost_equal(v2.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -597,7 +603,7 @@ def test_vocabulary_with_two_custom_embeddings(): v3 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v3.set_embedding([my_embed3, my_embed4]) + v3.set_embedding(my_embed3, my_embed4) assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -613,7 +619,7 @@ def test_vocabulary_with_two_custom_embeddings(): v4 = text.Vocabulary(counter, max_size=None, min_freq=1, unknown_token='', reserved_tokens=None) - v4.set_embedding([my_embed3, my_embed4]) + v4.set_embedding(my_embed3, my_embed4) assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(), np.array([[1.1, 1.2, 1.3, 1.4, 1.5, 0.11, 0.12, 0.13, 0.14, 0.15], @@ -631,7 +637,7 @@ def test_vocabulary_with_two_custom_embeddings(): v5 = text.Vocabulary(counter2, max_size=None, min_freq=1, unknown_token='a', reserved_tokens=None) - v5.set_embedding([my_embed3, my_embed4]) + v5.set_embedding(my_embed3, my_embed4) assert v5.embedding._token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3} assert v5.embedding._idx_to_token == ['a', 'c', 'b', 'some_word$'] assert_almost_equal(v5.embedding.idx_to_vec.asnumpy(), From 240ef86b634d1e851494185d5501933eff5cc311 Mon Sep 17 00:00:00 2001 From: Aston Zhang Date: Thu, 15 Mar 2018 09:21:11 -0700 Subject: [PATCH 20/20] re-test --- python/mxnet/gluon/text/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/text/vocab.py b/python/mxnet/gluon/text/vocab.py index 841d517c8d42..aa962af24285 100644 --- a/python/mxnet/gluon/text/vocab.py +++ b/python/mxnet/gluon/text/vocab.py @@ -57,7 +57,7 @@ class Vocabulary(object): reserved_tokens : list of hashable objects or None, default None A list of reserved tokens 
(excluding `unknown_token`) that will always be indexed, such as special symbols representing padding, beginning of sentence, and end of sentence. It cannot - contain `unknown_token`, or duplicate reserved tokens. Keys of `counter`, `unknown_token`, + contain `unknown_token` or duplicate reserved tokens. Keys of `counter`, `unknown_token`, and values of `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple.
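
For reference, below is a minimal end-to-end sketch of the workflow this patch series converges on:
count tokens, build a `Vocabulary`, attach pre-trained vectors with `set_embedding`, and reuse them
as the weight of `gluon.nn.Embedding`. It assumes the patches above are applied and that the
pre-trained file `wiki.simple.vec` can be downloaded; the calls mirror the examples in
`docs/api/python/gluon/text.md` and `tests/python/unittest/test_gluon_text.py` above.

```python
# Minimal sketch of the mxnet.gluon.text workflow after this patch series is applied.
# Assumes network access the first time 'wiki.simple.vec' is fetched.
from mxnet import gluon, nd
from mxnet.gluon import text

# Count token frequencies from a whitespace/newline-delimited string.
text_data = " hello world \n hello nice world \n hi world \n"
counter = text.count_tokens_from_str(text_data)

# Build indices for the counted tokens.
my_vocab = text.Vocabulary(counter)

# Attach pre-trained fastText vectors to the indexed tokens.
fasttext = text.embedding.create('fasttext', file_name='wiki.simple.vec')
my_vocab.set_embedding(fasttext)

# Look up token indices and embedding vectors through the vocabulary.
indices = my_vocab[['hello', 'world']]            # e.g. [2, 1]
vectors = my_vocab.embedding[['hello', 'world']]  # one embedding vector per token

# Reuse the attached vectors as the weight of a gluon embedding layer.
input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
layer.weight.set_data(my_vocab.embedding.idx_to_vec)
print(layer(nd.array(indices)))
```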