From d76897b4368f1402672df4c1cc6becfca2df2402 Mon Sep 17 00:00:00 2001
From: phile
Date: Tue, 28 Jul 2020 10:10:13 +0800
Subject: [PATCH 1/4] Add embedding related methods in numpy version (#1263)

* A draft for embedding
* fix embed_loader
* add hyperbolic space and some updates
* revise evaluation
* fix
* simple fixes
* move l2norm to op.py
* new features
* fix
* update
* add tests, update
* newline
---
 src/gluonnlp/__init__.py               |    1 +
 src/gluonnlp/attention_cell.py         |   19 +-
 src/gluonnlp/embedding/__init__.py     |   24 +
 src/gluonnlp/embedding/_constants.py   | 1002 ++++++++++++++++++++++++
 src/gluonnlp/embedding/embed_loader.py |  320 ++++
 src/gluonnlp/op.py                     |   19 +
 tests/test_embedding.py                |   50 ++
 7 files changed, 1417 insertions(+), 18 deletions(-)
 create mode 100644 src/gluonnlp/embedding/__init__.py
 create mode 100644 src/gluonnlp/embedding/_constants.py
 create mode 100644 src/gluonnlp/embedding/embed_loader.py
 create mode 100644 tests/test_embedding.py

diff --git a/src/gluonnlp/__init__.py b/src/gluonnlp/__init__.py
index 8eb18ab075..31e7e08557 100644
--- a/src/gluonnlp/__init__.py
+++ b/src/gluonnlp/__init__.py
@@ -12,3 +12,4 @@
 from . import optimizer
 from . import registry
 from . import sequence_sampler
+from . import embedding
diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py
index 5b292f9823..c5288ae087 100644
--- a/src/gluonnlp/attention_cell.py
+++ b/src/gluonnlp/attention_cell.py
@@ -20,6 +20,7 @@
 import mxnet as mx
 from mxnet.gluon.block import HybridBlock
 from mxnet.gluon import nn
+from .op import l2_normalize
 from .layers import SinusoidalPositionalEmbedding,\
     BucketPositionalEmbedding,\
     LearnedPositionalEmbedding
@@ -300,24 +301,6 @@ def masked_logsoftmax(F, att_score, mask, dtype=np.float32, axis: int = -1):
     return logits
 
-
-def l2_normalize(F, data, axis=-1, eps=1E-6):
-    """Normalize the data by L2 normalization.
-
-    Parameters
-    ----------
-    F : mx.sym or mx.nd
-    data : symbol or ndarray
-    axis : int, default -1
-    eps : float, default 1E-6
-
-    Returns
-    -------
-    ret : mx.sym or mx.nd
-    """
-    ret = data / (F.np.linalg.norm(data, axis=axis, keepdims=True) + eps)
-    return ret
-
 
 # TODO(sxjscience) Default to einsum. Current it is not the default because
 # 1) einsum is super-slow: https://github.com/apache/incubator-mxnet/issues/18043
 def dot_attn_score(F, query, key, scaled=True, normalized=False, eps=1E-6,
diff --git a/src/gluonnlp/embedding/__init__.py b/src/gluonnlp/embedding/__init__.py
new file mode 100644
index 0000000000..73b1b54178
--- /dev/null
+++ b/src/gluonnlp/embedding/__init__.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=wildcard-import
+"""Word embeddings."""
+
+from . 
import embed_loader +from .embed_loader import * + +__all__ = (embed_loader.__all__ ) diff --git a/src/gluonnlp/embedding/_constants.py b/src/gluonnlp/embedding/_constants.py new file mode 100644 index 0000000000..1c7921d313 --- /dev/null +++ b/src/gluonnlp/embedding/_constants.py @@ -0,0 +1,1002 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Constants.""" + +GLOVE_NPZ_SHA1 = \ + {'glove.42B.300d': ('glove.42B.300d.npz', + '7deee8f4860744db53ed9e50892effe9883e6d89'), + 'glove.6B.100d': ('glove.6B.100d.npz', + '01f80f202fcabcc3e0804898349087bfc191dd1c'), + 'glove.6B.200d': ('glove.6B.200d.npz', + '5e6e2bdab346c257f88d80d215d518e680d86e32'), + 'glove.6B.300d': ('glove.6B.300d.npz', + '1db264aa936be62f055dfb72854204450bdf4399'), + 'glove.6B.50d': ('glove.6B.50d.npz', + 'aa16be8d184399d2199f83fd62586f2c30497bfa'), + 'glove.840B.300d': ('glove.840B.300d.npz', + 'b4ba390c1154736e07c0e67d9180935f5930e83c'), + 'glove.twitter.27B.100d': ('glove.twitter.27B.100d.npz', + '0f7b82c223451d0002f79ba23596983cdbe0e2b1'), + 'glove.twitter.27B.200d': ('glove.twitter.27B.200d.npz', + '41cc2d26f58a54622ce96bf6c8434360ab524f20'), + 'glove.twitter.27B.25d': ('glove.twitter.27B.25d.npz', + '9f563d2f296995598cc46812b2fda05ad4c3c879'), + 'glove.twitter.27B.50d': ('glove.twitter.27B.50d.npz', + 'ce9959c056f2a0a780c468feeb4f823af51630e9')} + +WORD2VEC_NPZ_SHA1 = { + 'GoogleNews-vectors-negative300': ('GoogleNews-vectors-negative300-be6d6f98.npz', + 'be6d6f98609bd65db8f6991ecaed923f1b1c8377'), + 'freebase-vectors-skipgram1000-en': ('freebase-vectors-skipgram1000-en-6086803e.npz', + '6086803e4fd0b60e12b79031d585ef2c63ca71e6'), + 'freebase-vectors-skipgram1000': ('freebase-vectors-skipgram1000-827a5d3a.npz', + '827a5d3a777ea3de21be4f61dad4de6510d77ee3') +} + +FAST_TEXT_NPZ_SHA1 = { + 'crawl-300d-2M': ('crawl-300d-2M.npz', '9dd611a1fe280c63050cd546d3595400fc0eede4'), + 'crawl-300d-2M-subword': ('crawl-300d-2M-subword-927782c8e.npz', + '927782c8ec8c2c1deb6a8a550217478e183ca25a'), + 'wiki.aa': ('wiki.aa.npz', '48f163b80eb37f1806142169d3d4c05cf75b7339'), + 'wiki.ab': ('wiki.ab.npz', '860ceff119dd27e5b701b605879037c1310cbc3e'), + 'wiki.ace': ('wiki.ace.npz', '62938287464040491719f56a6f521f8f808beee8'), + 'wiki.ady': ('wiki.ady.npz', '646843afa260d018ed711df3f1ca9c3e000447b6'), + 'wiki.af': ('wiki.af.npz', '7b14cd27690b67fea318d0bac2283c16430680e2'), + 'wiki.ak': ('wiki.ak.npz', '20f309adad1c45958c97b6055d5838e05bbaea72'), + 'wiki.als': ('wiki.als.npz', 'a8b03aa133c4f7da12fc27c2b167b7918b1e9805'), + 'wiki.am': ('wiki.am.npz', 'ed3dd10cea64737f7a1623612ee099df9dc19f66'), + 'wiki.ang': ('wiki.ang.npz', '8efe64706d9d6b8eae38b2c7ff0b277e20592bc7'), + 'wiki.an': ('wiki.an.npz', '168046283c719ab96a29b1abae2e25a6575c7be8'), + 'wiki.arc': ('wiki.arc.npz', 
'049021b7decea4bc009b12936e56b4dbf5b760e7'), + 'wiki.ar': ('wiki.ar.npz', '7e325e1e98dfcdc9368d2ebe40ee834a2ed44912'), + 'wiki.arz': ('wiki.arz.npz', '7d851c2c7be3ee6f7fd896de7b76ea08e3fb08b0'), + 'wiki.as': ('wiki.as.npz', '01d38c29cd4bd99c1a8534abc058822da14a5b9c'), + 'wiki.ast': ('wiki.ast.npz', '9c9846ba5084505a0adea89c95c66e04efbf5ce9'), + 'wiki.av': ('wiki.av.npz', '7ef6a920c364638504e673cfde5f7675503fa81e'), + 'wiki.ay': ('wiki.ay.npz', 'c1202e110930e3902397f5cb64a8359e013b469f'), + 'wiki.azb': ('wiki.azb.npz', '10351b7ef14ec2cb610d290cb6a3f6987ef5d8b3'), + 'wiki.az': ('wiki.az.npz', '74257c3bcd533a606afae509ea835dc036d61546'), + 'wiki.ba': ('wiki.ba.npz', '4a2857ed694d66864df562b376c2fa12fcb03646'), + 'wiki.bar': ('wiki.bar.npz', 'e65c6b7e9ff83798d1eea05d166148837d53e615'), + 'wiki.bat_smg': ('wiki.bat_smg.npz', '6420584ae28ba6c9dd145fea8f096243d457c2d8'), + 'wiki.bcl': ('wiki.bcl.npz', '33606c970ab336b678393e2bdb8af2116d11cf7b'), + 'wiki.be': ('wiki.be.npz', '84487d341e333344cf71bc12c7a205d923762498'), + 'wiki.bg': ('wiki.bg.npz', '56f2a175b1a1d1a9cf9f1cea277cd0b46ffd7f66'), + 'wiki.bh': ('wiki.bh.npz', '07473989853a344a41aaa18f41030dc56d0d01c7'), + 'wiki.bi': ('wiki.bi.npz', '08adfa3c9ef3016d30ef69ea539d217ff67eda09'), + 'wiki.bjn': ('wiki.bjn.npz', '998a551283222931d3a26922308449950bfa3ec7'), + 'wiki.bm': ('wiki.bm.npz', '454ff9fbd4790e4a076d9a2087a51da28aa1332f'), + 'wiki.bn': ('wiki.bn.npz', '1f36f6f39c9a9b33bb8035c9a4dc7e04933604fd'), + 'wiki.bo': ('wiki.bo.npz', 'b9fe87318428de0a7790de175b5fec80c5af482d'), + 'wiki.bpy': ('wiki.bpy.npz', '5c7853173d27e2c018c24eca69de8d5f34511b0d'), + 'wiki.br': ('wiki.br.npz', '7aa66a2034fbfaa1d39e637385d48610238797c9'), + 'wiki.bs': ('wiki.bs.npz', 'a019a4677677c2e9e4d899326b2b6c15ad6c011a'), + 'wiki.bug': ('wiki.bug.npz', '09ae3477941d7a99d1df494368d7efb0b2c18913'), + 'wiki.bxr': ('wiki.bxr.npz', 'b832c691b8ddd95896c052d3d15e1f98d72068d5'), + 'wiki.ca': ('wiki.ca.npz', '391e0d4daad08649251274fa1cc2a5f49c7728b1'), + 'wiki.cbk_zam': ('wiki.cbk_zam.npz', '02e57a763bc9f9eadaba57953383dd12a0a78a37'), + 'wiki.cdo': ('wiki.cdo.npz', 'd6e8f422327e8b2273f1f2662d793707ece6695d'), + 'wiki.ceb': ('wiki.ceb.npz', '23bc0bb9aeaa57dff35092766941a866de142aae'), + 'wiki.ce': ('wiki.ce.npz', '182b2a889256119a6d379d501c55c7621e5855db'), + 'wiki.ch': ('wiki.ch.npz', '82dd77512fcb463481f43c9cef3507e2baa90d7b'), + 'wiki.cho': ('wiki.cho.npz', 'b0b620fc2442d1a6e2440e71a424861c80175f0c'), + 'wiki.chr': ('wiki.chr.npz', '3d62c6b95c5af46abd6234426ae760cca65d5bd0'), + 'wiki.chy': ('wiki.chy.npz', '34a28a22da79aebc100e3714b825c95c8d5f54a3'), + 'wiki.ckb': ('wiki.ckb.npz', 'ad19461e4be583d08b7693ff5b1e9d590ed41add'), + 'wiki.co': ('wiki.co.npz', 'fa60d9f0e79f1c7e15f381aef983a0f4f31c05a8'), + 'wiki.crh': ('wiki.crh.npz', '540270ba6edd9d7b2f7efca52b3b407524ac67d1'), + 'wiki.cr': ('wiki.cr.npz', 'f06b77465a38ec960d7d5a7554b848c37e945c76'), + 'wiki.csb': ('wiki.csb.npz', 'b8b28559cf2541341af98e2aa755856765bdeabf'), + 'wiki.cs': ('wiki.cs.npz', '19881e931fe06abf341450f00c342d364313e232'), + 'wiki.cu': ('wiki.cu.npz', '731e0d00abd53bc2a8eb6cf37f6ab883cff34e15'), + 'wiki.cv': ('wiki.cv.npz', 'e60034fcffb7dfef7b236ddba1194c3aa20b7967'), + 'wiki.cy': ('wiki.cy.npz', '5a0fb967b5556f007c0d5065f951a3d3b1c1005a'), + 'wiki.da': ('wiki.da.npz', 'd06258014ba2c7450bc2d55edfdf1731433e42e5'), + 'wiki.de': ('wiki.de.npz', 'a21694dfd2af63bd7bb00f0b60b28e88bd1153f1'), + 'wiki.diq': ('wiki.diq.npz', '4f6c77a86b39834a7130419967759afd8cc26b84'), + 'wiki.dsb': ('wiki.dsb.npz', 
'e74f1d346a8db96987bff0c33ee5f886907c380a'), + 'wiki.dv': ('wiki.dv.npz', '5d6fe6f0eec2e7704121d5aba03b4edbb28af873'), + 'wiki.dz': ('wiki.dz.npz', '77c639d36d0355b2de5adead7996eae342b852a6'), + 'wiki.ee': ('wiki.ee.npz', '4b5a76127d57515d3e8a76787cdefde5856b754a'), + 'wiki.el': ('wiki.el.npz', 'a00bcb97e7898931196a1c69f7a492e5b6202661'), + 'wiki.eml': ('wiki.eml.npz', 'b475d626b3d97e7a68c02827fdc7900599e838c6'), + 'wiki.en': ('wiki.en.npz', 'ad5ec6d49db6c6fe76b8e85ff05d34e5d0e1eb6a'), + 'wiki.eo': ('wiki.eo.npz', '18049b0010520d13e676f5a82e8bb90153d99003'), + 'wiki.es': ('wiki.es.npz', 'a6d192ba7d82d762f8367e75ca951aad4d11e410'), + 'wiki.et': ('wiki.et.npz', '4beb7025cf88f1aa62d025b187f0cb09aee61858'), + 'wiki.eu': ('wiki.eu.npz', '5e1a8197e35f20a2476798bbb935b4c131289c4f'), + 'wiki.ext': ('wiki.ext.npz', '049b2d1b0a8b102b45907cf487cac30aa294e0a0'), + 'wiki.fa': ('wiki.fa.npz', '81ed274997c87ef87d73d25e166ca06272ce426f'), + 'wiki.ff': ('wiki.ff.npz', '4867dc74cd53ca0b0f769af4fa1ea420406b59bf'), + 'wiki.fi': ('wiki.fi.npz', '6d1291b854045179f8171ac7d62ede7d8ac159a2'), + 'wiki.fiu_vro': ('wiki.fiu_vro.npz', 'dd87806d9dc8833fa0e21e35a50815ebdbaa6c8b'), + 'wiki.fj': ('wiki.fj.npz', 'cf5c31b0a69276f5dd18ab738ed92444abaeb755'), + 'wiki.fo': ('wiki.fo.npz', 'ffc19807d528af000861a94cfb8097bd686e14fc'), + 'wiki.fr': ('wiki.fr.npz', '8f06d5dbe3cf7214354fe9b2f6eca0ef7419f063'), + 'wiki.frp': ('wiki.frp.npz', 'c8b200ae592478d3cd0bfaafcd7aa19de8a3bfe5'), + 'wiki.frr': ('wiki.frr.npz', 'fa5e5c39ea2a45793c679eacea290a35e37405ea'), + 'wiki.fur': ('wiki.fur.npz', 'a61a8940d059f25000e3fe23933e5ed0d37e65d3'), + 'wiki.fy': ('wiki.fy.npz', '46f9f41bdf6f4fb8e27a753290413d745465963b'), + 'wiki.gag': ('wiki.gag.npz', '49fb01230e6803544122d47ab7d3fe694d1444f2'), + 'wiki.gan': ('wiki.gan.npz', '716b7b26acc15975f30caf3c6effa111516fcca5'), + 'wiki.ga': ('wiki.ga.npz', 'ea934bc1fdc1acf6caf9ac746c6c499251f1fdee'), + 'wiki.gd': ('wiki.gd.npz', '597017b5a32d933f194595d3656f858e37e70a62'), + 'wiki.glk': ('wiki.glk.npz', '91a5834658bc2d48714e8807ef24efb79567b4b5'), + 'wiki.gl': ('wiki.gl.npz', '2fa8e48d6ae1e9c9d542eb3f2156cf9e359e66c2'), + 'wiki.gn': ('wiki.gn.npz', 'e359eef3928e1f1b5d8fcf0ea532e8794c66289a'), + 'wiki.gom': ('wiki.gom.npz', '8cd361481c23f7545cc2bd8f1bf22aa7400edd4d'), + 'wiki.got': ('wiki.got.npz', 'd05daf105611150695e61775fdff2c500b36be3f'), + 'wiki.gu': ('wiki.gu.npz', '0ce175c5fc39bab4032892f70c9d2bb850af0f4a'), + 'wiki.gv': ('wiki.gv.npz', '2c573f873d607831ff01b64603c17b8db79bd7e1'), + 'wiki.hak': ('wiki.hak.npz', 'e6048727799cdf149f5c50037e0fc59300d33a94'), + 'wiki.ha': ('wiki.ha.npz', 'f18ea7286bbd390c5470896b2c99cb1adc740064'), + 'wiki.haw': ('wiki.haw.npz', '18bcd85d2e06b1b889f0835fc5b62697fdf32d72'), + 'wiki.he': ('wiki.he.npz', '76915ff167b6ecb7b7e22ff0ca46914a55d344af'), + 'wiki.hif': ('wiki.hif.npz', '12153aaf98d76d5502ab77a27cd0b9a539f61513'), + 'wiki.hi': ('wiki.hi.npz', '249666a598991f6ec147954c6af9e531fd1cd94e'), + 'wiki.ho': ('wiki.ho.npz', '3f804fd69780c0789708b56ea9d48715f8e38f26'), + 'wiki.hr': ('wiki.hr.npz', '9a3de28e69f97048bfb480b4f83eaab6149f66ad'), + 'wiki.hsb': ('wiki.hsb.npz', '7070bf64e13299dd66ac0e9f8e24011a56b6bfe8'), + 'wiki.ht': ('wiki.ht.npz', 'a607093d511afeb584d02dc676bc5a27eff66287'), + 'wiki.hu': ('wiki.hu.npz', '9b2c4750daf1bcf39768572e874b5afda0e2f0bc'), + 'wiki.hy': ('wiki.hy.npz', 'ec0461a102a6fb00bd324f66cefd3c8d55a7093a'), + 'wiki.hz': ('wiki.hz.npz', '5dfb8afbdae6b4148c3e55ab459c56a74b46b463'), + 'wiki.ia': ('wiki.ia.npz', 
'4cfaaf053b9513bbf5b2423258c0f01d20256de6'), + 'wiki.id': ('wiki.id.npz', 'bace396bb9941cc9e5b2e5f5a19be6db833c5fd4'), + 'wiki.ie': ('wiki.ie.npz', '1bae7256c2e763ce6d692d1c0a603d99a8b22826'), + 'wiki.ig': ('wiki.ig.npz', '23128e54a5e143891d392d621723bad9cfc8cf7b'), + 'wiki.ii': ('wiki.ii.npz', '54bc16d05da512481865a89ecf30260b0acc04dc'), + 'wiki.ik': ('wiki.ik.npz', 'f8015227e893d2375699b7d132b306ba381f02ac'), + 'wiki.ilo': ('wiki.ilo.npz', '185a11f81bd5d24a34558dda81ee4735f5ba150b'), + 'wiki.io': ('wiki.io.npz', 'ddf8180a90aa6ee5be93a2582cc99c535f21363e'), + 'wiki.is': ('wiki.is.npz', '968f8dd2a093b279a6f7aaa734008454bf51d724'), + 'wiki.it': ('wiki.it.npz', 'fdfb857a309b2c3d29482bb5cc55f21b858d2e6f'), + 'wiki.iu': ('wiki.iu.npz', 'fa8896730bd6c24c3473daa22116d1016294e7f7'), + 'wiki.jam': ('wiki.jam.npz', 'a8f0d0b99c89ace0a6401b8fcda261d06065faaf'), + 'wiki.ja': ('wiki.ja.npz', '8d42e5a40e4d1d8645b2d80b873a65cadcf68b5c'), + 'wiki.jbo': ('wiki.jbo.npz', '145fc999ab004b348cf9bf445f0a93a7a145308b'), + 'wiki.jv': ('wiki.jv.npz', '66978770bf06e42414395cf5fd8c596044d72bec'), + 'wiki.kaa': ('wiki.kaa.npz', '624a640ecb9901b2aba2e9f44ab615146ecb2862'), + 'wiki.kab': ('wiki.kab.npz', 'e97f93b6ba65e95c85b7541932cf53c5ad9eb896'), + 'wiki.ka': ('wiki.ka.npz', '1ca8376e1e0cbd58001c1b51a2d488a2874a6743'), + 'wiki.kbd': ('wiki.kbd.npz', 'f2d2a05b06723ac549784ad5470d84f5742a1352'), + 'wiki.kg': ('wiki.kg.npz', 'fa7f6d5f660a173a3e75342d449980eedcdc789e'), + 'wiki.ki': ('wiki.ki.npz', '21a8c7c616c0050c51c288861f3423f313e4f634'), + 'wiki.kj': ('wiki.kj.npz', 'f3c347509a0d81f4f7fdbb8b22889b8d76e5014e'), + 'wiki.kk': ('wiki.kk.npz', 'bc24a3289e1c1e18e16b6789c2f9f92af1e73071'), + 'wiki.kl': ('wiki.kl.npz', 'b8b7e7359f067836e2be2ecfe9f35a820b00fe1d'), + 'wiki.km': ('wiki.km.npz', 'e053799fd01463808432dc035bef3e36620e2f36'), + 'wiki.kn': ('wiki.kn.npz', '2849a0a8b3453e9bf6af05d4c7bd3db881dd1068'), + 'wiki.koi': ('wiki.koi.npz', 'a9b02e9bd41833bcd54769f94626019c03f29997'), + 'wiki.ko': ('wiki.ko.npz', '764d9896e74b5a26c6884d48bce3bed8ed3a7822'), + 'wiki.krc': ('wiki.krc.npz', 'bfe39598c718f1cc95909db7544b3214b308a97c'), + 'wiki.kr': ('wiki.kr.npz', '1e6af853d4a8ea7830e116eb9b61ac5d7d9a315c'), + 'wiki.ksh': ('wiki.ksh.npz', '66cd0e3e0a0b0282a13960571ebe7cddd7706bf2'), + 'wiki.ks': ('wiki.ks.npz', '85f1adaa05b854df4dede745a1aaab3836e60770'), + 'wiki.ku': ('wiki.ku.npz', 'faf90584e5a45e6d0f9eeb88399b82abe037d584'), + 'wiki.kv': ('wiki.kv.npz', '9f2b41822013a412da9c99fac06eed8be03ca192'), + 'wiki.kw': ('wiki.kw.npz', '3eed8a8fc97a2fc79241b8474a458c98d00fc897'), + 'wiki.ky': ('wiki.ky.npz', '0116ff90f10a6c0728e1ea86d8a44896ea83270a'), + 'wiki.lad': ('wiki.lad.npz', '5af2015b3d1c5e8563f0e92721580988ebe2ce50'), + 'wiki.la': ('wiki.la.npz', '7143303a3ea13c7668eb90ea6e3d2ca69857a3be'), + 'wiki.lbe': ('wiki.lbe.npz', 'f206a3c35a184ba5d2b32ee68640eadf66c847da'), + 'wiki.lb': ('wiki.lb.npz', '143dc6337f3690379282034c460c613d7f144923'), + 'wiki.lez': ('wiki.lez.npz', 'b29a680decc6b29f24e8eb9e4f8e11e3419d45f1'), + 'wiki.lg': ('wiki.lg.npz', '866640ce62cedbc1d453b7ea3c289c291ad76e13'), + 'wiki.lij': ('wiki.lij.npz', '0dcd3d7009ae89b1016ca6cdb99a9f0d70bc4baf'), + 'wiki.li': ('wiki.li.npz', '4666b3c238256d7b7623a136db19b8b9f4754734'), + 'wiki.lmo': ('wiki.lmo.npz', 'ac89fa7cfe0675950bcb31c66bf3f88a3cfc98f0'), + 'wiki.ln': ('wiki.ln.npz', 'fba158719944aabe58e0002a90be0ed77e11702d'), + 'wiki.lo': ('wiki.lo.npz', '1e113e340a8a93d385e14502c9c4e3bcdf6c3101'), + 'wiki.lrc': ('wiki.lrc.npz', '42cb755f398fba6f0da7949c91e92b55654bd482'), + 
'wiki.ltg': ('wiki.ltg.npz', '182f75859e228d1162215f28fe7f2dca127624a4'), + 'wiki.lt': ('wiki.lt.npz', '66aa944bd2e777cb82d6d59b1f2f837b6c48cb37'), + 'wiki.lv': ('wiki.lv.npz', '2be8f926da85694fa998bf79d80b61ebb8d67576'), + 'wiki.mai': ('wiki.mai.npz', 'b8a9c36e2a0f1bb84a44dc762250d2a9007ef637'), + 'wiki.map_bms': ('wiki.map_bms.npz', '6f0394d6b3d08a946e3df4b9355efe94148f018a'), + 'wiki.mdf': ('wiki.mdf.npz', '774ee35334641db57f9ac9069961c5372a5d92e8'), + 'wiki.mg': ('wiki.mg.npz', '496c48ef668f08ce95ebb11ce1ce5026b52d935c'), + 'wiki.mh': ('wiki.mh.npz', '352edd84f99c5aa277a7306f6cacea1fab065ed3'), + 'wiki.mhr': ('wiki.mhr.npz', 'dd78b27a674ac10411cdf74ac32f9391506b17e0'), + 'wiki.min': ('wiki.min.npz', '628b406441ab03bc8aa68195ada50bfdc8226f34'), + 'wiki.mi': ('wiki.mi.npz', '754127b473861cd4f9ae034c9f527a34827b1f00'), + 'wiki.mk': ('wiki.mk.npz', 'b09fed4f56c296f13c4020ef1fec498382a38b73'), + 'wiki.ml': ('wiki.ml.npz', '02fb55d97ca2f0408f0e7e8dd6a661bbc3319a2a'), + 'wiki.mn': ('wiki.mn.npz', '08b2c45689aa5d9ec49df96dc7c777ce9b9a0b4b'), + 'wiki.mo': ('wiki.mo.npz', '638c2e8bd2352fd52921b9ae62f578b8357bab49'), + 'wiki.mrj': ('wiki.mrj.npz', 'ec5cf1f4fb8dfdca64d8172974e620eb8fa41626'), + 'wiki.mr': ('wiki.mr.npz', '074dd68c947c2f137a3e84b55012925f00213139'), + 'wiki.ms': ('wiki.ms.npz', '3dbe9e9d70251de8a374776ff1250a9c3103ee59'), + 'wiki.mt': ('wiki.mt.npz', 'f5103998a68d1b178387417436a83123d44aba01'), + 'wiki.multi.ar': ('wiki.multi.ar.npz', 'a010d1d81a465c56ebaf596b3e8e8795e7f0f8e3'), + 'wiki.multi.bg': ('wiki.multi.bg.npz', 'c04018f3a600cee170f12a36cdd35b4727a2aade'), + 'wiki.multi.ca': ('wiki.multi.ca.npz', 'eef52a0cf20c133ca9065de25f0702861a8cfa29'), + 'wiki.multi.cs': ('wiki.multi.cs.npz', 'c5f547aa78c0e3d7dae67a0334d500bf2a86aa30'), + 'wiki.multi.da': ('wiki.multi.da.npz', '24374f2ee169b33327feeee46da31b0de1622fe4'), + 'wiki.multi.de': ('wiki.multi.de.npz', '2e6c119b345bebd34b56eaaf855d6703889b11f7'), + 'wiki.multi.el': ('wiki.multi.el.npz', '9d122beedb80a2e5334946641e5bafd32c01e76b'), + 'wiki.multi.en': ('wiki.multi.en.npz', '8c3c480b4cb2690304173713a646280613b244a8'), + 'wiki.multi.es': ('wiki.multi.es.npz', '483a22656e4fb2a01e9f4ef8156b261e780850ab'), + 'wiki.multi.et': ('wiki.multi.et.npz', '22498c7b91645a3874fa738b5cfb16bf98b6f97c'), + 'wiki.multi.fi': ('wiki.multi.fi.npz', '765a6f0b63777bff4ae6ca2b461c5889c03d6a70'), + 'wiki.multi.fr': ('wiki.multi.fr.npz', 'decd9aacf600114b8a36072535c0309874a37c83'), + 'wiki.multi.he': ('wiki.multi.he.npz', '7eee940c1b85936f59122f4b1a166223dd946674'), + 'wiki.multi.hr': ('wiki.multi.hr.npz', '1673963416af088f8bf15576afb33d58115db35c'), + 'wiki.multi.hu': ('wiki.multi.hu.npz', 'a1fbe6ededf3cbaa3eaa22dd8b20cce4b36cfc6d'), + 'wiki.multi.id': ('wiki.multi.id.npz', '6c3e721febb511ede7db7bf978d65769e4270f5c'), + 'wiki.multi.it': ('wiki.multi.it.npz', 'fc5bfc11e0165e8d95c1708573dad5e456826c73'), + 'wiki.multi.mk': ('wiki.multi.mk.npz', '6cd50198355674f156fc863108d9bebf11cfabd9'), + 'wiki.multi.nl': ('wiki.multi.nl.npz', '4fa06b9230c95dfa5a9e9a5d80f1f5ba614d3cbf'), + 'wiki.multi.no': ('wiki.multi.no.npz', '63756168c1101e73fba8d1a5015f32b8892819e6'), + 'wiki.multi.pl': ('wiki.multi.pl.npz', '958b8e8bead965ba1bb1433e1c960fc3e12a10fb'), + 'wiki.multi.pt': ('wiki.multi.pt.npz', '22f07df1609d79b95344ee575ea43141424a1528'), + 'wiki.multi.ro': ('wiki.multi.ro.npz', '73180b3e382519004bf38ea7b86237aacbbe813a'), + 'wiki.multi.ru': ('wiki.multi.ru.npz', '3b2eb9163f35e90bf2ce1cd3c997b354d0c34f59'), + 'wiki.multi.sk': ('wiki.multi.sk.npz', 
'606a0c3ba9849070c6b6b8c22d920fdeed9a1385'), + 'wiki.multi.sl': ('wiki.multi.sl.npz', '3cfdab5043b8cfe1535cb6dbd4c9e68847ad5904'), + 'wiki.multi.sv': ('wiki.multi.sv.npz', '4f1494885b9a831e87cfa3c15f2204c4a73c0779'), + 'wiki.multi.tr': ('wiki.multi.tr.npz', '54f90d5ddb9a65538a41e37c5a67ed933a5e4885'), + 'wiki.multi.uk': ('wiki.multi.uk.npz', '500fd26b1d7a25b42458012e99f9f76642e0c787'), + 'wiki.multi.vi': ('wiki.multi.vi.npz', '3955809cceb300965c15f9372221417719bb0db8'), + 'wiki.mus': ('wiki.mus.npz', 'a5f48934a3fa6eaf4929098046c93fc94dd6bcb6'), + 'wiki.mwl': ('wiki.mwl.npz', '8a5e2c272166f8a72c5694ca6c3104d5f49179ec'), + 'wiki.my': ('wiki.my.npz', '5e035aca16700d7d6695af8a6d3a88ac847aaeb7'), + 'wiki.myv': ('wiki.myv.npz', 'd4cfaab70c640033e02c0fc0c5a3615ae836c569'), + 'wiki.mzn': ('wiki.mzn.npz', 'ad09ac584ae455b5862b95125ef409360ae18445'), + 'wiki.nah': ('wiki.nah.npz', '2dc454ef37d059f2053af46cfa1f4f0ca939cba0'), + 'wiki.na': ('wiki.na.npz', '401f0f880eb7aa78d21348bc1e0a3953b3e81bf0'), + 'wiki.nap': ('wiki.nap.npz', '996da46aeeab5644ba766d00c5e343b1553361d7'), + 'wiki.nds_nl': ('wiki.nds_nl.npz', '5a9307e16b13a5a82ec19a52b33254537e7198e7'), + 'wiki.nds': ('wiki.nds.npz', 'b249a87c78c52becf51e7b50aaf9f9b6a36585f1'), + 'wiki.ne': ('wiki.ne.npz', 'a601db2647a74ffd2b4b43dcb8584735f555459c'), + 'wiki.new': ('wiki.new.npz', 'c398a3775aba9c68ce765cfdfb6b188f7c47e4c6'), + 'wiki-news-300d-1M': ('wiki-news-300d-1M.npz', '0a03bbd508e5381e140476140fb121afeb0050ed'), + 'wiki-news-300d-1M-subword': ('wiki-news-300d-1M-subword.npz', + '69edae21375407781c727dcb9e534e79d712d137'), + 'wiki.ng': ('wiki.ng.npz', 'befd774d15f69d43547e13e5ea3a97c4cb1ab405'), + 'wiki.nl': ('wiki.nl.npz', '5a7cb6f1dd0a7621202abba9461ac2c5bf905219'), + 'wiki.nn': ('wiki.nn.npz', '8e5059ddeb24050fadaa5cc4622b13feb3e4a226'), + 'wiki.no': ('wiki.no.npz', '5ce6e0f793e66f081652f64013968099de03d9f9'), + 'wiki.nov': ('wiki.nov.npz', '95ed23b4cfd7a65afa1c12c7dbdce6af53923d77'), + 'wiki.vec': ('wiki.npz.npz', '08ebb912efeb9df1c7d05e1af90484d210dff47e'), + 'wiki.nrm': ('wiki.nrm.npz', 'e58614b4508ff9810f0b58fd818f973775bc918d'), + 'wiki.nso': ('wiki.nso.npz', '56a2ebe260241402d117cd89c5c872b9c96ff05b'), + 'wiki.nv': ('wiki.nv.npz', 'c713051fe03ec1f60314bb42161b2a47fb5e169a'), + 'wiki.ny': ('wiki.ny.npz', 'ba5a1725955cbc13e7fd93ab499f8085840c992c'), + 'wiki.oc': ('wiki.oc.npz', '259e7d994c38a4cfc140fb07016b82d6781e5027'), + 'wiki.olo': ('wiki.olo.npz', '0fea70f887def4779ee70a79366b88f1ada65004'), + 'wiki.om': ('wiki.om.npz', '47e2d756b5f8913085d901375c1b4e0b118a4221'), + 'wiki.or': ('wiki.or.npz', '7e274ab060219b019aa02bb97941cc6e162fd01f'), + 'wiki.os': ('wiki.os.npz', '19e8199cc2aaffdb07b6c558dbc5465ac6e03155'), + 'wiki.pag': ('wiki.pag.npz', 'eddf4931547649026c02f893297ef673ec6158bb'), + 'wiki.pam': ('wiki.pam.npz', '40109aa174bd9f0fa657839bb548e2b0646c58d3'), + 'wiki.pa': ('wiki.pa.npz', '8a5870717e9e641b1f757f13259171698118de2e'), + 'wiki.pap': ('wiki.pap.npz', '999c8e5b005ca20d9998fbbe4fa79177f69e24c0'), + 'wiki.pcd': ('wiki.pcd.npz', 'e975066b323a65cdc5e4c27138ef674d2cf7250b'), + 'wiki.pdc': ('wiki.pdc.npz', '5c770b9d56f276b0aa535845f175c05ee1cea615'), + 'wiki.pfl': ('wiki.pfl.npz', '0063d0b633ee529a75482b36ed4f4da7d64994ec'), + 'wiki.pih': ('wiki.pih.npz', 'ce1d76c94d248545eea0d7436c54849dbb380bfc'), + 'wiki.pi': ('wiki.pi.npz', 'c7d56c334bf529f8b3655693d207a80feaec4aed'), + 'wiki.pl': ('wiki.pl.npz', '0d612fdf871a1a4084c867f394940475be899443'), + 'wiki.pms': ('wiki.pms.npz', 'ca149a2fb138011315bb6d5d61c7a5647e515e51'), + 
'wiki.pnb': ('wiki.pnb.npz', '9ec82d02ad8894056c67991cf8ce927bcca74ee2'), + 'wiki.pnt': ('wiki.pnt.npz', '3f90123407bb8fc838a0a0d3700a14e15f5b26aa'), + 'wiki.ps': ('wiki.ps.npz', '7edebc02ac16f5fab83eb10b7d0fab821a9a4d43'), + 'wiki.pt': ('wiki.pt.npz', 'f172fd801edd1ad9d319ba44146d40b5d682a473'), + 'wiki.qu': ('wiki.qu.npz', '68bec60ccfe1826c3b3a8968574488dbc74cdf7b'), + 'wiki.rm': ('wiki.rm.npz', '00fb191fc736ba60cb23e76169dfccde9a9daad0'), + 'wiki.rmy': ('wiki.rmy.npz', 'c5e93cc37ff7293b9a1d9fe55c42d6fbde372b97'), + 'wiki.rn': ('wiki.rn.npz', '57b8e0d6999269be227af6ef2797a9cf8386ff1b'), + 'wiki.roa_rup': ('wiki.roa_rup.npz', 'e06d6b5672a59bb9e83143bc8b28300d23c09546'), + 'wiki.roa_tara': ('wiki.roa_tara.npz', 'c083105f40236dc3711f06c1b40e8ee7a714b99d'), + 'wiki.ro': ('wiki.ro.npz', '766bc0cb58a65b0b1763b9a0d90e91ab982eb20d'), + 'wiki.rue': ('wiki.rue.npz', '9a91fa093cd48d7d658d526b0ccda48dc59cd7f4'), + 'wiki.ru': ('wiki.ru.npz', 'd59d099481c22d5592ab9635c9ee48060aa0bf45'), + 'wiki.rw': ('wiki.rw.npz', 'e99ee87d249f6c157c5c97397d1025d798b85c69'), + 'wiki.sah': ('wiki.sah.npz', '85dae39097b29bc8e2b64f343a77794e4a62f91a'), + 'wiki.sa': ('wiki.sa.npz', '7d1928d7c67400045ac1b35a37a0e3089690d875'), + 'wiki.scn': ('wiki.scn.npz', '27d7b8050bbeed8ce196061c610216760b053c39'), + 'wiki.sc': ('wiki.sc.npz', '69c7b8be0f03a1bbd615695f93bdd78f96a58e16'), + 'wiki.sco': ('wiki.sco.npz', '4880282f59d3338b67fbff75359e2d24896e95bb'), + 'wiki.sd': ('wiki.sd.npz', '0ed8da4d27223db717a612cf0c88582351db6e19'), + 'wiki.se': ('wiki.se.npz', '0f4b2e060d5e29f96ca73aab29c967e79db69c17'), + 'wiki.sg': ('wiki.sg.npz', 'a5e4edf34fe1a88b322da4c3922ec5a470e200c6'), + 'wiki.sh': ('wiki.sh.npz', 'c13f1e94676bc939560193f7aa7ffd7d604707b3'), + 'wiki.simple': ('wiki.simple.npz', '352d0575e7d60b08e1dfce2c5de713906f0ed78f'), + 'wiki.si': ('wiki.si.npz', '204f9ffbe7770a9f56d3b2fb26999165015f5c33'), + 'wiki.sk': ('wiki.sk.npz', '7a9820b5a343b242660bf2595d1ecbf6e00a76d6'), + 'wiki.sl': ('wiki.sl.npz', '85f3186f26d6725317a64e290363a7251b928b81'), + 'wiki.sm': ('wiki.sm.npz', '9e13452cc4bff677f4f15db04f9d2f95f6ec054c'), + 'wiki.sn': ('wiki.sn.npz', 'e8d5f7dcf51280c5f99bc3df849b4889a61e9fcd'), + 'wiki.so': ('wiki.so.npz', '0f5d71b95768b33fd939a870c15344c4478364a9'), + 'wiki.sq': ('wiki.sq.npz', '8b05826df8575e65c87a2fc0b7630cf644d4216d'), + 'wiki.srn': ('wiki.srn.npz', '2711396ef297ac5dde8904508bc002bdecbcc6f4'), + 'wiki.sr': ('wiki.sr.npz', '546edc8e29a5d2e99ed10eb4a552cbef2bb8f417'), + 'wiki.ss': ('wiki.ss.npz', '2e5911bad79bb5270a64f587e326d31c95ec58f3'), + 'wiki.st': ('wiki.st.npz', '23bc954719a2962e891f02efaea754c9ea025894'), + 'wiki.stq': ('wiki.stq.npz', 'dd3ece0c0aa30e53ae0f4b558309bb60ab628652'), + 'wiki.su': ('wiki.su.npz', '7e48732e8a1fcf212e692924a4416a6ac3b3b055'), + 'wiki.sv': ('wiki.sv.npz', 'b9ec52e9423688f195f3145c243226c0e0b51e83'), + 'wiki.sw': ('wiki.sw.npz', '5262f0c645322b10eca73f792a970f10b2719e55'), + 'wiki.szl': ('wiki.szl.npz', 'fdd6d6b291cdbbcec5ff93451a588fdd103bb2d0'), + 'wiki.ta': ('wiki.ta.npz', 'da7c5bc6e1142306ff2669bf1739832beb6c1763'), + 'wiki.tcy': ('wiki.tcy.npz', 'baa49e1afa2bb0dcaaef0fac1ee75bbe711d1134'), + 'wiki.te': ('wiki.te.npz', 'baf48767ce85e4d41d65d25f2bbf1c5f559ec18f'), + 'wiki.tet': ('wiki.tet.npz', '11e46a893af55344dbe102d530fdfea5d949d3bc'), + 'wiki.tg': ('wiki.tg.npz', 'da66abb72ec9ccc602713161e544963d59cc51d7'), + 'wiki.th': ('wiki.th.npz', '25e54bf2d305779ec9baa5f344410bd75c7702fc'), + 'wiki.ti': ('wiki.ti.npz', '1faf98f3a0eafa7559a4b2a111f43dd1f7b9a05b'), + 'wiki.tk': 
('wiki.tk.npz', '34c714fa8275fd6abfe86b2d144a043774552a6c'), + 'wiki.tl': ('wiki.tl.npz', '7d7f8a0485155bce7a74a1d778824375b0029f53'), + 'wiki.tn': ('wiki.tn.npz', 'd0bc3a9b948753ac2283e5e10480c9fa0f6acb53'), + 'wiki.to': ('wiki.to.npz', 'e982fc31bcfcf7339988d7aad21ce29ac9e84b0b'), + 'wiki.tpi': ('wiki.tpi.npz', '448cef043fa4b7f97825dbf8ee205ef05543bcac'), + 'wiki.tr': ('wiki.tr.npz', 'c9830607a4c5134c6191006f1d80bae0ec798fe6'), + 'wiki.ts': ('wiki.ts.npz', '84a0598803712c8a713943447ddb73fc0f39af43'), + 'wiki.tt': ('wiki.tt.npz', '82c29df18f33e6284af3e977a6dda7e132a7a225'), + 'wiki.tum': ('wiki.tum.npz', '358990b894a3fb09d70674465952d828c9b0eda7'), + 'wiki.tw': ('wiki.tw.npz', '1e6d2838a4f271c1808795fb929cfcbf95094d93'), + 'wiki.ty': ('wiki.ty.npz', 'e41ca5192d8cb515b3561c8d6935b150deb027b7'), + 'wiki.tyv': ('wiki.tyv.npz', 'ce062ed32e854604714b65698ae290c99ba28060'), + 'wiki.udm': ('wiki.udm.npz', '9e1c5891ee0c5ac8f65fc457e1b42c7b2bfc8d37'), + 'wiki.ug': ('wiki.ug.npz', '656503e54063e200980e39f00fc011395bcd8551'), + 'wiki.uk': ('wiki.uk.npz', '352b7ee24d9fc6513fff4fe13bc04086c680834a'), + 'wiki.ur': ('wiki.ur.npz', 'a81e55c7adfc2cef779ce9a01fe21319a7e4943b'), + 'wiki.uz': ('wiki.uz.npz', 'd60d1e67bb8574dd71c18c88114aba674fc1eecb'), + 'wiki.ve': ('wiki.ve.npz', '5bfc3dbb3e47d23597df47ef12bd1c64ab8d3ea9'), + 'wiki.vep': ('wiki.vep.npz', '7a94355754fbe56802242c0bf9d7a27335095552'), + 'wiki.vi': ('wiki.vi.npz', 'f118039eb16a4ca3347b6b171eac41113350a041'), + 'wiki.vls': ('wiki.vls.npz', '9a46a2fdc6448aa54f212081643745499ea7d05c'), + 'wiki.vo': ('wiki.vo.npz', '8e2f93c85ac608bcc4ae14093b9ff016061378fb'), + 'wiki.wa': ('wiki.wa.npz', '907074f7743d30cdbb2c48d0c8b4040796ea4164'), + 'wiki.war': ('wiki.war.npz', '928fb410c394b9c18d875326b6a3e750e2611e1b'), + 'wiki.wo': ('wiki.wo.npz', '7bb352be44f7261aa926f49b13e77df30f29312f'), + 'wiki.wuu': ('wiki.wuu.npz', '0d1dc7b05867ff2156a1180ad3da3b4697924e59'), + 'wiki.xal': ('wiki.xal.npz', 'd87f4a131e086dc0bdc2a7e10406820c3c03b6a9'), + 'wiki.xh': ('wiki.xh.npz', 'c64e1d2e77d1c744a628e2bd7353284616e48bea'), + 'wiki.xmf': ('wiki.xmf.npz', '160b9ee9773b9099aaf37ae9bdbc8a4a93b7f6ea'), + 'wiki.yi': ('wiki.yi.npz', '0662542cee29f3392fc905004ac6443b32c1477c'), + 'wiki.yo': ('wiki.yo.npz', '5d12d3b902a1fa19d8548295c3802c0608afa5c8'), + 'wiki.za': ('wiki.za.npz', '536348ff89df62e968739b567a1245bfd4112fbe'), + 'wiki.zea': ('wiki.zea.npz', '61fa192289a7c0f73ffa8035632a38b91c31c224'), + 'wiki.zh_classical': ('wiki.zh_classical.npz', '9acc9eaf8ebe316b945fb1f56ac71a2b7e024854'), + 'wiki.zh_min_nan': ('wiki.zh_min_nan.npz', '5d38bc025c82af578299d60f7df7b399de6ed81a'), + 'wiki.zh': ('wiki.zh.npz', '94007fcf3b105bf2c21b84a3a22bdb7946e74804'), + 'wiki.zh_yue': ('wiki.zh_yue.npz', 'af6f0d94e6418d528d6cedd859e07e6e2fb416ab'), + 'wiki.zu': ('wiki.zu.npz', 'fc9ce07d5d0c49a3c86cf1b26056ada58f9404ca'), + 'cc.af.300': ('cc.af.300-6cf6fb1b.npz', '6cf6fb1b9f890787cbd3b510ef6201de9e02a297'), + 'cc.als.300': ('cc.als.300-479a6674.npz', '479a66746401f6119a7e4cba58ddfac5f9937ba6'), + 'cc.am.300': ('cc.am.300-0d9530cd.npz', '0d9530cd2b7e4bc9eac96048ed5cbf7d3cc9f799'), + 'cc.an.300': ('cc.an.300-ef9cb799.npz', 'ef9cb799a5a627a9d33b54604aff1593e0de3b40'), + 'cc.ar.300': ('cc.ar.300-e9c5e360.npz', 'e9c5e360d5cd050effd9ce42f831b9c94b7ffbd9'), + 'cc.arz.300': ('cc.arz.300-9e6a80e7.npz', '9e6a80e752f3830b5cc934884d0a65f8cf94eff4'), + 'cc.as.300': ('cc.as.300-8b00a681.npz', '8b00a681b079d56929bedf1ced16b3c9573c5b37'), + 'cc.ast.300': ('cc.ast.300-8681d5cb.npz', 
'8681d5cbbdfd89bed0a9335a8e6f28617185627a'), + 'cc.az.300': ('cc.az.300-06632ae7.npz', '06632ae75e85f75caa9b87f8327661333a504ccc'), + 'cc.azb.300': ('cc.azb.300-01c48025.npz', '01c480257a8343fd18d1d9011dd14ef5c3f124a2'), + 'cc.ba.300': ('cc.ba.300-afc6b4d1.npz', 'afc6b4d1e77964965fd8cc2ff31e077286af81e4'), + 'cc.bar.300': ('cc.bar.300-67450b87.npz', '67450b879ae9d6d5e1b7c7150baab9173563d38a'), + 'cc.bcl.300': ('cc.bcl.300-261d8d11.npz', '261d8d11ca9fe67a12de812976398722b30e1df2'), + 'cc.be.300': ('cc.be.300-cd32b101.npz', 'cd32b101a860ae629f05ee90beb2a8f137ed8bc2'), + 'cc.bg.300': ('cc.bg.300-088de1c6.npz', '088de1c633cbfe8e06405badd685bc04f2127cfd'), + 'cc.bh.300': ('cc.bh.300-35ced78e.npz', '35ced78e8dc524f17e7ba4a5326b6bbeb92af4e1'), + 'cc.bn.300': ('cc.bn.300-98293882.npz', '98293882fc548c2047f5482ea4dcd5eedef58f13'), + 'cc.bo.300': ('cc.bo.300-7653c3c7.npz', '7653c3c76bfb21a3f4e655b4a3d347d6b5e1ebc3'), + 'cc.bpy.300': ('cc.bpy.300-8225d2db.npz', '8225d2db3bbd34c1f27e491b41d35e1f4394f529'), + 'cc.br.300': ('cc.br.300-ac611b58.npz', 'ac611b58dd6006dc741170ea4066aeff8b8b7a0c'), + 'cc.bs.300': ('cc.bs.300-be17aeed.npz', 'be17aeedde87cdd173a3508edf2762311df5b369'), + 'cc.ca.300': ('cc.ca.300-8e7f57c1.npz', '8e7f57c190a9a4bd5513b9d268b5a26f55202fe0'), + 'cc.ce.300': ('cc.ce.300-7ef28422.npz', '7ef28422df058d6be07c819c195b9ce50be5a985'), + 'cc.ceb.300': ('cc.ceb.300-25801d42.npz', '25801d429ed046998749cb07526533b9c63bded2'), + 'cc.ckb.300': ('cc.ckb.300-c56f75b4.npz', 'c56f75b46cc38357f38f7de51d46e97c66381ca0'), + 'cc.co.300': ('cc.co.300-f9394c12.npz', 'f9394c1285bf6d5d33b1c28387535e1010a1d4c6'), + 'cc.cs.300': ('cc.cs.300-82823bbf.npz', '82823bbfc29ec94051b72079939eb66d5db4d1bf'), + 'cc.cv.300': ('cc.cv.300-6862885d.npz', '6862885d97a2c84792bf7f291338008757eb5790'), + 'cc.cy.300': ('cc.cy.300-c649c74c.npz', 'c649c74c0ca139d80e19b9a98b5f5d15386393d7'), + 'cc.da.300': ('cc.da.300-ee9246dc.npz', 'ee9246dc34da5fa1e2d49d37395b169598488324'), + 'cc.de.300': ('cc.de.300-713dc52f.npz', '713dc52f1c24b5c31e4104aee9d92e7e26bd6db9'), + 'cc.diq.300': ('cc.diq.300-fdc37a8e.npz', 'fdc37a8e2357e922bc27f945f1e080d80fbcae5a'), + 'cc.dv.300': ('cc.dv.300-d37b74b2.npz', 'd37b74b2247761723126b92183857f5e32a4f17c'), + 'cc.el.300': ('cc.el.300-888b3ecf.npz', '888b3ecfc9f3e16349d15cc85b5453a90f524529'), + 'cc.eml.300': ('cc.eml.300-01926a33.npz', '01926a335cb7055270066d07a9d050a786398adf'), + 'cc.en.300': ('cc.en.300-79da8fea.npz', '79da8fea1408d642ce43a5fdf40c3c803a49db2c'), + 'cc.eo.300': ('cc.eo.300-02accc23.npz', '02accc23007b196a0bab9be70dcfe911fb8fa87c'), + 'cc.es.300': ('cc.es.300-a0063528.npz', 'a00635289e65081d50fc46bb39203e48115b5d20'), + 'cc.et.300': ('cc.et.300-2916e309.npz', '2916e309a61ba0b83761ed9b4f75d959ad59247f'), + 'cc.eu.300': ('cc.eu.300-0257399f.npz', '0257399f1433ca579aeaf625f897b01c2f041438'), + 'cc.fa.300': ('cc.fa.300-d5aca585.npz', 'd5aca58546a99513e1f96b6df7a52b95ace2247a'), + 'cc.fi.300': ('cc.fi.300-ed53841e.npz', 'ed53841e29ebf6d701ee4c96a10c25f2ecc8a904'), + 'cc.fr.300': ('cc.fr.300-c87b8969.npz', 'c87b89697779a76650b5a583a5d682809d73a794'), + 'cc.frr.300': ('cc.frr.300-dd4b3bdf.npz', 'dd4b3bdf9d6df61f7e9e94cdf754dee46ec15fa7'), + 'cc.fy.300': ('cc.fy.300-7eb20794.npz', '7eb20794c65568e6fabe5c9df974d1951035a819'), + 'cc.ga.300': ('cc.ga.300-8d09df0e.npz', '8d09df0e774f7ac3fe7cafb8ed67dd58388743a9'), + 'cc.gd.300': ('cc.gd.300-606435bb.npz', '606435bba611f1c77a59e684ac44c3e852d31beb'), + 'cc.gl.300': ('cc.gl.300-a58a25da.npz', 'a58a25da563958c3ae70fc023c4f25bd2fc8a75d'), + 
'cc.gom.300': ('cc.gom.300-ec42b285.npz', 'ec42b285a4cbb43ca76056ea0873f0a3b4c19a2e'), + 'cc.gu.300': ('cc.gu.300-ddfdc7d5.npz', 'ddfdc7d5351cbf95838050af53c76e874a013f3d'), + 'cc.gv.300': ('cc.gv.300-df66ebec.npz', 'df66ebec5580e2e8ce5ed42b0116768ac1e63d43'), + 'cc.he.300': ('cc.he.300-bd197a43.npz', 'bd197a43f2600b73f42480f901f8fafb9056e334'), + 'cc.hi.300': ('cc.hi.300-e8f1a8ee.npz', 'e8f1a8ee11d469ee007ac66f1a6ae2d9cf996fde'), + 'cc.hif.300': ('cc.hif.300-cd787567.npz', 'cd7875675c126764f76394435d114a0405a6341b'), + 'cc.hr.300': ('cc.hr.300-f33745d1.npz', 'f33745d1c8e966932d5034f248bcc22b7f8d2297'), + 'cc.hsb.300': ('cc.hsb.300-2c0e9847.npz', '2c0e9847177614324fd271e9c9fa5524969090e9'), + 'cc.ht.300': ('cc.ht.300-3192d8a6.npz', '3192d8a632af0d617d9a9c9e78f4015d0f594131'), + 'cc.hu.300': ('cc.hu.300-08a106da.npz', '08a106da56e64d2b8db4306cbacc99086b49659d'), + 'cc.hy.300': ('cc.hy.300-935747f2.npz', '935747f2a88dff9edf957723fe4736d6ebcf1d6a'), + 'cc.ia.300': ('cc.ia.300-737f4b78.npz', '737f4b78c4fa857a575ea01e6946217b69616ee9'), + 'cc.id.300': ('cc.id.300-b50ae07a.npz', 'b50ae07a0663023c3c117305c05b09143a167700'), + 'cc.ilo.300': ('cc.ilo.300-02b500a7.npz', '02b500a7e10d206239aa502590ff4768840e29c0'), + 'cc.io.300': ('cc.io.300-aaf228a7.npz', 'aaf228a78ce7dc1138181a32011c4d495e383e81'), + 'cc.is.300': ('cc.is.300-2f612f20.npz', '2f612f20cfdcd68b5eb46dacef4ba30f00069f55'), + 'cc.it.300': ('cc.it.300-5b21ee40.npz', '5b21ee408ab99c35a2bdd25e716975e0b73182ad'), + 'cc.ja.300': ('cc.ja.300-89cf6cb7.npz', '89cf6cb70985ca841246139719028103e7a932f8'), + 'cc.jv.300': ('cc.jv.300-85d4a52b.npz', '85d4a52b83150aa46ea60887d58e12f5b8fbc732'), + 'cc.ka.300': ('cc.ka.300-048778a9.npz', '048778a9ac39f9e4fc2b216f5fe752864c793295'), + 'cc.kk.300': ('cc.kk.300-f29ac700.npz', 'f29ac7000778e5adea0c9aa00f11fdcc47386adf'), + 'cc.km.300': ('cc.km.300-b9a2073f.npz', 'b9a2073f8d325e49934ac919cbe222ac84ef77af'), + 'cc.kn.300': ('cc.kn.300-034e4f17.npz', '034e4f17d08351a890896d9b7d2573a88d0fc230'), + 'cc.ko.300': ('cc.ko.300-28e7ae64.npz', '28e7ae64e994b9989f042323ec6f15f5ad7a53d2'), + 'cc.ku.300': ('cc.ku.300-82496bfd.npz', '82496bfd5e23f697b17ae4f351c1a13b12f482b5'), + 'cc.ky.300': ('cc.ky.300-4efb03dc.npz', '4efb03dc26319fd813edfe7cd1c1f373d134ac97'), + 'cc.la.300': ('cc.la.300-8adf2142.npz', '8adf2142b05825aa040d4b96afee9f08b00b94fe'), + 'cc.lb.300': ('cc.lb.300-8945d3df.npz', '8945d3dfb24fbe5ac9391aa10296f9a18992c380'), + 'cc.li.300': ('cc.li.300-0b6aee43.npz', '0b6aee432ca667db8b06dfb29fd249d00df7a275'), + 'cc.lmo.300': ('cc.lmo.300-a02cf032.npz', 'a02cf032a3af3a035bec24b2b225048bf0f86eb5'), + 'cc.lt.300': ('cc.lt.300-2b682e7d.npz', '2b682e7d30e739c4090fef65593d29f40f1323f9'), + 'cc.lv.300': ('cc.lv.300-c2453825.npz', 'c24538254050ce393b7ed8a018ed27a693c9dfa1'), + 'cc.mai.300': ('cc.mai.300-4bc11fe3.npz', '4bc11fe3470e23c79ad5bc2a16283ede92857f9b'), + 'cc.mg.300': ('cc.mg.300-2b644c0f.npz', '2b644c0fe3d32d89cf4a91a62b30a8cdf84eb69e'), + 'cc.mhr.300': ('cc.mhr.300-f9216c88.npz', 'f9216c883afb62c0fd890f1df1f076fda48d534d'), + 'cc.min.300': ('cc.min.300-0d8c3a77.npz', '0d8c3a7709acb9386c9bfb29294c56923e2fe160'), + 'cc.mk.300': ('cc.mk.300-bf1caa91.npz', 'bf1caa91a2376a6e4cae5576e84c00bd7b4d53ea'), + 'cc.ml.300': ('cc.ml.300-0c3baa74.npz', '0c3baa74f062367e00c2df09446480df4ef45b79'), + 'cc.mn.300': ('cc.mn.300-4ce878ea.npz', '4ce878ea1e15afffefa257374a8e265bc26b9d19'), + 'cc.mr.300': ('cc.mr.300-a9e08f9d.npz', 'a9e08f9d21627dc34318208aebb0b7a8f78aeb47'), + 'cc.mrj.300': ('cc.mrj.300-1ebb04f1.npz', 
'1ebb04f175c1e115c77167222cbfc434474dbf11'), + 'cc.ms.300': ('cc.ms.300-05216b2f.npz', '05216b2f3f76af940bd343135be243c8ed5f0de5'), + 'cc.mt.300': ('cc.mt.300-2a3ba408.npz', '2a3ba408205f3552f432d97fca3de797aa086c62'), + 'cc.mwl.300': ('cc.mwl.300-e758f2b4.npz', 'e758f2b42482ad9d1caf1d4f46f8873a61f6e0ce'), + 'cc.my.300': ('cc.my.300-443b1674.npz', '443b16746ef8f23d64a397b21b1d0ae5707d1e5c'), + 'cc.myv.300': ('cc.myv.300-67d19cef.npz', '67d19ceffa23cb376c309f9e985222579abb9ac1'), + 'cc.mzn.300': ('cc.mzn.300-ce441f50.npz', 'ce441f50a28f0b1f02b47ee1608297750ef0cabd'), + 'cc.nah.300': ('cc.nah.300-ba4c46c0.npz', 'ba4c46c0089109c65cfc25735c5000845345d56e'), + 'cc.nap.300': ('cc.nap.300-489727c7.npz', '489727c7e8241f98c0c7049387a12a101f625226'), + 'cc.nds.300': ('cc.nds.300-6265356d.npz', '6265356d5822372b15ca5103b5ef43f8b3ff4a1c'), + 'cc.ne.300': ('cc.ne.300-6b66b354.npz', '6b66b3542ec091054c2735cb3358b349c8ca3a87'), + 'cc.new.300': ('cc.new.300-f4747761.npz', 'f4747761d9827340fd8074a08e61769ab3a7cc11'), + 'cc.nl.300': ('cc.nl.300-1867fc6d.npz', '1867fc6d6f466fb0d2f46623530c21f4c149197b'), + 'cc.nn.300': ('cc.nn.300-3ce324ef.npz', '3ce324eff15cd595e81bb1882d23b7948772f400'), + 'cc.no.300': ('cc.no.300-6e39d0d5.npz', '6e39d0d5e205c7106be0a601dbd59455f17c97e1'), + 'cc.nso.300': ('cc.nso.300-90ba6fda.npz', '90ba6fdac1eb6d14d6de69b6a794a2a6194ea5a2'), + 'cc.oc.300': ('cc.oc.300-c66e395b.npz', 'c66e395bce5ad97815b7273556b6225e72b76a0d'), + 'cc.or.300': ('cc.or.300-8b14cde1.npz', '8b14cde1cee9858052430c9ecc6cfa8a16fcab39'), + 'cc.os.300': ('cc.os.300-1a888846.npz', '1a8888460713455b33f9cd2ddd957a1afda6ce45'), + 'cc.pa.300': ('cc.pa.300-185e0f43.npz', '185e0f4361f927fca6c0ff27cf8e1963d627041c'), + 'cc.pam.300': ('cc.pam.300-9ac2a111.npz', '9ac2a1112ffbe96cc9ba226a4cc9331f696db6e9'), + 'cc.pfl.300': ('cc.pfl.300-ca0cddb4.npz', 'ca0cddb47b061380b2522041d9472ca06240b92f'), + 'cc.pl.300': ('cc.pl.300-98c4c23c.npz', '98c4c23c78824a3801d05f9d67a696f9fcc40683'), + 'cc.pms.300': ('cc.pms.300-cbd33047.npz', 'cbd3304720df204b5c8ec576e3abf9306728f5d4'), + 'cc.pnb.300': ('cc.pnb.300-66b69add.npz', '66b69addc40b633af8d06c7d5fd8066ac64c8a2c'), + 'cc.ps.300': ('cc.ps.300-ce11c971.npz', 'ce11c97193ee28cdb53ed72c68b20465a607f957'), + 'cc.pt.300': ('cc.pt.300-985866df.npz', '985866df8e19f52f410e6251b82e3bb4494b0d1d'), + 'cc.qu.300': ('cc.qu.300-0ae2e211.npz', '0ae2e2111ee2e7aab1116b587fe302d86ab3641e'), + 'cc.rm.300': ('cc.rm.300-b2367a7f.npz', 'b2367a7f72e26ab5a844b7d8483a455dc73e1994'), + 'cc.ro.300': ('cc.ro.300-682eda30.npz', '682eda308c2a14041e71d0d0588dd17681294ec5'), + 'cc.ru.300': ('cc.ru.300-0f9c3b90.npz', '0f9c3b905669e0f2a32196bc2a391fe2ef40f5a4'), + 'cc.sa.300': ('cc.sa.300-1d0f5bfd.npz', '1d0f5bfdc3118f9ad77e704a7d808c9231cb13eb'), + 'cc.sah.300': ('cc.sah.300-f8532800.npz', 'f85328005fd0d2f0f09cbdc24760b20180b537c4'), + 'cc.sc.300': ('cc.sc.300-dcb4fb26.npz', 'dcb4fb26874067e449f609f9b2cd138b60606d33'), + 'cc.scn.300': ('cc.scn.300-e72c5de7.npz', 'e72c5de7693efab12d6f9a6508c0a5fa8845bd04'), + 'cc.sco.300': ('cc.sco.300-13d0662d.npz', '13d0662d930c18bf0c8dff7efc01219712d38351'), + 'cc.sd.300': ('cc.sd.300-baf6b5af.npz', 'baf6b5afa56fd93d78175307b3d84dbad85d2710'), + 'cc.sh.300': ('cc.sh.300-b6958a18.npz', 'b6958a18612551a7b325022fcbbdfa2637fb4411'), + 'cc.si.300': ('cc.si.300-c846206f.npz', 'c846206fd77cdb5619752e3cb448dbcc0dac61b6'), + 'cc.sk.300': ('cc.sk.300-4fc2be73.npz', '4fc2be73972027f55f88e288dc3c0ecb2a6fba42'), + 'cc.sl.300': ('cc.sl.300-6b735538.npz', '6b7355380cee5320c42b1aa8d41226a562bc6407'), 
+ 'cc.so.300': ('cc.so.300-f09c2019.npz', 'f09c2019099cd0ae488810a26ceea3437c450711'), + 'cc.sq.300': ('cc.sq.300-c55b576c.npz', 'c55b576c14d06489d6105a7a1d9126e181877030'), + 'cc.sr.300': ('cc.sr.300-887cf83d.npz', '887cf83d9525573407f4206b268e6e3c31266403'), + 'cc.su.300': ('cc.su.300-224e9974.npz', '224e99745371a890c6c944aae94d6c5d637876b5'), + 'cc.sv.300': ('cc.sv.300-c5266ab8.npz', 'c5266ab8ee8a1c093b9437c6b598659c72dd1f7e'), + 'cc.sw.300': ('cc.sw.300-829d1ca0.npz', '829d1ca09c78d52280497fd257883292b53e8bc7'), + 'cc.ta.300': ('cc.ta.300-f8e9bee4.npz', 'f8e9bee4ea31e6952ff582855200ade217c11d9b'), + 'cc.te.300': ('cc.te.300-19b7470c.npz', '19b7470cd59e6a67986267cf9171c6d02340c1d1'), + 'cc.tg.300': ('cc.tg.300-069920da.npz', '069920da67fcaba58e2bd5f4bd505e3cd0b325ab'), + 'cc.th.300': ('cc.th.300-17f28dd9.npz', '17f28dd9fabe987d6017ed4ed0897e3f01ab18a2'), + 'cc.tk.300': ('cc.tk.300-ac4be8fe.npz', 'ac4be8fe39901913ac0ae5b9ec023ffdb2c4ccdd'), + 'cc.tl.300': ('cc.tl.300-919ed791.npz', '919ed791c7cc2f7f1a214611fa12d32187f186c5'), + 'cc.tr.300': ('cc.tr.300-cbd537c6.npz', 'cbd537c606cd41c6a90e2de71ad7dc2902e63363'), + 'cc.tt.300': ('cc.tt.300-c38d9317.npz', 'c38d93176d6fa35658352bd74a0fb81e0edf3d3a'), + 'cc.ug.300': ('cc.ug.300-d99daa00.npz', 'd99daa007459f8129508fdd3ffcab135083309d5'), + 'cc.uk.300': ('cc.uk.300-a20e9dab.npz', 'a20e9dab63d727dc4c03a99caf66c8a194a2f96e'), + 'cc.ur.300': ('cc.ur.300-57ca0636.npz', '57ca06364b7413195d75adda61c00f972ab1a43e'), + 'cc.uz.300': ('cc.uz.300-0f2eef78.npz', '0f2eef78cc68f14a89b6b6a2d6ac4b6613d080e9'), + 'cc.vec.300': ('cc.vec.300-c9bfa76a.npz', 'c9bfa76a5dc2787923f26ea314f3a5a957fcd00c'), + 'cc.vi.300': ('cc.vi.300-44d740f4.npz', '44d740f4044cb2cd8d442327c17b99c359a8ca43'), + 'cc.vls.300': ('cc.vls.300-08e7eaba.npz', '08e7eaba113d6084d1a25958165428486caf7d19'), + 'cc.vo.300': ('cc.vo.300-792d3a79.npz', '792d3a7983bb4560d05932d1a2de35203c6b9479'), + 'cc.wa.300': ('cc.wa.300-bfc87d4c.npz', 'bfc87d4c6738770afd3b9f0f5e7adccd79131a03'), + 'cc.war.300': ('cc.war.300-d8a9082f.npz', 'd8a9082f98ebd8b312cafbc90c50a8ab421dc06b'), + 'cc.xmf.300': ('cc.xmf.300-8bc1fdf1.npz', '8bc1fdf1db5f2716a2d6542c3ae6d5a4abbd2506'), + 'cc.yi.300': ('cc.yi.300-33533193.npz', '33533193c9b710d2ca283e8978b83e3842ea8f5d'), + 'cc.yo.300': ('cc.yo.300-9dc5edde.npz', '9dc5eddeee3354f3f587ecdca73caf01080c399d'), + 'cc.zea.300': ('cc.zea.300-c0a4fb02.npz', 'c0a4fb025aab6774db52d6a845108f5cf8738508'), + 'cc.zh.300': ('cc.zh.300-355cfcaf.npz', '355cfcafe71536226a1737aafb4530c9ba4fd09f'), +} + +FAST_TEXT_BIN_SHA1 = { + 'wiki-news-300d-1M-subword': ('wiki-news-300d-1M-subword-c8853bda.bin', + 'c8853bdae00318097b6337c4631d342879d6b18c'), + 'crawl-300d-2M-subword': ('crawl-300d-2M-subword-e6b07293f.bin', + 'e6b07293f7b0095e3c72c2a12bc09464b69444b0'), + 'cc.af.300': ('cc.af.300-33115ff8.bin', '33115ff8e4c8f439757c819399177f1f58f07f12'), + 'cc.als.300': ('cc.als.300-d6579933.bin', 'd65799331a03895d68a3fbe7611b181d7e7cc916'), + 'cc.am.300': ('cc.am.300-999b3e95.bin', '999b3e95a2c490d7fcab2a6e08074746303d3c17'), + 'cc.an.300': ('cc.an.300-65f5c5b8.bin', '65f5c5b88d1c8181ce60aff4275d14e8a7c4ae53'), + 'cc.ar.300': ('cc.ar.300-44333e53.bin', '44333e5344fe66e78322b05bf53d6047925097ee'), + 'cc.arz.300': ('cc.arz.300-430f08ff.bin', '430f08ffc7f9391ed09c781fabc30baf568b8d47'), + 'cc.as.300': ('cc.as.300-e85d59f6.bin', 'e85d59f6fe2e908b3caab3a8bc9bfd23d6885eb2'), + 'cc.ast.300': ('cc.ast.300-4bdc4520.bin', '4bdc452067cc838e49a1544902941470ff685b12'), + 'cc.az.300': ('cc.az.300-10a62cca.bin', 
'10a62cca45f99e977accd28912ee18c74332080e'), + 'cc.azb.300': ('cc.azb.300-5f148a4f.bin', '5f148a4f2d8feecb217da604ca02fabd0fc112fd'), + 'cc.ba.300': ('cc.ba.300-9310a2c1.bin', '9310a2c11fda72ec87493bd4d65330537911b09a'), + 'cc.bar.300': ('cc.bar.300-35ab084b.bin', '35ab084b3e3972419534bd60197a564c27ca90e0'), + 'cc.bcl.300': ('cc.bcl.300-cf0fb2f8.bin', 'cf0fb2f8f1cbd04ad12bebb9846d7636333de556'), + 'cc.be.300': ('cc.be.300-ac4ef017.bin', 'ac4ef017d975f0649c294f57fb83a3bddf55e137'), + 'cc.bg.300': ('cc.bg.300-a5a375ef.bin', 'a5a375ef5f670c0a0926aa1a8025df3190cfc2d0'), + 'cc.bh.300': ('cc.bh.300-776d7f4d.bin', '776d7f4d102a574cffba45a43b1913b2e23c6d94'), + 'cc.bn.300': ('cc.bn.300-e327bd67.bin', 'e327bd678adbda1b4ace3e020a0329f6146d9f6f'), + 'cc.bo.300': ('cc.bo.300-33174d4f.bin', '33174d4f9ffa87f71c401260a5a6008cdaac61cb'), + 'cc.bpy.300': ('cc.bpy.300-4f8f3598.bin', '4f8f35987bc35b30d11b189f7066c41510331d4e'), + 'cc.br.300': ('cc.br.300-7a48b869.bin', '7a48b869104057ba097c210d847de2f76ec748fb'), + 'cc.bs.300': ('cc.bs.300-8a237bd9.bin', '8a237bd9a530f8feee7feaab583d89028e26be8d'), + 'cc.ca.300': ('cc.ca.300-db0f7120.bin', 'db0f7120e03604c8dcedb57582cee4f7d5d9c90c'), + 'cc.ce.300': ('cc.ce.300-5f8cebac.bin', '5f8cebac11c1fbbb23540655f83d8afe1b1a7760'), + 'cc.ceb.300': ('cc.ceb.300-89a4764f.bin', '89a4764f1ff3dc073a76fda3290f04fe5adf83ac'), + 'cc.ckb.300': ('cc.ckb.300-990d1cef.bin', '990d1cef7d7d36c12f7b9afe2381251169417499'), + 'cc.co.300': ('cc.co.300-836763a0.bin', '836763a0b4e40facde79983f5156d8c6a875dffb'), + 'cc.cs.300': ('cc.cs.300-884c693a.bin', '884c693a557633cd711bcd2888a4088bdc74723e'), + 'cc.cv.300': ('cc.cv.300-0dfbf016.bin', '0dfbf0168205c4ca02fd7f249c8a4f7caec6ea2a'), + 'cc.cy.300': ('cc.cy.300-cb4b9534.bin', 'cb4b953463170fa209c2ce9991bea3a07575e9de'), + 'cc.da.300': ('cc.da.300-6b65b204.bin', '6b65b204ff034184c785678655ffc9fa7b642b34'), + 'cc.de.300': ('cc.de.300-fc6e4385.bin', 'fc6e438502a3b8aadf119d117f85120a3cc28bae'), + 'cc.diq.300': ('cc.diq.300-490f18c4.bin', '490f18c4a8963ca511a8d064da68eddd05f44e7f'), + 'cc.dv.300': ('cc.dv.300-4ffe23d6.bin', '4ffe23d6bd18a1ba6273601e543932abf69d4651'), + 'cc.el.300': ('cc.el.300-7a89986b.bin', '7a89986b681f178b92f3af015aaa4900ba6dd6a6'), + 'cc.eml.300': ('cc.eml.300-8a6221bf.bin', '8a6221bfd1d98e1d14c89db54436d17f505b065d'), + 'cc.en.300': ('cc.en.300-53588c22.bin', '53588c22cac7f8bf504169f671206b60da21d9b2'), + 'cc.eo.300': ('cc.eo.300-5d9aeebb.bin', '5d9aeebb2c19807839ef68b5c5f7897d1e8ddd3a'), + 'cc.es.300': ('cc.es.300-e9f8c041.bin', 'e9f8c04142005cadae449f016e1bebf7ae254307'), + 'cc.et.300': ('cc.et.300-3f4391ed.bin', '3f4391edec8cf6aafcf9857bf465439f00b84a1a'), + 'cc.eu.300': ('cc.eu.300-142f1337.bin', '142f1337d51569f9254a50bdcfe125c028f28bb5'), + 'cc.fa.300': ('cc.fa.300-3d6ad675.bin', '3d6ad6750c27ad94e3498314a985d81bf20130f0'), + 'cc.fi.300': ('cc.fi.300-edbd8e6e.bin', 'edbd8e6e56ab951429911ce7a16d51260773e81c'), + 'cc.fr.300': ('cc.fr.300-35ea5d6b.bin', '35ea5d6b86011a5b85d0671d133acf8aded5fc54'), + 'cc.frr.300': ('cc.frr.300-d87f646a.bin', 'd87f646a6c3559263217941255856da48d159e4d'), + 'cc.fy.300': ('cc.fy.300-fd96db60.bin', 'fd96db60715adb8aaddc85123e14b3d081ef0ad3'), + 'cc.ga.300': ('cc.ga.300-520acbd7.bin', '520acbd7771703194c8e99b28094ea54fa86a3c7'), + 'cc.gd.300': ('cc.gd.300-781ceb1c.bin', '781ceb1cceaa107adea7c0434677c74906c05e4c'), + 'cc.gl.300': ('cc.gl.300-b71ae11d.bin', 'b71ae11d25dfecfa3dfe83e49b24a85037e83b43'), + 'cc.gom.300': ('cc.gom.300-65ba9b91.bin', '65ba9b9172c78600b5fcccd7514e6f5cb6b34750'), + 
'cc.gu.300': ('cc.gu.300-d717959d.bin', 'd717959de35ffdc4be47ea282181f3118fa6af05'), + 'cc.gv.300': ('cc.gv.300-15fb06cb.bin', '15fb06cbfed61516a6014cb04f45e3876b154ae2'), + 'cc.he.300': ('cc.he.300-743fbd32.bin', '743fbd320942c5c48bb4347beb9f24aa5d3b46f4'), + 'cc.hi.300': ('cc.hi.300-75e919aa.bin', '75e919aa43832d6a7f08b8e05d9ddff562ead072'), + 'cc.hif.300': ('cc.hif.300-0c25528b.bin', '0c25528b1f156a61205b96817b0fa9995fa5a2b3'), + 'cc.hr.300': ('cc.hr.300-ab167ebb.bin', 'ab167ebb9a5cdd999500fd1beac2229796923795'), + 'cc.hsb.300': ('cc.hsb.300-62fb0705.bin', '62fb07054f659ce5d9f2e2dda67133649b432611'), + 'cc.ht.300': ('cc.ht.300-292d0eeb.bin', '292d0eebf256811b9cc7d6cd5dccf039d5083cf9'), + 'cc.hu.300': ('cc.hu.300-9d660157.bin', '9d660157bc371de60ead317cce852d506544f0e2'), + 'cc.hy.300': ('cc.hy.300-fa5ac6a1.bin', 'fa5ac6a1eb9e1e4e047bbf8343ea042ded75dd40'), + 'cc.ia.300': ('cc.ia.300-a01758dc.bin', 'a01758dcab7138e5f67e9fd58c23b18e88142b4b'), + 'cc.id.300': ('cc.id.300-609f02da.bin', '609f02daa0c13e544c52314452bf077f6f769019'), + 'cc.ilo.300': ('cc.ilo.300-199068ee.bin', '199068ee56ce25ac16b6ba70c3ae337f8eed9d96'), + 'cc.io.300': ('cc.io.300-80565e7e.bin', '80565e7e7e71b28e247ebb85da1e767cc62e7c38'), + 'cc.is.300': ('cc.is.300-b228019a.bin', 'b228019ac716a60a4da057e787a64b0b53c1a1ec'), + 'cc.it.300': ('cc.it.300-411f0ed7.bin', '411f0ed74448758f25c66699eec582ff5f9d2cc2'), + 'cc.ja.300': ('cc.ja.300-806f7e68.bin', '806f7e68c0c832afb5d70f0c072189fbb4d44108'), + 'cc.jv.300': ('cc.jv.300-af9d3f82.bin', 'af9d3f823a4d87c0dcf85d8b7ba753e1145cc4f9'), + 'cc.ka.300': ('cc.ka.300-7189ff30.bin', '7189ff30be4d8e45b394149b1fd9f3db794e3b42'), + 'cc.kk.300': ('cc.kk.300-621de409.bin', '621de40935e740a063a945402111b3bb7c619c08'), + 'cc.km.300': ('cc.km.300-6410e183.bin', '6410e1832df131f309337416e4ed07a19bd22d9f'), + 'cc.kn.300': ('cc.kn.300-dfcf2d68.bin', 'dfcf2d68bf43a1dcbb5d01e5076db16132e27d8f'), + 'cc.ko.300': ('cc.ko.300-b7990877.bin', 'b7990877d498f084adf300f63b53565bc868b520'), + 'cc.ku.300': ('cc.ku.300-eb62ada0.bin', 'eb62ada0e5bf9cf0535f1fe80d47b136665a8e3a'), + 'cc.ky.300': ('cc.ky.300-01ae0d23.bin', '01ae0d2346e12e30b7ac0422cfd1f3ad6cb701da'), + 'cc.la.300': ('cc.la.300-08e402f3.bin', '08e402f3d0d10da67444890fe15ca09e563f11a6'), + 'cc.lb.300': ('cc.lb.300-c3b5e0a3.bin', 'c3b5e0a3ee790f21f12c17f6302e73ba0ee644f7'), + 'cc.li.300': ('cc.li.300-b7c9c792.bin', 'b7c9c79273458c4110786f4a89c1fa0ec9bcaa80'), + 'cc.lmo.300': ('cc.lmo.300-b7da2fe8.bin', 'b7da2fe85b58341e63379a0e22ccb84a7d2466ba'), + 'cc.lt.300': ('cc.lt.300-73413b3f.bin', '73413b3f0072abf2eb7666795d77e9f0e85b327a'), + 'cc.lv.300': ('cc.lv.300-725f5e2a.bin', '725f5e2a1e66173d73cbf103fceb5f86844e2278'), + 'cc.mai.300': ('cc.mai.300-3de31332.bin', '3de31332d7afde6a93e3d05c05212d27ea538d3d'), + 'cc.mg.300': ('cc.mg.300-0c7757e2.bin', '0c7757e2b3417cea49c679291e6e7bfe8f3653d5'), + 'cc.mhr.300': ('cc.mhr.300-1013afe9.bin', '1013afe9cd1428e5915feebc3a2b189d8d77f9d0'), + 'cc.min.300': ('cc.min.300-1d684a9b.bin', '1d684a9bead229e94c9b538fc9aebb1235c6e68f'), + 'cc.mk.300': ('cc.mk.300-f9ba6f8e.bin', 'f9ba6f8eddb4e577bf44475f831984c70e371719'), + 'cc.ml.300': ('cc.ml.300-bc6a2b1c.bin', 'bc6a2b1c2743bc2749fc8072a8276e2beb3f9a22'), + 'cc.mn.300': ('cc.mn.300-7637ae47.bin', '7637ae47bb925fa77fe82dbe2d20eb3c56b517ee'), + 'cc.mr.300': ('cc.mr.300-3e5eb45e.bin', '3e5eb45e7475dd3115ef5cc91e7b0257989fde18'), + 'cc.mrj.300': ('cc.mrj.300-1593ea78.bin', '1593ea786cfcf70ba4feffda09de2dc2f2bcf80d'), + 'cc.ms.300': ('cc.ms.300-a743adf6.bin', 
'a743adf6420ad8a7aa146d9218f14d1bdf5c3285'), + 'cc.mt.300': ('cc.mt.300-87c3b72c.bin', '87c3b72cfcd0383d7edb8f4075106f51d5e6b03c'), + 'cc.mwl.300': ('cc.mwl.300-5d3cc773.bin', '5d3cc7739062030b9733f5dcbd64fdb1f3d397ec'), + 'cc.my.300': ('cc.my.300-b84b8c93.bin', 'b84b8c93cbb60178ada74e23caf85cf443208739'), + 'cc.myv.300': ('cc.myv.300-ccf32608.bin', 'ccf32608c23258ff0b381b07ee6b4a1374fac29f'), + 'cc.mzn.300': ('cc.mzn.300-00c010f4.bin', '00c010f4c43e3ddb4fcdc29b4a946dafd5196151'), + 'cc.nah.300': ('cc.nah.300-052fcbbc.bin', '052fcbbc5fd6891ef38250b3731987e480ba072d'), + 'cc.nap.300': ('cc.nap.300-aa45c158.bin', 'aa45c158119e095eb186098e47b8037dcca0c847'), + 'cc.nds.300': ('cc.nds.300-c934b13a.bin', 'c934b13ab1a06ea288461b12b9065a13d6e6438e'), + 'cc.ne.300': ('cc.ne.300-7f70c5b9.bin', '7f70c5b9b7b9f598c041c7d8454d1d12e41005df'), + 'cc.new.300': ('cc.new.300-4f8f8762.bin', '4f8f876293ff7096f4fe0ed13148dd116bf57ce5'), + 'cc.nl.300': ('cc.nl.300-fb2cb6e7.bin', 'fb2cb6e75fff23b26d220395b6e2869be083722e'), + 'cc.nn.300': ('cc.nn.300-085e9ef7.bin', '085e9ef79e6bb147d53081a407d598658562dab1'), + 'cc.no.300': ('cc.no.300-d3028680.bin', 'd3028680f0e5458d2272ea14ee56a10820e4e406'), + 'cc.nso.300': ('cc.nso.300-6cc24a78.bin', '6cc24a78780f4da1a18d5da310217dc21acc1977'), + 'cc.oc.300': ('cc.oc.300-8cee765a.bin', '8cee765a77d21044792895b4fe32d56e8287c200'), + 'cc.or.300': ('cc.or.300-64fb17ff.bin', '64fb17ffcd76db9836be3a8c553b0c973232b4fa'), + 'cc.os.300': ('cc.os.300-e5c880f6.bin', 'e5c880f6499b1ea9f9d554d0ea356f914e4c4657'), + 'cc.pa.300': ('cc.pa.300-3673544d.bin', '3673544dea157cfcae180fe5a444457c7bed462e'), + 'cc.pam.300': ('cc.pam.300-1e894611.bin', '1e894611ec170839348af1f767164230f7225c94'), + 'cc.pfl.300': ('cc.pfl.300-ac9babfd.bin', 'ac9babfd17941341fdd06f9dc23aeb3dd315952a'), + 'cc.pl.300': ('cc.pl.300-ea55590b.bin', 'ea55590b385ca9c8ff409a807a4635624c73693e'), + 'cc.pms.300': ('cc.pms.300-523564e9.bin', '523564e993e7925c706c039d444c4048fa19658e'), + 'cc.pnb.300': ('cc.pnb.300-d09b6003.bin', 'd09b6003f0852f698f9589523e717c3be9b0e230'), + 'cc.ps.300': ('cc.ps.300-0cb19e87.bin', '0cb19e874d83664980312fa659f1f7269f1459e4'), + 'cc.pt.300': ('cc.pt.300-e69e6c5b.bin', 'e69e6c5b1ba0e802755c227d2161106caabb6b3d'), + 'cc.qu.300': ('cc.qu.300-f99c269d.bin', 'f99c269de57ff62ec2a580307e239d80d6c0ac1a'), + 'cc.rm.300': ('cc.rm.300-20d2cdcd.bin', '20d2cdcd8fbb49e000eb588e969046f4a4058c9b'), + 'cc.ro.300': ('cc.ro.300-30900544.bin', '309005440433a108017444689d8605709c5bd0ee'), + 'cc.ru.300': ('cc.ru.300-fd892a10.bin', 'fd892a10914cde02c4f1348f9b03d25d45e0d2d3'), + 'cc.sa.300': ('cc.sa.300-91f3b393.bin', '91f3b3931e2b6f4ab6fb092032df7218d400d330'), + 'cc.sah.300': ('cc.sah.300-ea2c7d00.bin', 'ea2c7d00ccfd6b02a928d1f9326986c64cc6e558'), + 'cc.sc.300': ('cc.sc.300-6879c580.bin', '6879c58057dd2eeff50ef158fe53a26ae9050070'), + 'cc.scn.300': ('cc.scn.300-4fb8dad7.bin', '4fb8dad71966dffe5c70efe330a1f881df6227dd'), + 'cc.sco.300': ('cc.sco.300-191f6929.bin', '191f6929dcf10e7d6198529108156b9dd48b23be'), + 'cc.sd.300': ('cc.sd.300-de045844.bin', 'de045844a43e931db0183bdce996110a8593aa63'), + 'cc.sh.300': ('cc.sh.300-529f81f1.bin', '529f81f1d5fec8c2208e976a11406c93de3e8920'), + 'cc.si.300': ('cc.si.300-c66d404a.bin', 'c66d404a889326a06e20c3cfe7eea80d866a1d13'), + 'cc.sk.300': ('cc.sk.300-2ed40f6a.bin', '2ed40f6aa0bcd369d1450bcea9b0cfab16e6d6d0'), + 'cc.sl.300': ('cc.sl.300-da689ced.bin', 'da689cedcfef7914985e6085df7d0c4ef68da657'), + 'cc.so.300': ('cc.so.300-07b7260a.bin', '07b7260aff73dc829cb49eb2bd72584122337b7f'), 
+ 'cc.sq.300': ('cc.sq.300-440b0444.bin', '440b04440edb4d26751c4b13010d0335972808d8'), + 'cc.sr.300': ('cc.sr.300-23f9d7d9.bin', '23f9d7d93f7f4d0bfb73dea047eae5f4d67aef23'), + 'cc.su.300': ('cc.su.300-5d7d8243.bin', '5d7d82438fb71594a31ea46c0b5580ac41b37ece'), + 'cc.sv.300': ('cc.sv.300-6fafdc44.bin', '6fafdc4452a30350ded92e9309bea658f2a31279'), + 'cc.sw.300': ('cc.sw.300-73909439.bin', '7390943941f25d75fe2bdd9894b2d49f32b1a74b'), + 'cc.ta.300': ('cc.ta.300-2e0386c4.bin', '2e0386c410927b53eafbc63b07ab45ff27d6dac9'), + 'cc.te.300': ('cc.te.300-e77f5ea9.bin', 'e77f5ea9e2e726607bdfc6634cf5eac0b9f7d5b5'), + 'cc.tg.300': ('cc.tg.300-ba451c18.bin', 'ba451c18ba027b5e12fb1d129aefe1dc8e10b451'), + 'cc.th.300': ('cc.th.300-5b8a7299.bin', '5b8a729925df8059de767a393e1c9cfef8d94a41'), + 'cc.tk.300': ('cc.tk.300-3f602443.bin', '3f602443ebee0d49cc181f4f21f21bd9590a31d5'), + 'cc.tl.300': ('cc.tl.300-afee5714.bin', 'afee5714639cfe4d145bf1ef6294da065bc65b37'), + 'cc.tr.300': ('cc.tr.300-5ac2d698.bin', '5ac2d698881a330dfeb554c43cd3737605f04e66'), + 'cc.tt.300': ('cc.tt.300-8b467e9d.bin', '8b467e9d9834df62075829c00a007618280a3980'), + 'cc.ug.300': ('cc.ug.300-8dd88596.bin', '8dd88596669dba3822a701baab0fabb5c97ed7cb'), + 'cc.uk.300': ('cc.uk.300-89630e2d.bin', '89630e2d47dac2e0c7a2036e4a6021b5323dd5aa'), + 'cc.ur.300': ('cc.ur.300-997b377c.bin', '997b377c148c50f9e39ccf085e2316e23da54228'), + 'cc.uz.300': ('cc.uz.300-7f1e67da.bin', '7f1e67dae218977ffc2da9c7160a4ac268fa4199'), + 'cc.vec.300': ('cc.vec.300-21e1d068.bin', '21e1d068086afcaecc3fa585b72697eb5ca3aeee'), + 'cc.vi.300': ('cc.vi.300-3c52cba2.bin', '3c52cba2d0c5fbf781eef4068e31f6c53ba7ed8f'), + 'cc.vls.300': ('cc.vls.300-6ffd43bb.bin', '6ffd43bb11eceec01fd5f0d6fefc96f9c14a17f1'), + 'cc.vo.300': ('cc.vo.300-70751ce3.bin', '70751ce3c3867fa9ddc5fd7e435fcc9f1334e796'), + 'cc.wa.300': ('cc.wa.300-eaca4696.bin', 'eaca46968edb721849fc99e15647f9f0f2df3eca'), + 'cc.war.300': ('cc.war.300-a89f1676.bin', 'a89f1676dba8d7beae42828c71b43f749da7cbfd'), + 'cc.xmf.300': ('cc.xmf.300-bb054a64.bin', 'bb054a64c6e287173224fbc7bf19f7a365a5866f'), + 'cc.yi.300': ('cc.yi.300-38a25707.bin', '38a257077225bf544e5ac95d36125ccd26f1e45a'), + 'cc.yo.300': ('cc.yo.300-cecf6563.bin', 'cecf6563658de3082db9a197e8e5382d4a9c5b25'), + 'cc.zea.300': ('cc.zea.300-ac403268.bin', 'ac4032686216c76784c743c33e703109584d0a3f'), + 'cc.zh.300': ('cc.zh.300-bbab54e0.bin', 'bbab54e09aa1de478a02de1c2c7c71c3a8d1f4a9'), + 'wiki.aa': ('wiki.aa-19450d26.bin', '19450d26509c90a0a6f00114fb8d25f58e108d90'), + 'wiki.ab': ('wiki.ab-4c3cc463.bin', '4c3cc4637cf8c75abc4377c40e5238d15e506264'), + 'wiki.ace': ('wiki.ace-1d107b15.bin', '1d107b158e94cff010021775ad4d440035d375c0'), + 'wiki.ady': ('wiki.ady-568aebce.bin', '568aebce6b718077f8faa8441a7bd6ffdbbae821'), + 'wiki.af': ('wiki.af-e4c40da8.bin', 'e4c40da87e6628c32d82a736d0178298b8dc612d'), + 'wiki.ak': ('wiki.ak-c1c81013.bin', 'c1c81013d6ec19ad97fbf145df71f72341ff95f0'), + 'wiki.als': ('wiki.als-a77c3e58.bin', 'a77c3e58b0e13eb2a43dc43b4a519dc9f1a10fe8'), + 'wiki.am': ('wiki.am-18ab66cf.bin', '18ab66cfc9d1ae84679acc2edb4696ac79b77aec'), + 'wiki.an': ('wiki.an-ad4f3886.bin', 'ad4f3886c2f4794349bf45ddc7af700ce3941aa6'), + 'wiki.ang': ('wiki.ang-1053783b.bin', '1053783bb06e698f1705bd7244cad81134dae6ff'), + 'wiki.ar': ('wiki.ar-48738c73.bin', '48738c73a1438a8b615335deff77864556e783eb'), + 'wiki.arc': ('wiki.arc-9e0740db.bin', '9e0740dbf20d39fccd4b7bed2be329b06cf7270b'), + 'wiki.arz': ('wiki.arz-32384b81.bin', '32384b8102596a459436caf02031d29a2fb31b2a'), + 'wiki.as': 
('wiki.as-50765f8c.bin', '50765f8c2a6bb827b843165aa6ab4b25ffb340c2'), + 'wiki.ast': ('wiki.ast-0b3e9cd0.bin', '0b3e9cd0c14ad6e72847f549cf4e2d684c23e2fc'), + 'wiki.av': ('wiki.av-26427883.bin', '2642788376154d8fc10e0f50eefd6ebd956d8211'), + 'wiki.ay': ('wiki.ay-3926337d.bin', '3926337dc3162e5eb369d2c4141aa19d43de7e77'), + 'wiki.az': ('wiki.az-7e1aa3b5.bin', '7e1aa3b54e75aa381104a36d32e9b2b943c58416'), + 'wiki.azb': ('wiki.azb-2a3112fe.bin', '2a3112fedf2eeee37d929f2e24e6ce93651c1e58'), + 'wiki.ba': ('wiki.ba-ddb26431.bin', 'ddb2643124e4d67da1099a6bb7c34ccda9c4d54b'), + 'wiki.bar': ('wiki.bar-76a67a05.bin', '76a67a057330963a89845248718e9c0cf43042e6'), + 'wiki.bat_smg': ('wiki.bat_smg-e6bb57b0.bin', 'e6bb57b0c61e2e4486b29c8437f073893111bc12'), + 'wiki.bcl': ('wiki.bcl-f9b50b40.bin', 'f9b50b40c7398b441db0d023b9546c5623ca81e6'), + 'wiki.be': ('wiki.be-9de13c85.bin', '9de13c852c5e283ac3362d3debc9343dcaf851ed'), + 'wiki.bg': ('wiki.bg-fa0e36e7.bin', 'fa0e36e702301e091dab5b50353f5b93ec99eda1'), + 'wiki.bh': ('wiki.bh-4ea0c4ce.bin', '4ea0c4ce9fb9e7fcec9b102a1d21e51cbea54860'), + 'wiki.bi': ('wiki.bi-d756a260.bin', 'd756a26035e8a4f4c556bfd40470a3504bf47380'), + 'wiki.bjn': ('wiki.bjn-a6bda749.bin', 'a6bda7490f87734de0e9cb0996e6edd68b32d097'), + 'wiki.bm': ('wiki.bm-ddae0aee.bin', 'ddae0aee51f99812ec418ca32ebc7bcb3d7d7afe'), + 'wiki.bn': ('wiki.bn-84a5663f.bin', '84a5663fdeb61edf7a2076b313970d5cef5a1e58'), + 'wiki.bo': ('wiki.bo-ee189a77.bin', 'ee189a7723d6d89088299b2f7b2b7c5c81c1a83c'), + 'wiki.bpy': ('wiki.bpy-7c9cab8b.bin', '7c9cab8bc9317b2c9b782f3684956f62ce5253c9'), + 'wiki.br': ('wiki.br-750d7016.bin', '750d7016b22ac7293214cb26e25562b1fc333165'), + 'wiki.bs': ('wiki.bs-0a2fdd98.bin', '0a2fdd987687a7485f4a1030d013f7b707af2d56'), + 'wiki.bug': ('wiki.bug-3937c2af.bin', '3937c2afd0b6bea60d31b600858002657319dfff'), + 'wiki.bxr': ('wiki.bxr-2b522edb.bin', '2b522edb5fef85bf5c2818f84cc1ffd13e4ddf95'), + 'wiki.ca': ('wiki.ca-fc711e4b.bin', 'fc711e4b5b67de1ebf6cd1f5f99bb09822953781'), + 'wiki.cbk_zam': ('wiki.cbk_zam-b7832a19.bin', 'b7832a1932382f1ce89562042009055a914f7c1e'), + 'wiki.cdo': ('wiki.cdo-e75244c2.bin', 'e75244c2d992bd14c0900546b950a707ee99c79a'), + 'wiki.ce': ('wiki.ce-06dcd3ba.bin', '06dcd3bab08d2caabe5b8e5b7e7fcce233858a43'), + 'wiki.ceb': ('wiki.ceb-35c5cd0f.bin', '35c5cd0f8aeef5a78a9113d0ef856e5470eb6400'), + 'wiki.ch': ('wiki.ch-9bfefcab.bin', '9bfefcab247cef190bbcc711e2982c033df8faab'), + 'wiki.cho': ('wiki.cho-7087a54a.bin', '7087a54a087863d5d7058d35478740cd3fdd716d'), + 'wiki.chr': ('wiki.chr-5e93d639.bin', '5e93d6398b44467f8a57593656127cc3a601d361'), + 'wiki.chy': ('wiki.chy-f119f436.bin', 'f119f43617ba2adbf6f7b10f573c3d6c8daa63a9'), + 'wiki.ckb': ('wiki.ckb-49d6d997.bin', '49d6d99772cac2e25beeb02d9f8f055739d1f369'), + 'wiki.co': ('wiki.co-dd8e6c6c.bin', 'dd8e6c6ce7d740c9a87f10a90b597df2d1a0d883'), + 'wiki.cr': ('wiki.cr-a60b68fc.bin', 'a60b68fc163412717af9d50ce6f8cce90de6089a'), + 'wiki.crh': ('wiki.crh-ae73e838.bin', 'ae73e83881604df8dfeb7ae558be351a57051080'), + 'wiki.cs': ('wiki.cs-a41ff81a.bin', 'a41ff81af6ef6ff5f692d9719c90fac2261b7c21'), + 'wiki.csb': ('wiki.csb-13121dd7.bin', '13121dd7558b3ea692d73c964bc721db6ccb8d9b'), + 'wiki.cu': ('wiki.cu-968dea66.bin', '968dea66e5856289724c4d1c8290c1012ea97df3'), + 'wiki.cv': ('wiki.cv-a87c66dc.bin', 'a87c66dc67a57af38333aeb375dd33e4c42f327f'), + 'wiki.cy': ('wiki.cy-4cc3571e.bin', '4cc3571ed974dd877daa3b5ffaf486725d4436a0'), + 'wiki.da': ('wiki.da-53f0da01.bin', '53f0da01b102ff17499678e0a3876146365b5de7'), + 'wiki.de': 
('wiki.de-2da44d3d.bin', '2da44d3d5ac758a7c1a169f66db4953a020b1df4'), + 'wiki.diq': ('wiki.diq-f31f5534.bin', 'f31f5534d63a6adaf8fce1426fd2b8efa5dbb88e'), + 'wiki.dsb': ('wiki.dsb-1b26e0af.bin', '1b26e0af41d67a4a8a0d4e2e97ff7d8f958daaac'), + 'wiki.dv': ('wiki.dv-32a8ebf5.bin', '32a8ebf59405ad8f9919b18e27f2a2a79bdd3f3f'), + 'wiki.dz': ('wiki.dz-594a371f.bin', '594a371f17bec9ab45514bf5cf26252be7bb8396'), + 'wiki.ee': ('wiki.ee-e0003d72.bin', 'e0003d7287640101f6b9ecb69452cb47afa0d438'), + 'wiki.el': ('wiki.el-9c824bd0.bin', '9c824bd0e0e6888e2bbc065a27e2b45ec4164e8b'), + 'wiki.eml': ('wiki.eml-84490c6b.bin', '84490c6bedef5204c703b5ac5c9ee008147ddaa2'), + 'wiki.en': ('wiki.en-8ca82682.bin', '8ca8268250f81b88119949e0fea5a6b81bcac809'), + 'wiki.eo': ('wiki.eo-7baf04e3.bin', '7baf04e353a607bbddb36f439c4033097d854747'), + 'wiki.es': ('wiki.es-422e6f75.bin', '422e6f7582adff418f527ceb01763296c60e1f31'), + 'wiki.et': ('wiki.et-9cf101e3.bin', '9cf101e3cdb6cdf0b0cb16364769cbb26ef8875a'), + 'wiki.eu': ('wiki.eu-f5637868.bin', 'f56378689a3c14d26b0b1df0483c5b96c9a1ec9f'), + 'wiki.ext': ('wiki.ext-daefc0bc.bin', 'daefc0bc266f14f48a58e5ea632796adb9a36540'), + 'wiki.fa': ('wiki.fa-0b8559e6.bin', '0b8559e6b6506e262de3fb55ea9dec03244badca'), + 'wiki.ff': ('wiki.ff-4d6b11b3.bin', '4d6b11b3d6ccdfa2b06d91c78fd05d7a46106582'), + 'wiki.fi': ('wiki.fi-d1d2f60d.bin', 'd1d2f60da48564f659072dee5b2de1306cf2b590'), + 'wiki.fiu_vro': ('wiki.fiu_vro-fc73c1f3.bin', 'fc73c1f3caec0bff7ca7e3ec006c0c9757d2c8f2'), + 'wiki.fj': ('wiki.fj-d3f97816.bin', 'd3f9781664886cd2e332623615a1db8b2781c925'), + 'wiki.fo': ('wiki.fo-04aeaf7c.bin', '04aeaf7cef283cfdd6766af0c65f6a6b13f6040a'), + 'wiki.fr': ('wiki.fr-ee1dde08.bin', 'ee1dde0800113dcd6124ccb643bd1004184b7559'), + 'wiki.frp': ('wiki.frp-0f64bb1b.bin', '0f64bb1b389e30e3af163e526b72bfc4cab8eb7d'), + 'wiki.frr': ('wiki.frr-576ebf02.bin', '576ebf02e6b9b0acf2ea9da6d35e1a5b2c98648b'), + 'wiki.fur': ('wiki.fur-5ebed3c9.bin', '5ebed3c9e39243479326b288202b9e11d1c434c2'), + 'wiki.fy': ('wiki.fy-811bc386.bin', '811bc3864418110fe914f555216372b2f79b7fb5'), + 'wiki.ga': ('wiki.ga-77b3aa66.bin', '77b3aa6640cc25536b965f5dc512503d8d0c2a6a'), + 'wiki.gag': ('wiki.gag-a732c376.bin', 'a732c376f771309bc196fa8758fd5100c67627f6'), + 'wiki.gan': ('wiki.gan-40a8cfd9.bin', '40a8cfd9889646aa722c32dac2e0a5c76689c6a1'), + 'wiki.gd': ('wiki.gd-0dcdb67d.bin', '0dcdb67d346f0f9abc41554b7ca13190aa8fab16'), + 'wiki.gl': ('wiki.gl-44a91a4c.bin', '44a91a4cf3aaaa68feaa3dc10e16f03cb7d41e53'), + 'wiki.glk': ('wiki.glk-43f0bf43.bin', '43f0bf43b98d2e0477b0671ee9fce7f2cb8da6a3'), + 'wiki.gn': ('wiki.gn-29975179.bin', '2997517997b93f71613b9d008e6d95f68a01ad4b'), + 'wiki.gom': ('wiki.gom-08ba082b.bin', '08ba082b769cf631bfcd100631bfdda77980aa54'), + 'wiki.got': ('wiki.got-bfe0a90d.bin', 'bfe0a90d91343f24c8f9faea325f40395ab8bb8e'), + 'wiki.gu': ('wiki.gu-7d49d055.bin', '7d49d05551425e5661f968b9cc0354e15ea0405f'), + 'wiki.gv': ('wiki.gv-eeea71f6.bin', 'eeea71f64e24c80ff07482825f1ce26be19a69fd'), + 'wiki.ha': ('wiki.ha-87a99090.bin', '87a990900d4a74055303585b4d4a89ab7bf0aa47'), + 'wiki.hak': ('wiki.hak-c652e7ff.bin', 'c652e7ff3d63676307bcbd9f4241fb1c6b8cf7ff'), + 'wiki.haw': ('wiki.haw-4f2d842b.bin', '4f2d842b730bec90d0268bff9ca2c8f41fc33987'), + 'wiki.he': ('wiki.he-9c5eb5cd.bin', '9c5eb5cda37954c481d2053bd4e553bbfd34deb4'), + 'wiki.hi': ('wiki.hi-1ca0898a.bin', '1ca0898af562c2ec90a06860ebb27b5a7b0b8cf4'), + 'wiki.hif': ('wiki.hif-8876acba.bin', '8876acbaf8195b94724179aab516234b06f3812a'), + 'wiki.ho': ('wiki.ho-8bc406a1.bin', 
'8bc406a1defb5703a743a5b424c169b26ffb347c'), + 'wiki.hr': ('wiki.hr-b06384ed.bin', 'b06384ede2bbacae89cb7e94cc9457b0905b410c'), + 'wiki.hsb': ('wiki.hsb-c9cc78b6.bin', 'c9cc78b6e9c1eb2ec3ad57e0f47f2312134de86c'), + 'wiki.ht': ('wiki.ht-da38ff9e.bin', 'da38ff9e8e8e61672422316cba888f2f35fbf9f5'), + 'wiki.hu': ('wiki.hu-a7cd92e6.bin', 'a7cd92e6880b53ee880a7b9de6767a8bd77c9f1a'), + 'wiki.hy': ('wiki.hy-e23e7c36.bin', 'e23e7c36fe63418d46621efee4d4a5248fa7b9ce'), + 'wiki.hz': ('wiki.hz-1a43df11.bin', '1a43df118a1a21f3c9c2d7d43f1cc30900569fdb'), + 'wiki.ia': ('wiki.ia-439e6f2f.bin', '439e6f2f0bf209b7e36e166c8c7e0af6aba34cd2'), + 'wiki.id': ('wiki.id-4ed7d4aa.bin', '4ed7d4aabb54f0af97ca35e31b79711bec2a033e'), + 'wiki.ie': ('wiki.ie-7b0a9761.bin', '7b0a97617ddaf155b898ed9281ff5d2e78e428ef'), + 'wiki.ig': ('wiki.ig-f588d85a.bin', 'f588d85a10ba426ab4825aaaf66d4a87e82b22dd'), + 'wiki.ii': ('wiki.ii-3214212c.bin', '3214212c59f85e40a6fcd1ceb00b36428c1dcc17'), + 'wiki.ik': ('wiki.ik-6bf795cf.bin', '6bf795cf8233e6bfd312f943d301ff5ce70d70b7'), + 'wiki.ilo': ('wiki.ilo-17eb1eff.bin', '17eb1eff170510e85874f83136ead4cc9a2121a6'), + 'wiki.io': ('wiki.io-3f7d30f3.bin', '3f7d30f3abed949dbd089c67e0131f0412fdc84f'), + 'wiki.is': ('wiki.is-e246137d.bin', 'e246137db426f11ec6eb3cbf1bcd1152b0ce0aab'), + 'wiki.it': ('wiki.it-d3019ee2.bin', 'd3019ee2bdafaac7fbb3b9590ce4af35887e3ecc'), + 'wiki.iu': ('wiki.iu-25d55802.bin', '25d558026b0f65cbadd485f63f335faacd192dbd'), + 'wiki.ja': ('wiki.ja-7f4f37fa.bin', '7f4f37fad9c4cff36b221d73bdd8a2d6c5d96518'), + 'wiki.jam': ('wiki.jam-32f692c1.bin', '32f692c1f817d4a5f64e98942e50cfe24356cb43'), + 'wiki.jbo': ('wiki.jbo-2cada509.bin', '2cada509f2e32a01af248c11b76f1e3de313997f'), + 'wiki.jv': ('wiki.jv-2def005b.bin', '2def005b119888a0940302704639c6d47c452839'), + 'wiki.ka': ('wiki.ka-deed211e.bin', 'deed211e28a45a7cec32e9a4a6e0b8015bb5c75a'), + 'wiki.kaa': ('wiki.kaa-31a0f80c.bin', '31a0f80cb3a6e6d1d435489cfbc6a778e6177cbc'), + 'wiki.kab': ('wiki.kab-01dfdf1f.bin', '01dfdf1fcaf0888df6e43bb12e6f680c33181032'), + 'wiki.kbd': ('wiki.kbd-b753b4d4.bin', 'b753b4d4584d3ee158a0e65de8cf1e684b9204a6'), + 'wiki.kg': ('wiki.kg-bd1f271d.bin', 'bd1f271dc512d5e5e6100ddeb09d92ef20f5735b'), + 'wiki.ki': ('wiki.ki-6044fff2.bin', '6044fff24b0951468438c10bd17a67c3042d3df5'), + 'wiki.kj': ('wiki.kj-b4c8d6ad.bin', 'b4c8d6ad7b89e596e92b4ab275758837961c074a'), + 'wiki.kk': ('wiki.kk-3357eee9.bin', '3357eee9c3193859b2a254e7e03167f9e133aefa'), + 'wiki.kl': ('wiki.kl-2ae394ee.bin', '2ae394ee6b61bac8732b97a8b09783e2d49e7a64'), + 'wiki.km': ('wiki.km-ef0eed3b.bin', 'ef0eed3b83c8f54c7f124fd3f8d9c0200d981f8a'), + 'wiki.kn': ('wiki.kn-7ba6e9f2.bin', '7ba6e9f2da563dbd9a657032c8eb99c5b56cdba3'), + 'wiki.ko': ('wiki.ko-ae46f52b.bin', 'ae46f52bd2534a01b601449af7f2eccfa1c06719'), + 'wiki.koi': ('wiki.koi-fa0c7bdd.bin', 'fa0c7bdd198e6afd88b0f735796faa43a8dc33fb'), + 'wiki.kr': ('wiki.kr-08696f70.bin', '08696f7016cc384fd8170e6cf485da5f17d1b7cd'), + 'wiki.krc': ('wiki.krc-ff8a4631.bin', 'ff8a46318685e56cb5cf5eed87ec899c5352b963'), + 'wiki.ks': ('wiki.ks-19d31479.bin', '19d31479421121b35d4f1281846033125f8c4ad7'), + 'wiki.ksh': ('wiki.ksh-97112af8.bin', '97112af8d32fae08270e7a88cd4da3af85e23b35'), + 'wiki.ku': ('wiki.ku-0ef76a6f.bin', '0ef76a6ff71586b155eda3273205b1b7b08cd73d'), + 'wiki.kv': ('wiki.kv-a50d93bf.bin', 'a50d93bf0db87356b6e8f794a37c989392195277'), + 'wiki.kw': ('wiki.kw-77ef77b3.bin', '77ef77b3323955a9825c992868ba83cb80775e8e'), + 'wiki.ky': ('wiki.ky-3f5928bc.bin', '3f5928bca0dc2551c2a9d2b25a90aa957e9742bf'), + 
'wiki.la': ('wiki.la-3fe4d514.bin', '3fe4d51458c4b10878b671c8a7c0550b4e7d6baa'), + 'wiki.lad': ('wiki.lad-753bb201.bin', '753bb20136b3372cdcddf3bf5cbbef2910039490'), + 'wiki.lb': ('wiki.lb-827a0e46.bin', '827a0e46d57af4b8b59cd921a072ee0c0dd713ca'), + 'wiki.lbe': ('wiki.lbe-2934dfa5.bin', '2934dfa5f791f36f46abff43fcafce5ea6726a59'), + 'wiki.lez': ('wiki.lez-f344710f.bin', 'f344710ffb5eb75b59fb97b9a91be1d6c01329eb'), + 'wiki.lg': ('wiki.lg-3c31935e.bin', '3c31935eeb7165c4bdc28a77644c82703e61deb7'), + 'wiki.li': ('wiki.li-a9214c96.bin', 'a9214c96d8df517c68bf95c0e832905e85c7a6a9'), + 'wiki.lij': ('wiki.lij-55fead2b.bin', '55fead2bf98a2ee9e19ba5890ac2d5d82679b21e'), + 'wiki.lmo': ('wiki.lmo-b3a6ce73.bin', 'b3a6ce73aa41892ded828247f22cf41c71c5044d'), + 'wiki.ln': ('wiki.ln-403279f9.bin', '403279f918ab9409df69f1582ad850aa02046696'), + 'wiki.lo': ('wiki.lo-a5903d28.bin', 'a5903d2807e8088df6cef9e88ad9e3fc42dbe17e'), + 'wiki.lrc': ('wiki.lrc-07de075f.bin', '07de075f09cb54767ec7e79114311ca5e58f6d3e'), + 'wiki.lt': ('wiki.lt-62e95727.bin', '62e957278fe6c45f73071c4c91e68d4f02a9fe20'), + 'wiki.ltg': ('wiki.ltg-1351a4b3.bin', '1351a4b3a671bd208f5a6527d9d9047124007f78'), + 'wiki.lv': ('wiki.lv-991eae2a.bin', '991eae2ae2cbe77a89133bb42c5829f15cf59c9e'), + 'wiki.mai': ('wiki.mai-56cee5cb.bin', '56cee5cbeb259bc26927e15baf68c7e0b88786ab'), + 'wiki.map_bms': ('wiki.map_bms-87356c93.bin', '87356c93b166304986df7b5df52728e896d58180'), + 'wiki.mdf': ('wiki.mdf-5c5f1c2c.bin', '5c5f1c2c499bed790b81c1954edad644b1d137aa'), + 'wiki.mg': ('wiki.mg-a1b18be8.bin', 'a1b18be8864cc40d2ef5be324783de32e35d3c40'), + 'wiki.mh': ('wiki.mh-14147c18.bin', '14147c18cd451b7693babf995edeafb1a03a6f01'), + 'wiki.mhr': ('wiki.mhr-204edb17.bin', '204edb1741f0b04566f0e2a9bce46e8a2411ce32'), + 'wiki.mi': ('wiki.mi-ef6b4e35.bin', 'ef6b4e35b9f3dc7aa8803bb422a48c72eaea64ec'), + 'wiki.min': ('wiki.min-9a0a9ebc.bin', '9a0a9ebc607286f43cb6dc8cd06e2fa8c8986f12'), + 'wiki.mk': ('wiki.mk-4100301f.bin', '4100301fafb6ff4c5531187c54c11e9f24679657'), + 'wiki.ml': ('wiki.ml-e08b01b3.bin', 'e08b01b34318ca3eebedcfd6d44c832a0f16dd37'), + 'wiki.mn': ('wiki.mn-f6a269d2.bin', 'f6a269d2225d518acbb45cc3f8c5cbc2a318ee30'), + 'wiki.mo': ('wiki.mo-394239a3.bin', '394239a3d27bed433cad8c70e5bba16a0a30838c'), + 'wiki.mr': ('wiki.mr-37b4bb82.bin', '37b4bb82fa499080e015b3081b85c8c3c5e3e6b9'), + 'wiki.mrj': ('wiki.mrj-37ca837d.bin', '37ca837dde630945217e46ff4065e5a2e795bc93'), + 'wiki.ms': ('wiki.ms-5b4ddb79.bin', '5b4ddb79ab02d5638ee69d60a6191b65fda5ce86'), + 'wiki.mt': ('wiki.mt-b3323fa1.bin', 'b3323fa1038143a3e791bc352e6964f197213af5'), + 'wiki.mus': ('wiki.mus-f005e240.bin', 'f005e2408799819e6337aa62afb8d92e713f60d6'), + 'wiki.mwl': ('wiki.mwl-f0838820.bin', 'f0838820a819b3de537d6e2e01c543ffea2cf2fc'), + 'wiki.my': ('wiki.my-d64aad4e.bin', 'd64aad4e6f5c6c7319d794e657e96718c752277f'), + 'wiki.myv': ('wiki.myv-0464a2f4.bin', '0464a2f4ffde9637f93d59f2004e350b15b1f4eb'), + 'wiki.mzn': ('wiki.mzn-872ffc98.bin', '872ffc987bff427a3bd7edab5850658f07611fed'), + 'wiki.na': ('wiki.na-d7b31b79.bin', 'd7b31b7991767f4d328347dcc871d11dffdd629d'), + 'wiki.nah': ('wiki.nah-712f2493.bin', '712f2493b87f6ce3f8991f842e91ae18f93e7357'), + 'wiki.nap': ('wiki.nap-11b97cb5.bin', '11b97cb5d41fa479322a569dbaf79b7e3c26f819'), + 'wiki.nds': ('wiki.nds-305ba618.bin', '305ba618d97b6b6b334a28df888ffe624ae4b9c9'), + 'wiki.nds_nl': ('wiki.nds_nl-fb880749.bin', 'fb880749b3326a03de850e064a65f89a4e069e40'), + 'wiki.ne': ('wiki.ne-72e739fd.bin', '72e739fd99cfda922ff3b8262550df087d59581d'), + 'wiki.new': 
('wiki.new-8f66c97a.bin', '8f66c97ab6e30c0ee9ff350818e7df2da18636a0'), + 'wiki.ng': ('wiki.ng-bc569540.bin', 'bc569540c4fd3d4ddb6ab7bf1c63d3e576b0d0fa'), + 'wiki.nl': ('wiki.nl-40d9776a.bin', '40d9776a8466d3eebdc9c8e6b385bdd3271ea126'), + 'wiki.nn': ('wiki.nn-d4e5918c.bin', 'd4e5918cde9a4354dcc3c4174ae4fca97392b250'), + 'wiki.no': ('wiki.no-18db3b96.bin', '18db3b96776c453b37c343c663516a9b937635ae'), + 'wiki.nov': ('wiki.nov-3056416e.bin', '3056416e12e7d08fc4fdd85f0173a6dd72c40ab6'), + 'wiki.nrm': ('wiki.nrm-1a0a8daf.bin', '1a0a8daf48da8476dcef8018bae70e20dd027e36'), + 'wiki.nso': ('wiki.nso-977d9079.bin', '977d9079818970d709f50a1cd6a5917875b805fc'), + 'wiki.nv': ('wiki.nv-4dfd868a.bin', '4dfd868ae77b006a34e873180698e7152aef910e'), + 'wiki.ny': ('wiki.ny-ad7f85e7.bin', 'ad7f85e700fd386d88e5c82a53773afd493c7a3d'), + 'wiki.oc': ('wiki.oc-289fe81d.bin', '289fe81dbadcbd7ea052545b73c46206c34864ec'), + 'wiki.olo': ('wiki.olo-9721fefd.bin', '9721fefd1e1b1c3e51a0d8543f8c5473984fe2c4'), + 'wiki.om': ('wiki.om-71345bb8.bin', '71345bb8c3186e27bd4d1d4808a8cb097903e1bf'), + 'wiki.or': ('wiki.or-f55c1ca8.bin', 'f55c1ca84126f315312e2a7ed45e32ab91b8e636'), + 'wiki.os': ('wiki.os-c0148462.bin', 'c0148462d638e5dc94cef3ec607d8b6ef9672156'), + 'wiki.pa': ('wiki.pa-a0903f78.bin', 'a0903f78b47f32e11e4c62173bbedebb64a510e0'), + 'wiki.pag': ('wiki.pag-00e02108.bin', '00e021082f3a14b5603be9cc201b145405498f2a'), + 'wiki.pam': ('wiki.pam-dfe4c21c.bin', 'dfe4c21c29e4022f1ab848c060c3acb697d279f9'), + 'wiki.pap': ('wiki.pap-a3766e93.bin', 'a3766e936218cbd5e8ac0c279209960a7908b15a'), + 'wiki.pcd': ('wiki.pcd-986d648c.bin', '986d648ce805d15c3fa0e706c7a96c5d02e22298'), + 'wiki.pdc': ('wiki.pdc-031d7283.bin', '031d7283179b75f0f62e427b58eef4ecc7211243'), + 'wiki.pfl': ('wiki.pfl-826fc525.bin', '826fc525d7542d9c9a6ac3b72cfb01f438f6d33f'), + 'wiki.pi': ('wiki.pi-c0ddd653.bin', 'c0ddd653f5c89cea4eca90de1b0a8f78111e77a4'), + 'wiki.pih': ('wiki.pih-a71523db.bin', 'a71523dba29a0f98e6fd52c4bac3f984cdfcd0b2'), + 'wiki.pl': ('wiki.pl-b600fdc2.bin', 'b600fdc2f1d62c062ec6028d0652ede539081381'), + 'wiki.pms': ('wiki.pms-7e429bdf.bin', '7e429bdf7094257c372008d5ef1638f813f60da0'), + 'wiki.pnb': ('wiki.pnb-08ab3d18.bin', '08ab3d185ebc76a6636cb4da169a6904361ba048'), + 'wiki.pnt': ('wiki.pnt-14458f30.bin', '14458f3083992b5cddd87f2b9cbcd152377ff97b'), + 'wiki.ps': ('wiki.ps-b86afb94.bin', 'b86afb94162924807dc8e4c75d80fcfb71acda76'), + 'wiki.pt': ('wiki.pt-f971330c.bin', 'f971330cf54742dd17d1cb6245c1b3b468b42978'), + 'wiki.qu': ('wiki.qu-7cbfa5fd.bin', '7cbfa5fdb2d41e0569209c4a297a54b80743bc17'), + 'wiki.rm': ('wiki.rm-15cc9e74.bin', '15cc9e74f2ea0db93f8914a357e9b60840607b08'), + 'wiki.rmy': ('wiki.rmy-fdc5afea.bin', 'fdc5afeae1f185a2a551e091bf3d7731ba92223d'), + 'wiki.rn': ('wiki.rn-9859ac6f.bin', '9859ac6f33ae14670772f0fa06957127ffba60d8'), + 'wiki.ro': ('wiki.ro-be264092.bin', 'be264092886bd9cdfa3e0fe43896ee7e7b8039ad'), + 'wiki.roa_rup': ('wiki.roa_rup-ab463b13.bin', 'ab463b13e26f3c9ab99598ddcb722e70cf5bf35a'), + 'wiki.roa_tara': ('wiki.roa_tara-073b0d19.bin', '073b0d192290b9c45463ddc11f043ff1ec42371e'), + 'wiki.ru': ('wiki.ru-5fbe8dd5.bin', '5fbe8dd58e6f6f58fb430e948e4d6a3b6cd6b603'), + 'wiki.rue': ('wiki.rue-1d09f6c5.bin', '1d09f6c53dc669f2fed52b511ba551c9ebb5caa4'), + 'wiki.rw': ('wiki.rw-e5e6abb9.bin', 'e5e6abb981ce2898a10f19c092033111d881917b'), + 'wiki.sa': ('wiki.sa-a651ca36.bin', 'a651ca363e7b431f9a2340557fa54163ff523764'), + 'wiki.sah': ('wiki.sah-0a0642a3.bin', '0a0642a36a863726bf8009957505101b74e2d8ef'), + 'wiki.sc': 
('wiki.sc-9a6d6d82.bin', '9a6d6d82cb0bf57cf0b05ecaadfa0614712ddca7'), + 'wiki.scn': ('wiki.scn-2beb5569.bin', '2beb556906fc10a70a5dc5d8359c57ddbb567620'), + 'wiki.sco': ('wiki.sco-99406b00.bin', '99406b00be5c98fc69e561b2cd6bf29549514d5d'), + 'wiki.sd': ('wiki.sd-f9bafcac.bin', 'f9bafcacfc1b99aa5235ef6ff23f0532a638a89d'), + 'wiki.se': ('wiki.se-4ec410b7.bin', '4ec410b7963d85535bbc40429e3e74597aa102a3'), + 'wiki.sg': ('wiki.sg-05416c3e.bin', '05416c3e636d6b4612966de3476262760dc1b6af'), + 'wiki.sh': ('wiki.sh-d1262628.bin', 'd126262872b90d3d9d3d4a91c009a1a4a8ba6c38'), + 'wiki.si': ('wiki.si-cba0721f.bin', 'cba0721fa9b6d6ef470da3f852c475cef2af793e'), + 'wiki.simple': ('wiki.simple-67424112.bin', '67424112bc1879dc1288c9081ac3292aed1027e9'), + 'wiki.sk': ('wiki.sk-42878f2a.bin', '42878f2afd7633bdb5c2b4f742c4d1b13377eb00'), + 'wiki.sl': ('wiki.sl-3df13aa9.bin', '3df13aa9cf7c2c5a8b7a655e129cca46bc7635d7'), + 'wiki.sm': ('wiki.sm-0b04a63e.bin', '0b04a63e3cdc04722530f7b1d60dfa01368191b8'), + 'wiki.sn': ('wiki.sn-30f67c3f.bin', '30f67c3f26a074888813ebf67d9ae9c4d5863d2f'), + 'wiki.so': ('wiki.so-8466ae2e.bin', '8466ae2ea133cac06cceb240934e826a9e304d89'), + 'wiki.sq': ('wiki.sq-4068a1bc.bin', '4068a1bcce63aee6af1aabfd8139dc6508c017cf'), + 'wiki.sr': ('wiki.sr-082e3132.bin', '082e3132151777aa9f20f005dbb59b78f415cb5f'), + 'wiki.srn': ('wiki.srn-b7f15abf.bin', 'b7f15abf361445aa2bb621b71f7e70bbf88cb547'), + 'wiki.ss': ('wiki.ss-d84b4c58.bin', 'd84b4c58fc78d5bfd8e0ae3909fa6ad8966ed5a2'), + 'wiki.st': ('wiki.st-41bbfe88.bin', '41bbfe88fcc624496d35ed1801da3c51dd444c43'), + 'wiki.stq': ('wiki.stq-7869010b.bin', '7869010b7edd26d51849f676ef1a4f39dc83ec55'), + 'wiki.su': ('wiki.su-522015ae.bin', '522015ae575b367ba73baed722168c6035bf1a4f'), + 'wiki.sv': ('wiki.sv-2b51c008.bin', '2b51c00867be3483aa123b82af93bab8fc596886'), + 'wiki.sw': ('wiki.sw-b19842d4.bin', 'b19842d48289ca2a81b7b22e34b467cf8dfb9a26'), + 'wiki.szl': ('wiki.szl-19f27783.bin', '19f277833eea6f5d878114e54dc097764f963efb'), + 'wiki.ta': ('wiki.ta-0746a2ef.bin', '0746a2ef062efecd5268146d64c060ef1e9144f2'), + 'wiki.tcy': ('wiki.tcy-0ff14866.bin', '0ff1486683d6b736a04305e8625f8d3850c6a3a8'), + 'wiki.te': ('wiki.te-0dc2bd97.bin', '0dc2bd97553cc270d3d4154c35fcfce6998a88bd'), + 'wiki.tet': ('wiki.tet-670940fc.bin', '670940fc5658e21740f8d9ce442e6133b5e3e260'), + 'wiki.tg': ('wiki.tg-488952ce.bin', '488952ce406c4deafd1d5dfb1336f49341faef55'), + 'wiki.th': ('wiki.th-7332f0f8.bin', '7332f0f8c42f5b60cd4e2676f7b806cc942a4ab0'), + 'wiki.ti': ('wiki.ti-398f25bb.bin', '398f25bb3685800cc46aab59b084726c8cb6cd74'), + 'wiki.tk': ('wiki.tk-c281e62a.bin', 'c281e62aa025f18f4c0006939cf42446b12cd0f3'), + 'wiki.tl': ('wiki.tl-929953fd.bin', '929953fd9b07fbad5bc79cd62703b3fb1a59ba1a'), + 'wiki.tn': ('wiki.tn-ea8fac80.bin', 'ea8fac801735804eb98971cf823c89af496be814'), + 'wiki.to': ('wiki.to-bcc965ca.bin', 'bcc965cace5fe8c359b97a9389188ea04077c1bf'), + 'wiki.tpi': ('wiki.tpi-efd371af.bin', 'efd371af61d7ce4d1b7922e33790b72bc8f3c4d5'), + 'wiki.tr': ('wiki.tr-9c2c0a70.bin', '9c2c0a7008cd89cbe211a909431f7896a630f686'), + 'wiki.ts': ('wiki.ts-8958dcb3.bin', '8958dcb31274f4fa1ac96cff7de2ad12b0594ff2'), + 'wiki.tt': ('wiki.tt-4835092f.bin', '4835092f1ee35ece995f73712c11b128d7e6d8e3'), + 'wiki.tum': ('wiki.tum-20684599.bin', '206845992b8ec51f673408f11c6383a7ddc75faf'), + 'wiki.tw': ('wiki.tw-6fcf965e.bin', '6fcf965e4f74a6bccac892ebebb7e305e4d747c3'), + 'wiki.ty': ('wiki.ty-cf5c1022.bin', 'cf5c10220157851050babc0086af7e0812be9fb4'), + 'wiki.tyv': ('wiki.tyv-dd1284c9.bin', 
'dd1284c9f847ebcc171616e0e60b5d97bdfa1808'), + 'wiki.udm': ('wiki.udm-85b368ec.bin', '85b368ec3981bd3fb0599f6dc8287ff8c9dac2aa'), + 'wiki.ug': ('wiki.ug-57acb78f.bin', '57acb78f7370a0766145cd9ca6fc11e0543f1385'), + 'wiki.uk': ('wiki.uk-747aa3ab.bin', '747aa3ab24fd860f6304fc021ef281facc4c039d'), + 'wiki.ur': ('wiki.ur-b89e16db.bin', 'b89e16dbc9ec62f9f91bce2a44447f7cdc830453'), + 'wiki.uz': ('wiki.uz-e8a38b09.bin', 'e8a38b091ff4ecdb8ded1a20bb182ba3ef73c6aa'), + 'wiki.ve': ('wiki.ve-5c1b48c3.bin', '5c1b48c309cb43717bcaeecb33e26cdcfd46cdec'), + 'wiki.vec': ('wiki.vec-34e6f3d9.bin', '34e6f3d94843381be488b37eb06600aff3ab3d6b'), + 'wiki.vep': ('wiki.vep-660b5d1e.bin', '660b5d1e7f74f97fe18f310736bf8950d4696c67'), + 'wiki.vi': ('wiki.vi-99ab162f.bin', '99ab162f703a0a6fb03a64142b6d47013e0314a3'), + 'wiki.vls': ('wiki.vls-07e0742e.bin', '07e0742e4946f2ae057be2ce84e327053a916c91'), + 'wiki.vo': ('wiki.vo-562905a3.bin', '562905a3c920bdf4b9bf4a9fb1b3a1293883a905'), + 'wiki.wa': ('wiki.wa-727a61c7.bin', '727a61c7115e7093ce8e2c1dc1f3e164eb2654cc'), + 'wiki.war': ('wiki.war-bcad746f.bin', 'bcad746f062fa166ceebc47df3449f4752448be5'), + 'wiki.wo': ('wiki.wo-5a3815d8.bin', '5a3815d82f535f6d35cd7058cc757117f2a41ac2'), + 'wiki.wuu': ('wiki.wuu-b114fb8d.bin', 'b114fb8d1ca2ba54ec3e4295f93ae6f33b7eed16'), + 'wiki.xal': ('wiki.xal-45449f93.bin', '45449f936d0ea7a57e7b287702fd09a715180efb'), + 'wiki.xh': ('wiki.xh-7b5a743d.bin', '7b5a743dca7f9ed9d0a8760169e681ccc5d00e54'), + 'wiki.xmf': ('wiki.xmf-755644bf.bin', '755644bfb0223e56e4d3ef7ae113f057b9143a18'), + 'wiki.yi': ('wiki.yi-191d3a6a.bin', '191d3a6a6676566f21ddf2b9a645e7dadd719600'), + 'wiki.yo': ('wiki.yo-2629d292.bin', '2629d292d16a7a6f80a12c1ca446b7ba4be56508'), + 'wiki.za': ('wiki.za-0f26bdfb.bin', '0f26bdfb74dacc74b3690d8b00c9ce92e2598152'), + 'wiki.zea': ('wiki.zea-f40226d3.bin', 'f40226d3c27c013b0690a3dba70b36cec1233aa6'), + 'wiki.zh': ('wiki.zh-69e1fa5f.bin', '69e1fa5f3a7a1625789e5eeb47d3bfe72506f403'), + 'wiki.zh_classical': ('wiki.zh_classical-ac01671b.bin', + 'ac01671b3fd0baadbc5fd850132ef4c9891b7e55'), + 'wiki.zh_min_nan': ('wiki.zh_min_nan-5b773206.bin', '5b773206277e3c47fdfb110e43797bad644c1feb'), + 'wiki.zh_yue': ('wiki.zh_yue-2e504f07.bin', '2e504f07395f4ac5f732d8c3ee3594b431c2eb64'), + 'wiki.zu': ('wiki.zu-642b157b.bin', '642b157b3b799cfb50b13eda0b7d156698cdde83'), +} + diff --git a/src/gluonnlp/embedding/embed_loader.py b/src/gluonnlp/embedding/embed_loader.py new file mode 100644 index 0000000000..5a349595eb --- /dev/null +++ b/src/gluonnlp/embedding/embed_loader.py @@ -0,0 +1,320 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
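Editor's note (not part of the patch): the `.bin` entries above belong to the checksum registry (`C.FAST_TEXT_BIN_SHA1` in the loader below) that maps each embedding source name to a `(file_name, sha1_hash)` pair. A minimal sketch of how such a table is consumed is shown here; the URL prefix and cache directory are placeholders, while the actual loader wraps the same logic in its `_get_file_url`/`_get_file_path` helpers.

```python
# Illustrative only: consuming a (source -> (file_name, sha1)) registry such as
# FAST_TEXT_BIN_SHA1.  The URL prefix and cache directory below are assumptions;
# the real loader builds the URL via _get_repo_file_url instead.
import os
from mxnet.gluon.utils import download, check_sha1

def fetch_pretrained_file(source, registry,
                          url_prefix='https://example-repo/gluon/embeddings/fasttext/',
                          cache_dir='~/.gluonnlp/embedding/fasttext'):
    """Download the registered file for `source` and verify its SHA-1 checksum."""
    file_name, file_hash = registry[source]
    file_path = os.path.join(os.path.expanduser(cache_dir), file_name)
    # Only (re-)download when the cached copy is missing or fails the checksum.
    if not os.path.exists(file_path) or not check_sha1(file_path, file_hash):
        download(url_prefix + file_name, file_path, sha1_hash=file_hash)
    return file_path

# e.g. fetch_pretrained_file('cc.en.300', FAST_TEXT_BIN_SHA1)
```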
+
+# pylint: disable=consider-iterating-dictionary, too-many-lines
+"""Load token embedding"""
+
+__all__ = [
+    'list_sources', 'load_embeddings', 'get_fasttext_model'
+]
+
+import io
+import logging
+import os
+import warnings
+import fasttext
+
+import numpy as np
+from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
+
+from . import _constants as C
+from ..base import get_home_dir
+from ..data import Vocab
+
+text_embedding_reg = {
+    'glove' : C.GLOVE_NPZ_SHA1,
+    'word2vec' : C.WORD2VEC_NPZ_SHA1,
+    'fasttext' : C.FAST_TEXT_NPZ_SHA1
+}
+def list_sources(embedding_name=None):
+    """Get valid token embedding names and their pre-trained file names.
+
+    Parameters
+    ----------
+    embedding_name : str or None, default None
+        The pre-trained token embedding name.
+
+    Returns
+    -------
+    dict or list:
+        A list of all the valid pre-trained token embedding file names (`source`) for the
+        specified token embedding name (`embedding_name`). If the text embedding name is set to
+        None, returns a dict mapping each valid token embedding name to a list of valid pre-trained
+        files (`source`).
+    """
+    if embedding_name is not None:
+        embedding_name = embedding_name.lower()
+        if embedding_name == 'fasttext.bin':
+            return list(C.FAST_TEXT_BIN_SHA1.keys())
+        if embedding_name not in text_embedding_reg:
+            raise KeyError('Cannot find `embedding_name` {}. Use '
+                           '`list_sources(embedding_name=None).keys()` to get all the valid '
+                           'embedding names.'.format(embedding_name))
+        return list(text_embedding_reg[embedding_name].keys())
+    else:
+        return {embedding_name: list(embedding_cls.keys())
+                for embedding_name, embedding_cls in text_embedding_reg.items()}
+
+def _append_unk_vecs(matrix, vocab_size):
+    append_dim = vocab_size - len(matrix)
+    assert append_dim in [0, 1], "Error occurs in the embedding file."
+    if append_dim == 1:
+        # there is no unknown_token in the embedding file
+        mean = np.mean(matrix, axis=0, keepdims=True)
+        std = np.std(matrix, axis=0, keepdims=True)
+        vecs = np.random.randn(append_dim, matrix.shape[-1]).astype('float32') * std + mean
+        return np.concatenate([matrix, vecs], axis=0)
+    return matrix
+
+def _load_embedding_txt(file_path, vocab, unknown_token):
+    if vocab is not None:
+        result = np.zeros(len(vocab), dtype=bool)
+    else:
+        result = []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        line = f.readline().strip()
+        parts = line.split()
+        start_idx = 0
+        if len(parts) == 2:
+            dim = int(parts[1])
+            start_idx += 1
+        else:
+            dim = len(parts) - 1
+            f.seek(0)
+        if vocab is None:
+            matrix = []
+        else: matrix = np.random.randn(len(vocab), dim).astype('float32')
+        for idx, line in enumerate(f, start_idx):
+            try:
+                parts = line.strip().split()
+                word = ''.join(parts[:-dim])
+                nums = parts[-dim:]
+                if vocab is None:
+                    result.append(word)
+                    matrix.append(np.fromstring(' '.join(nums), sep=' ', dtype='float32', count=dim))
+                else:
+                    if word == unknown_token and vocab.unk_token is not None:
+                        word = vocab.unk_token
+                    if word in vocab:
+                        index = vocab[word]
+                        matrix[index] = np.fromstring(' '.join(nums), sep=' ', dtype='float32', count=dim)
+                        result[index] = True
+            except Exception as e:
+                logging.error("Error occurred at line {}.".format(idx))
+                raise e
+    if vocab is None:
+        result = Vocab(result, unk_token=unknown_token)
+        matrix = _append_unk_vecs(np.array(matrix), len(result))
+    return matrix, result
+
+def _load_embedding_npz(file_path, vocab, unknown):
+    if vocab is not None:
+        result = np.zeros(len(vocab), dtype=bool)
+    else:
+        result = []
+    npz_dict = np.load(file_path, allow_pickle=True)
+    unknown_token = npz_dict['unknown_token']
+    if not unknown_token:
+        unknown_token = unknown
+    else:
+        if isinstance(unknown_token, np.ndarray):
+            if unknown_token.dtype.kind == 'S':
+                unknown_token = unknown_token.tobytes().decode()
+            else:
+                unknown_token = str(unknown_token)
+        if unknown != unknown_token:
+            warnings.warn("You may not have assigned the correct unknown token in the pretrained file. "
+                          "Use {} as the unknown mark.".format(unknown_token))
+
+    idx_to_token = npz_dict['idx_to_token'].tolist()
+    token2idx = {x : i for i, x in enumerate(idx_to_token)}
+    idx_to_vec = npz_dict['idx_to_vec']
+    if vocab is None:
+        result = Vocab(idx_to_token, unk_token=unknown_token)
+        idx_to_vec = _append_unk_vecs(idx_to_vec, len(result))
+        return idx_to_vec, result
+    else:
+        matrix = np.random.randn(len(vocab), idx_to_vec.shape[-1]).astype('float32')
+        for i, token in enumerate(vocab.all_tokens):
+            if token == vocab.unk_token and unknown_token is not None:
+                word = unknown_token
+            else:
+                word = token
+            if word in token2idx:
+                index = token2idx[word]
+                matrix[i] = idx_to_vec[index]
+                result[i] = True
+        return matrix, result
+
+def _get_file_url(cls_name, file_name):
+    namespace = 'gluon/embeddings/{}'.format(cls_name)
+    return _get_repo_file_url(namespace, file_name)
+
+def _get_file_path(cls_name, file_name, file_hash):
+    root_path = os.path.expanduser(os.path.join(get_home_dir(), 'embedding'))
+    embedding_dir = os.path.join(root_path, cls_name)
+    url = _get_file_url(cls_name, file_name)
+    file_path = os.path.join(embedding_dir, file_name)
+    if not os.path.exists(file_path) or not check_sha1(file_path, file_hash):
+        logging.info('Embedding file {} is not found. Downloading from Gluon Repository. '
+                     'This may take some time.'.format(file_name))
+        download(url, file_path, sha1_hash=file_hash)
+    return file_path
+
+def _check_and_get_path(pretrained_name_or_dir):
+    if os.path.exists(pretrained_name_or_dir):
+        return pretrained_name_or_dir
+    for cls_name, embedding_cls in text_embedding_reg.items():
+        if pretrained_name_or_dir in embedding_cls:
+            source = pretrained_name_or_dir
+            file_name, file_hash = embedding_cls[source]
+            return _get_file_path(cls_name, file_name, file_hash)
+
+    return None
+
+def load_embeddings(vocab=None, pretrained_name_or_dir='glove.6B.50d', unknown='',
+                    unk_method=None):
+    """Load pretrained word embeddings for building an embedding matrix for a given Vocab.
+
+    This function supports loading GloVe, Word2Vec and FastText word embeddings from remote sources.
+    You can also load your own embedding file (txt with Word2Vec or GloVe format) from a given file path.
+
+    GloVe: an unsupervised learning algorithm for obtaining vector representations for words.
+    Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and
+    the resulting representations showcase interesting linear substructures of the word vector
+    space. (Source from https://nlp.stanford.edu/projects/glove/)
+    Available sources:
+    ['glove.42B.300d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d', 'glove.6B.50d', \
+    'glove.840B.300d', 'glove.twitter.27B.100d', 'glove.twitter.27B.200d', \
+    'glove.twitter.27B.25d', 'glove.twitter.27B.50d']
+    Word2Vec: an unsupervised learning algorithm for obtaining vector representations for words.
+    Training is performed with the continuous bag-of-words or skip-gram architecture for computing vector
+    representations of words.
+    Available sources:
+    ['GoogleNews-vectors-negative300', 'freebase-vectors-skipgram1000',
+    'freebase-vectors-skipgram1000-en']
+    FastText: an open-source, free, lightweight library that allows users to learn text
+    representations and text classifiers. It works on standard, generic hardware. Models can later
+    be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/)
+    Available sources:
+    ['cc.af.300', ..., 'cc.en.300', ..., 'crawl-300d-2M', 'crawl-300d-2M-subword', \
+    'wiki-news-300d-1M', 'wiki-news-300d-1M-subword', \
+    'wiki.aa', ..., 'wiki.multi.ar', ..., 'wiki.zu']
+    Detailed sources can be found via `gluonnlp.embedding.list_sources('FastText')`
+    For 'wiki.multi' embedding:
+
+    Word Translation Without Parallel Data
+    Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herve Jegou.
+    https://arxiv.org/abs/1710.04087
+
+    Parameters
+    ----------
+    vocab : gluonnlp.data.Vocab object, default None
+        A vocabulary on which an embedding matrix is built.
+        If `vocab` is `None`, then all tokens in the pretrained file will be used.
+    pretrained_name_or_dir : str, default 'glove.6B.50d'
+        A file path for a pretrained embedding file or the name of the pretrained token embedding file.
+        This method will first check whether it is a file path.
+        If not, the method will load from cache or download.
+    unknown : str, default ''
+        The unknown token used in the pretrained file.
+    unk_method : Callable, default None
+        A function which receives `List[str]` and returns `numpy.ndarray`.
+        The input of the function is a list of words that are in the `vocab`,
+        but do not occur in the pretrained file.
+        The function should return an embedding matrix for these words.
+        If `unk_method` is None, vectors for these words are generated
+        by sampling from a normal distribution with the same mean and std as the embedding matrix.
+        It is only used when `vocab` is not `None`.
+
+    Returns
+    -------
+    If `vocab` is `None`
+        numpy.ndarray:
+            An embedding matrix in the pretrained file.
+        gluonnlp.data.Vocab:
+            The vocabulary in the pretrained file.
+    Otherwise,
+        numpy.ndarray:
+            An embedding matrix for the given vocabulary.
+    """
+    assert isinstance(vocab, (Vocab, type(None))), "Only gluonnlp.data.Vocab is supported."
+    file_path = _check_and_get_path(pretrained_name_or_dir)
+    if file_path is None:
+        raise ValueError("Cannot recognize `{}`".format(pretrained_name_or_dir))
+
+    if file_path.endswith('.npz'):
+        matrix, result = _load_embedding_npz(file_path, vocab, unknown)
+    else:
+        matrix, result = _load_embedding_txt(file_path, vocab, unknown)
+    dim = matrix.shape[-1]
+    logging.info("Pre-trained embedding dim: {}".format(dim))
+    if vocab is None:
+        return matrix, result
+    else:
+        hit_flags = result
+        total_hits = sum(hit_flags)
+        logging.info("Found {} out of {} words in the pretrained embedding.".format(total_hits, len(vocab)))
+        if total_hits != len(vocab):
+            if unk_method is None:
+                found_vectors = matrix[hit_flags]
+                mean = np.mean(found_vectors, axis=0, keepdims=True)
+                std = np.std(found_vectors, axis=0, keepdims=True)
+                unfound_vec_num = len(vocab) - total_hits
+                r_vecs = np.random.randn(unfound_vec_num, dim).astype('float32') * std + mean
+                matrix[hit_flags == False] = r_vecs
+            else:
+                unk_idxs = (hit_flags == False).nonzero()[0]
+                matrix[hit_flags == False] = unk_method(vocab.to_tokens(unk_idxs))
+
+        return matrix
+
+def get_fasttext_model(model_name_or_dir='cc.en.300'):
+    """Load a fastText model from the binary file.
+
+    This method will load a fastText model binary file from a given file path or remote sources,
+    and return a `fasttext` model object. See `fasttext.cc` for more usage information.
+
+    Available sources:
+    ['wiki-news-300d-1M-subword', 'crawl-300d-2M-subword', \
+    'cc.af.300', ..., 'cc.en.300', ..., 'wiki.aa', ..., 'wiki.en', ..., 'wiki.zu']
+    Detailed sources can be found via `gluonnlp.embedding.list_sources('FastText.bin')`
+
+    Parameters
+    ----------
+    model_name_or_dir : str, default 'cc.en.300'
+        A file path for a FastText binary file or the name of the FastText model.
+        This method will first check whether it is a file path.
+        If not, the method will load from cache or download.
+
+    Returns
+    -------
+    fasttext.FastText._FastText:
+        A FastText model based on the `fasttext` package.
+    """
+    if os.path.exists(model_name_or_dir):
+        file_path = model_name_or_dir
+    else:
+        source = model_name_or_dir
+        root_path = os.path.expanduser(os.path.join(get_home_dir(), 'embedding'))
+        embedding_dir = os.path.join(root_path, 'fasttext')
+        if source not in C.FAST_TEXT_BIN_SHA1:
+            raise ValueError('Cannot recognize {} for the bin file'.format(source))
+        file_name, file_hash = C.FAST_TEXT_BIN_SHA1[source]
+        file_path = _get_file_path('fasttext', file_name, file_hash)
+    return fasttext.load_model(file_path)
+
diff --git a/src/gluonnlp/op.py b/src/gluonnlp/op.py
index ba5bf7607a..a4762b4ad3 100644
--- a/src/gluonnlp/op.py
+++ b/src/gluonnlp/op.py
@@ -290,3 +290,22 @@ def relative_position_bucket(F, relative_position,
     val_if_large = F.np.minimum(val_if_large, num_buckets - 1)
     ret = ret + F.np.where(is_small, relative_position, val_if_large)
     return ret
+
+
+def l2_normalize(F, data, axis=-1, eps=1e-6):
+    """Normalize the data by L2 normalization.
+ + Parameters + ---------- + F : mx.sym or mx.nd + data : symbol or ndarray + axis : int, default -1 + eps : float, default 1e-6 + + Returns + ------- + ret : mx.sym or mx.nd + """ + ret = data / (F.np.linalg.norm(data, axis=axis, keepdims=True) + eps) + return ret + diff --git a/tests/test_embedding.py b/tests/test_embedding.py new file mode 100644 index 0000000000..b9be912339 --- /dev/null +++ b/tests/test_embedding.py @@ -0,0 +1,50 @@ +import numpy as np +import collections +import os +import tempfile +import pytest +from gluonnlp.embedding import load_embeddings, get_fasttext_model +from gluonnlp.data import Vocab + +def test_load_embeddings(): + text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world', 'sadgood'] + counter = collections.Counter(text_data) + vocab1 = Vocab(counter) + # load with vocab + matrix1 = load_embeddings(vocab1) + assert len(matrix1) == len(vocab1) + # load without vocab + matrix2, vocab2 = load_embeddings() + assert len(matrix2) == len(vocab2) + np.testing.assert_almost_equal(matrix1[vocab1["hello"]], matrix2[vocab2["hello"]]) + + # test_unk_method + def simple(words): + return np.ones((len(words), 50)) + matrix3 = load_embeddings(vocab1, unk_method=simple) + assert sum(matrix3[vocab1['sadgood']] == 1) == matrix3.shape[-1] + np.testing.assert_almost_equal(matrix3[vocab1["hello"]], matrix2[vocab2["hello"]]) + + # load txt + with tempfile.TemporaryDirectory() as root: + path = os.path.join(root, "tmp.txt") + with open(path, "w") as f: + f.write("{} {}\n".format(matrix1.shape[0], matrix1.shape[1])) + for word, vec in zip(vocab1.all_tokens, matrix1): + f.write(word + " ") + f.write(" ".join([str(num) for num in vec.tolist()])) + f.write("\n") + matrix4 = load_embeddings(vocab1, path) + np.testing.assert_almost_equal(matrix4, matrix1) + + +def test_get_fasttext_model(): + text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world'] + counter = collections.Counter(text_data) + vocab1 = Vocab(counter) + matrix1 = load_embeddings(vocab1, 'wiki.en') + ft = get_fasttext_model('wiki.en') + np.testing.assert_almost_equal(matrix1[vocab1["hello"]], ft['hello'], decimal=4) + with pytest.raises(ValueError): + get_fasttext_model('wiki.multi.ar') + From 4d43f82f8f1a9dfa2f7550d20bcc152c13803798 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Mon, 27 Jul 2020 20:21:00 -0700 Subject: [PATCH 2/4] add subversion/wget to docker, add readme (#1279) --- tools/batch/docker/Dockerfile | 2 ++ tools/batch/docker/README.md | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tools/batch/docker/README.md diff --git a/tools/batch/docker/Dockerfile b/tools/batch/docker/Dockerfile index d2868239b3..a9ef4aaad4 100644 --- a/tools/batch/docker/Dockerfile +++ b/tools/batch/docker/Dockerfile @@ -4,6 +4,8 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 build-essential \ locales \ cmake \ + wget \ + subversion \ git \ curl \ vim \ diff --git a/tools/batch/docker/README.md b/tools/batch/docker/README.md new file mode 100644 index 0000000000..80efb0d9d1 --- /dev/null +++ b/tools/batch/docker/README.md @@ -0,0 +1,22 @@ +# Updating the Docker for AWS Batch. + +Our current batch job dockers are in 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1. To +update the docker: +- update the Dockerfile +- Make sure docker and docker-compose, as well as the docker python package are installed. 
+- Export the AWS account credentials as environment variables +- CD to the same folder as the Dockerfile and execute the following: + +``` +# this executes a command that logs into ECR. +$(aws ecr get-login --no-include-email --region us-east-1) + +# builds the Dockerfile as gluon-nlp-1 docker. +docker build -t gluon-nlp-1 . + +# tags the recent build as gluon-nlp-1:latest, which AWS batch pulls from. +docker tag gluon-nlp-1:latest 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest + +# pushes the change +docker push 747303060528.dkr.ecr.us-east-1.amazonaws.com/gluon-nlp-1:latest +``` From 3c874575bf40e8b1fa2280371131a8f29ebb3e98 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Tue, 28 Jul 2020 18:03:21 -0700 Subject: [PATCH 3/4] Add layout + compute_layout support: TransformerNMT, BERT, ALBERT, ELECTRA, MobileBERT, RoBERTA, XLMR (#1258) * Add layout support * fix test * Update transformer.py * Update transformer.py * Update README.md * try to add set_layout * update test case * fix * update * update * update * Update bert.py * fix bug * update * Update test_models_bert.py * Update tokenizers.py * add compute layout * Update xlmr.py * Update test_models_bert.py * revise test cases * Update layers.py * move jieba to try import * fix * Update transformer.py * fix * Update bert.py * Update setup.py * Update test_models_bert.py * Update test_models_bert.py * fix * update * Revise * Update electra.py * Update electra.py * Update test_models_electra.py * fix * fix bug * Update test_models_albert.py * add more testcases * fix * Update albert.py * Update albert.py * fix bug * fix testcase * Update test_models_electra.py * Update bert.py * update * Update test_models_electra.py * Update mobilebert.py * Update mobilebert.py * update mobilebert * Update test_models_mobilebert.py * Update mobilebert.py * fix bug * Update roberta.py * fix roberta * update * update * fix import * fix bug * update * reduce test workloads * address comment * address comment --- README.md | 9 +- scripts/conversion_toolkits/README.md | 3 +- setup.py | 2 + src/gluonnlp/attention_cell.py | 79 +++-- src/gluonnlp/data/tokenizers.py | 20 +- src/gluonnlp/layers.py | 5 +- src/gluonnlp/models/albert.py | 318 ++++++++++++++----- src/gluonnlp/models/bert.py | 364 ++++++++++++++++----- src/gluonnlp/models/electra.py | 424 +++++++++++++++++++------ src/gluonnlp/models/mobilebert.py | 434 ++++++++++++++++++-------- src/gluonnlp/models/roberta.py | 309 +++++++++++------- src/gluonnlp/models/transformer.py | 430 ++++++++++++++++++------- src/gluonnlp/models/transformer_xl.py | 9 +- src/gluonnlp/models/xlmr.py | 52 +-- src/gluonnlp/utils/testing.py | 152 ++++++--- tests/test_attention_cell.py | 51 ++- tests/test_models_albert.py | 68 +++- tests/test_models_bert.py | 78 ++++- tests/test_models_electra.py | 59 +++- tests/test_models_mobilebert.py | 78 ++++- tests/test_models_roberta.py | 54 ++++ tests/test_models_transformer.py | 43 ++- tests/test_models_xlmr.py | 4 +- 23 files changed, 2280 insertions(+), 765 deletions(-) diff --git a/README.md b/README.md index 34fc069cbc..65b877451a 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,17 @@ This is a work-in-progress. First of all, install the latest MXNet. 
You may use the following commands: ```bash +# Install the version with CUDA 10.0 +pip install -U --pre "mxnet-cu100>=2.0.0b20200716" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -pip install -U --pre mxnet-cu101>=2.0.0b20200716 -f https://dist.mxnet.io/python +pip install -U --pre "mxnet-cu101>=2.0.0b20200716" -f https://dist.mxnet.io/python + +# Install the version with CUDA 10.2 +pip install -U --pre "mxnet-cu102>=2.0.0b20200716" -f https://dist.mxnet.io/python # Install the cpu-only version -pip install -U --pre mxnet>=2.0.0b20200716 -f https://dist.mxnet.io/python +pip install -U --pre "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python ``` diff --git a/scripts/conversion_toolkits/README.md b/scripts/conversion_toolkits/README.md index be8bc8eff3..2c29e87db7 100644 --- a/scripts/conversion_toolkits/README.md +++ b/scripts/conversion_toolkits/README.md @@ -75,8 +75,7 @@ Notice: pleas set up the `--electra_path` with the cloned path or get this elect ```bash # Need to use TF 1.13.2 to use contrib layer -pip uninstall tensorflow -pip install tensorflow==1.13.2 +pip install tensorflow==1.13.2 --upgrade --force-reinstall # Actual conversion bash convert_electra.sh diff --git a/setup.py b/setup.py index 29cbc0c029..3de80f5695 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ def find_version(*file_paths): 'scripts', )), package_dir={"": "src"}, + package_data={'': [os.path.join('models', 'model_zoo_checksums', '*.txt'), + os.path.join('cli', 'data', 'url_checksums', '*.txt')]}, zip_safe=True, include_package_data=True, install_requires=requirements, diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py index c5288ae087..4773f81d46 100644 --- a/src/gluonnlp/attention_cell.py +++ b/src/gluonnlp/attention_cell.py @@ -33,7 +33,8 @@ def gen_self_attn_mask(F, data, valid_length=None, dtype: type = np.float32, - attn_type: str = 'full'): + attn_type: str = 'full', + layout: str = 'NT'): """Generate the mask used for the encoder, i.e, self-attention. In our implementation, 1 --> not masked, 0 --> masked @@ -100,25 +101,37 @@ def gen_self_attn_mask(F, data, Parameters ---------- - F : - data : - The data. Shape (batch_size, seq_length, C) - valid_length : + F + data + The data. 
+ - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) dtype Data type of the mask - attn_type : str + attn_type Can be 'full' or 'causal' + layout + The layout of the data Returns ------- mask Shape (batch_size, seq_length, seq_length) """ + if layout == 'NT': + batch_axis, time_axis = 0, 1 + elif layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Unsupported layout={}'.format(layout)) if attn_type == 'full': if valid_length is not None: valid_length = valid_length.astype(dtype) - steps = F.npx.arange_like(data, axis=1) # (seq_length,) + steps = F.npx.arange_like(data, axis=time_axis) # (seq_length,) mask1 = (F.npx.reshape(steps, (1, 1, -1)) < F.npx.reshape(valid_length, (-2, 1, 1))) mask2 = (F.npx.reshape(steps, (1, -1, 1)) @@ -126,12 +139,12 @@ def gen_self_attn_mask(F, data, mask = mask1 * mask2 else: # TODO(sxjscience) optimize - seq_len_ones = F.np.ones_like(F.npx.arange_like(data, axis=1)) # (seq_length,) - batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=0)) # (batch_size,) + seq_len_ones = F.np.ones_like(F.npx.arange_like(data, axis=time_axis)) # (seq_length,) + batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=batch_axis)) # (batch_size,) mask = batch_ones.reshape((-1, 1, 1)) * seq_len_ones.reshape((1, -1, 1))\ * seq_len_ones.reshape((1, 1, -1)) elif attn_type == 'causal': - steps = F.npx.arange_like(data, axis=1) + steps = F.npx.arange_like(data, axis=time_axis) # mask: (seq_length, seq_length) # batch_mask: (batch_size, seq_length) mask = (F.np.expand_dims(steps, axis=0) <= F.np.expand_dims(steps, axis=1)).astype(dtype) @@ -140,7 +153,8 @@ def gen_self_attn_mask(F, data, batch_mask = (F.np.expand_dims(steps, axis=0) < F.np.expand_dims(valid_length, axis=-1)).astype(dtype) mask = mask * F.np.expand_dims(batch_mask, axis=-1) else: - batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=0), dtype=np.float32) # (batch_size,) + batch_ones = F.np.ones_like(F.npx.arange_like(data, axis=batch_axis), + dtype=dtype) # (batch_size,) mask = mask * batch_ones.reshape((-1, 1, 1)) else: raise NotImplementedError @@ -148,7 +162,8 @@ def gen_self_attn_mask(F, data, return mask -def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, dtype=np.float32): +def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, + dtype=np.float32, layout: str = 'NT'): """Generate the mask used for the decoder. All query slots are attended to the memory slots. 
In our implementation, 1 --> not masked, 0 --> masked @@ -183,34 +198,48 @@ def gen_mem_attn_mask(F, mem, mem_valid_length, data, data_valid_length=None, dt Parameters ---------- F : - mem : - Shape (batch_size, mem_length, C_mem) + mem + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length : Shape (batch_size,) - data : - Shape (batch_size, query_length, C_data) + data + - layout = 'NT' + Shape (batch_size, query_length, C_data) + - layout = 'TN' + Shape (query_length, batch_size, C_data) data_valid_length : Shape (batch_size,) - dtype : type + dtype Data type of the mask + layout + Layout of the data + mem tensor Returns ------- mask : Shape (batch_size, query_length, mem_length) """ + if layout == 'NT': + batch_axis, time_axis = 0, 1 + elif layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Unsupported layout={}'.format(layout)) mem_valid_length = mem_valid_length.astype(dtype) - mem_steps = F.npx.arange_like(mem, axis=1) # (mem_length,) + mem_steps = F.npx.arange_like(mem, axis=time_axis) # (mem_length,) + data_steps = F.npx.arange_like(data, axis=time_axis) # (query_length,) mem_mask = (F.npx.reshape(mem_steps, (1, 1, -1)) < F.npx.reshape(mem_valid_length, (-2, 1, 1))).astype(dtype) # (B, 1, mem_length) if data_valid_length is not None: data_valid_length = data_valid_length.astype(dtype) - data_steps = F.npx.arange_like(data, axis=1) # (query_length,) data_mask = (F.npx.reshape(data_steps, (1, -1, 1)) < F.npx.reshape(data_valid_length, (-2, 1, 1))).astype(dtype) # (B, query_length, 1) mask = mem_mask * data_mask else: - query_length_ones = F.np.ones_like(F.npx.arange_like(data, axis=1)) # (query_length,) + query_length_ones = F.np.ones_like(data_steps) mask = query_length_ones.reshape((1, -1, 1)) * mem_mask return mask @@ -594,6 +623,7 @@ def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0, self._normalized = normalized self._eps = eps self._dtype = dtype + assert layout in ['NTK', 'NKT', 'TNK'] self._layout = layout self._use_einsum = use_einsum if self._query_units is not None: @@ -604,6 +634,10 @@ def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0, else: self._query_head_units = None + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, query, key, value, mask=None, edge_scores=None): return multi_head_dot_attn(F, query=query, key=key, value=value, mask=mask, edge_scores=edge_scores, @@ -764,6 +798,11 @@ def __init__(self, query_units, else: raise NotImplementedError('method="{}" is currently not supported!'.format(method)) + @property + def layout(self) -> str: + """Layout of the cell""" + return self._layout + def hybrid_forward(self, F, rel_positions, query=None): """ diff --git a/src/gluonnlp/data/tokenizers.py b/src/gluonnlp/data/tokenizers.py index a7aa40ee7b..d9579b2d55 100644 --- a/src/gluonnlp/data/tokenizers.py +++ b/src/gluonnlp/data/tokenizers.py @@ -26,21 +26,20 @@ import json from collections import OrderedDict import abc -import sys import warnings import itertools from typing import NewType import sacremoses -import jieba from uuid import uuid4 from .vocab import Vocab from ..registry import TOKENIZER_REGISTRY -from ..utils.lazy_imports import try_import_subword_nmt, \ - try_import_sentencepiece, \ - try_import_huggingface_tokenizers, \ - try_import_yttm, \ - try_import_spacy, \ - try_import_jieba +from ..utils.lazy_imports import try_import_subword_nmt,\ + 
try_import_sentencepiece,\ + try_import_huggingface_tokenizers,\ + try_import_yttm,\ + try_import_spacy,\ + try_import_jieba + SentencesType = NewType('SentencesType', Union[str, List[str]]) TokensType = NewType('TokensType', Union[List[str], List[List[str]]]) @@ -553,10 +552,10 @@ class JiebaTokenizer(BaseTokenizerWithVocab): """ - def __init__(self, ditionary=None, vocab: Optional[Vocab] = None): + def __init__(self, dictionary=None, vocab: Optional[Vocab] = None): self._vocab = vocab jieba = try_import_jieba() - self._tokenizer = jieba.Tokenizer(ditionary) + self._tokenizer = jieba.Tokenizer(dictionary) self._tokenizer.initialize(self._tokenizer.dictionary) def encode(self, sentences, output_type=str): @@ -626,6 +625,7 @@ def __getstate__(self): return d def __setstate__(self, state): + jieba = try_import_jieba() self._tokenizer = jieba.Tokenizer() for k, v in state.items(): setattr(self._tokenizer, k, v) diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index f19553fd5e..a6ea6b181e 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -356,9 +356,10 @@ def __init__(self, mode='erf'): def hybrid_forward(self, F, x): if self._mode == 'erf': - return x * 0.5 * (1.0 + F.npx.erf(x / math.sqrt(2.0))) + return F.npx.leaky_relu(x, act_type='gelu') elif self._mode == 'tanh': - return 0.5 * x * (1.0 + F.np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * (x ** 3)))) + return 0.5 * x\ + * (1.0 + F.np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * (x ** 3)))) elif self._mode == 'sigmoid': return x * F.npx.sigmoid(1.702 * x) else: diff --git a/src/gluonnlp/models/albert.py b/src/gluonnlp/models/albert.py index 1eb504c643..1b4efa16e2 100644 --- a/src/gluonnlp/models/albert.py +++ b/src/gluonnlp/models/albert.py @@ -25,7 +25,8 @@ """ __all__ = ['AlbertModel', 'AlbertForMLM', 'AlbertForPretrain', - 'list_pretrained_albert', 'get_pretrained_albert'] + 'list_pretrained_albert', 'get_pretrained_albert', + 'albert_cfg_reg'] import os from typing import Tuple @@ -38,16 +39,89 @@ from ..base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..initializer import TruncNorm from ..attention_cell import gen_self_attn_mask from ..layers import get_activation, PositionalEmbedding from ..op import select_vectors_by_position from ..data.tokenizers import SentencepieceTokenizer +albert_cfg_reg = Registry('albert_cfg') + + +@albert_cfg_reg.register() +def google_albert_base(): + cfg = CN() + # Model Parameters + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30000 + cfg.MODEL.embed_size = 128 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu(tanh)' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_groups = 1 + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.0 + cfg.MODEL.attention_dropout_prob = 0.0 + cfg.MODEL.dtype = 'float32' + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Hyper-parameters of the Initializers + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + # Version of the model. This helps ensure backward compatibility. 
+ # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_large(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 1024 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_xlarge(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 8192 + cfg.MODEL.num_heads = 32 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 2048 + cfg.freeze() + return cfg + + +@albert_cfg_reg.register() +def google_albert_xxlarge(): + cfg = google_albert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 16384 + cfg.MODEL.num_heads = 64 + cfg.MODEL.num_layers = 12 + cfg.MODEL.units = 4096 + cfg.freeze() + return cfg + PRETRAINED_URL = { 'google_albert_base_v2': { - 'cfg': 'google_albert_base_v2/model-8767fdc9.yml', + 'cfg': google_albert_base(), 'spm_model': 'google_albert_base_v2/spm-65999e5d.model', 'vocab': 'google_albert_base_v2/vocab-2ee53ae7.json', 'params': 'google_albert_base_v2/model-125be477.params', @@ -55,7 +129,7 @@ 'lowercase': True, }, 'google_albert_large_v2': { - 'cfg': 'google_albert_large_v2/model-e2e9b974.yml', + 'cfg': google_albert_large(), 'spm_model': 'google_albert_large_v2/spm-65999e5d.model', 'vocab': 'google_albert_large_v2/vocab-2ee53ae7.json', 'params': 'google_albert_large_v2/model-ad60bcd5.params', @@ -63,7 +137,7 @@ 'lowercase': True, }, 'google_albert_xlarge_v2': { - 'cfg': 'google_albert_xlarge_v2/model-8123bffd.yml', + 'cfg': google_albert_xlarge(), 'spm_model': 'google_albert_xlarge_v2/spm-65999e5d.model', 'vocab': 'google_albert_xlarge_v2/vocab-2ee53ae7.json', 'params': 'google_albert_xlarge_v2/model-4149c9e2.params', @@ -71,7 +145,7 @@ 'lowercase': True, }, 'google_albert_xxlarge_v2': { - 'cfg': 'google_albert_xxlarge_v2/model-07fbeebc.yml', + 'cfg': google_albert_xxlarge(), 'spm_model': 'google_albert_xxlarge_v2/spm-65999e5d.model', 'vocab': 'google_albert_xxlarge_v2/vocab-2ee53ae7.json', 'params': 'google_albert_xxlarge_v2/model-5601a0ed.params', @@ -97,7 +171,8 @@ def __init__(self, units=512, hidden_size=2048, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu'): + activation='gelu', + layout='NT'): super().__init__() assert units % num_heads == 0,\ 'In AlbertEncoder, The units should be divided exactly ' \ @@ -112,6 +187,8 @@ def __init__(self, units=512, hidden_size=2048, self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout + self.all_encoder_groups = nn.HybridSequential() for group_idx in range(num_groups): @@ -124,7 +201,13 @@ def __init__(self, units=512, hidden_size=2048, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + dtype=dtype, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -135,18 +218,26 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) valid_length : Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - 
layout = 'TN' + Shape (seq_length, batch_size, C) """ # 1. Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + time_axis = 1 if self.layout == 'NT' else 0 + attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, + attn_type='full', layout=self.layout) out = data all_encodings_outputs = [] additional_outputs = [] @@ -159,7 +250,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -168,7 +260,8 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -195,7 +288,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -210,6 +305,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct AlbertEncoder self.encoder = AlbertEncoder( units=units, @@ -226,6 +326,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout ) self.encoder.hybridize() # Construct word embedding @@ -257,6 +358,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -266,10 +371,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -279,8 +390,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units) + - layout = 'TN' + Shape (seq_length, batch_size, units) pooled_output : This is optional. 
Shape (batch_size, units) """ @@ -290,7 +404,13 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): if self.embed_size != self.units: prev_out = self.embed_factorized_proj(prev_out) outputs = [] - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap input to reflect the compute_layout + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) @@ -304,24 +424,37 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_embed) + - layout = 'TN' + Shape (seq_length, batch_size, C_embed) """ + if self.layout == 'NT': + batch_axis, time_axis = 0, 1 + else: + batch_axis, time_axis = 1, 0 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -334,50 +467,34 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a Bert model. 
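A small shape sketch (plain NumPy, illustrative only) of the layout handling above: 'NT' stores activations as (batch_size, seq_length, C) and 'TN' as (seq_length, batch_size, C), so a single swap of axes 0 and 1 converts between them, which is what hybrid_forward does when compute_layout differs from layout.

    import numpy as np

    batch_size, seq_length, units = 2, 4, 8
    x_nt = np.random.rand(batch_size, seq_length, units)   # layout = 'NT': (batch, time, C)
    x_tn = np.swapaxes(x_nt, 0, 1)                          # layout = 'TN': (time, batch, C)
    assert x_tn.shape == (seq_length, batch_size, units)
    # Swapping twice is the identity, so swapping before the encoder and swapping the
    # result back (the compute_layout branch above) returns outputs in the original layout.
    assert np.array_equal(np.swapaxes(x_tn, 0, 1), x_nt)
    # The pooled [CLS] vector is the first time step in either layout:
    assert np.array_equal(x_nt[:, 0, :], x_tn[0, :, :])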
Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + pooled_out Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self.layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] return self.pooler(outputs) @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Model Parameters - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30000 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 768 - cfg.MODEL.hidden_size = 3072 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 12 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_groups = 1 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.0 - cfg.MODEL.attention_dropout_prob = 0.0 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return albert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_albert_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'AlbertModel': """ Parameters @@ -385,6 +502,8 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': cfg use_pooler Whether to use pooler + dtype + The dtype of the backbone model Returns ------- @@ -396,6 +515,8 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -411,6 +532,7 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'AlbertModel': activation=cfg.MODEL.activation, layer_norm_eps=cfg.MODEL.layer_norm_eps, dtype=dtype, + layout=cfg.MODEL.layout, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, @@ -453,6 +575,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. @@ -460,10 +586,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) The type of the token. 
For example, if the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. valid_length : @@ -476,14 +608,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units) + - layout = 'TN' + Shape (seq_length, batch_size, units) pooled_out Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -528,6 +667,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Generate the representation given the inputs. @@ -537,10 +680,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types : - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -554,7 +703,10 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) sop_score : @@ -564,7 +716,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) sop_score = self.sop_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, sop_score, mlm_scores @@ -604,15 +760,22 @@ def get_pretrained_albert(model_name: str = 'google_albert_base_v2', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_albert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None spm_model_path = PRETRAINED_URL[model_name]['spm_model'] vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('spm_model', spm_model_path), ('vocab', vocab_path)]: - local_paths[k] = download(url=get_repo_model_zoo_url() + path, - path=os.path.join(root, path), - sha1_hash=FILE_STATS[path]) + download_jobs = [('spm_model', spm_model_path), ('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for key, path in download_jobs: + local_paths[key] = download(url=get_repo_model_zoo_url() + path, + path=os.path.join(root, path), + sha1_hash=FILE_STATS[path]) if load_backbone: local_params_path = download(url=get_repo_model_zoo_url() + params_path, path=os.path.join(root, params_path), @@ -630,7 +793,8 @@ def get_pretrained_albert(model_name: str = 'google_albert_base_v2', tokenizer = SentencepieceTokenizer(local_paths['spm_model'], vocab=local_paths['vocab'], lowercase=do_lower) - cfg = AlbertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = AlbertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/bert.py b/src/gluonnlp/models/bert.py index fd53ae3b5c..84a1d5ee2e 100644 --- a/src/gluonnlp/models/bert.py +++ b/src/gluonnlp/models/bert.py @@ -39,16 +39,108 @@ from ..base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..initializer import TruncNorm from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask from ..layers import get_activation, PositionalEmbedding, PositionwiseFFN, InitializerType from ..op import select_vectors_by_position from ..data.tokenizers import HuggingFaceWordPieceTokenizer +bert_cfg_reg = Registry('bert_cfg') + + +@bert_cfg_reg.register() +def google_en_uncased_bert_base(): + cfg = CN() + # Parameters for thr small model + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.1 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.dtype = 'float32' + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Hyper-parameters of the Initializers + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + # Version of the model. This helps ensure backward compatibility. 
+ # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_uncased_bert_large(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.units = 1024 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_cased_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 28996 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_en_cased_bert_large(): + cfg = google_en_uncased_bert_large() + cfg.defrost() + cfg.MODEL.vocab_size = 28996 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_zh_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 21128 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_multi_cased_bert_base(): + cfg = google_en_uncased_bert_base() + cfg.defrost() + cfg.MODEL.vocab_size = 119547 + cfg.freeze() + return cfg + + +@bert_cfg_reg.register() +def google_multi_cased_bert_large(): + cfg = google_en_uncased_bert_large() + cfg.defrost() + cfg.MODEL.vocab_size = 119547 + cfg.freeze() + return cfg + PRETRAINED_URL = { 'google_en_cased_bert_base': { - 'cfg': 'google_en_cased_bert_base/model-5620839a.yml', + 'cfg': google_en_cased_bert_base(), 'vocab': 'google_en_cased_bert_base/vocab-c1defaaa.json', 'params': 'google_en_cased_bert_base/model-c566c289.params', 'mlm_params': 'google_en_cased_bert_base/model_mlm-bde14bee.params', @@ -56,49 +148,49 @@ }, 'google_en_uncased_bert_base': { - 'cfg': 'google_en_uncased_bert_base/model-4d8422ad.yml', + 'cfg': google_en_uncased_bert_base(), 'vocab': 'google_en_uncased_bert_base/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_base/model-3712e50a.params', 'mlm_params': 'google_en_uncased_bert_base/model_mlm-04e88b58.params', 'lowercase': True, }, 'google_en_cased_bert_large': { - 'cfg': 'google_en_cased_bert_large/model-9e127fee.yml', + 'cfg': google_en_cased_bert_large(), 'vocab': 'google_en_cased_bert_large/vocab-c1defaaa.json', 'params': 'google_en_cased_bert_large/model-7aa93704.params', 'mlm_params': 'google_en_cased_bert_large/model_mlm-59ff3f6a.params', 'lowercase': False, }, 'google_en_uncased_bert_large': { - 'cfg': 'google_en_uncased_bert_large/model-d0c37dcc.yml', + 'cfg': google_en_uncased_bert_large(), 'vocab': 'google_en_uncased_bert_large/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_large/model-e53bbc57.params', 'mlm_params': 'google_en_uncased_bert_large/model_mlm-44bc70c0.params', 'lowercase': True, }, 'google_zh_bert_base': { - 'cfg': 'google_zh_bert_base/model-9b16bda6.yml', + 'cfg': google_zh_bert_base(), 'vocab': 'google_zh_bert_base/vocab-711c13e4.json', 'params': 'google_zh_bert_base/model-2efbff63.params', 'mlm_params': 'google_zh_bert_base/model_mlm-75339658.params', 'lowercase': False, }, 'google_multi_cased_bert_base': { - 'cfg': 'google_multi_cased_bert_base/model-881ad607.yml', + 'cfg': google_multi_cased_bert_base(), 'vocab': 'google_multi_cased_bert_base/vocab-016e1169.json', 'params': 'google_multi_cased_bert_base/model-c2110078.params', 'mlm_params': 'google_multi_cased_bert_base/model_mlm-4611e7a3.params', 'lowercase': False, }, 'google_en_cased_bert_wwm_large': { - 'cfg': 'google_en_cased_bert_wwm_large/model-9e127fee.yml', + 'cfg': google_en_cased_bert_large(), 'vocab': 'google_en_cased_bert_wwm_large/vocab-c1defaaa.json', 
'params': 'google_en_cased_bert_wwm_large/model-0fe841cf.params', 'mlm_params': None, 'lowercase': False, }, 'google_en_uncased_bert_wwm_large': { - 'cfg': 'google_en_uncased_bert_wwm_large/model-d0c37dcc.yml', + 'cfg': google_en_uncased_bert_large(), 'vocab': 'google_en_uncased_bert_wwm_large/vocab-e6d2b21d.json', 'params': 'google_en_uncased_bert_wwm_large/model-cb3ad3c2.params', 'mlm_params': None, @@ -124,7 +216,8 @@ def __init__(self, units: int = 512, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - activation='gelu'): + activation='gelu', + layout='NT'): super().__init__() assert units % num_heads == 0,\ 'In BertTransformer, The units should be divided exactly ' \ @@ -135,6 +228,7 @@ def __init__(self, units: int = 512, self._num_layers = num_layers self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout self.all_layers = nn.HybridSequential() for layer_idx in range(num_layers): @@ -147,7 +241,13 @@ def __init__(self, units: int = 512, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + layout=layout, + dtype=dtype)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -158,30 +258,41 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, + attn_type='full', layout=self.layout) out = data all_encodings_outputs = [] additional_outputs = [] for layer_idx in range(self._num_layers): layer = self.all_layers[layer_idx] out, attention_weights = layer(out, attn_mask) - # out : [batch_size, seq_len, units] + # out : [batch_size, seq_len, units] or [seq_len, batch_size, units] # attention_weights : [batch_size, num_heads, seq_len, seq_len] if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -190,7 +301,7 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -215,7 +326,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -229,6 +342,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct BertTransformer self.encoder = BertTransformer( units=units, @@ -244,6 +362,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout ) self.encoder.hybridize() # Construct word embedding @@ -270,6 +389,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -279,10 +402,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (batch_size, seq_length) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -292,16 +421,24 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_output : This is optional. 
Shape (batch_size, units) """ initial_embedding = self.get_initial_embedding(F, inputs, token_types) prev_out = initial_embedding outputs = [] - - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap the axes if the compute_layout and layout mismatch + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) @@ -315,24 +452,38 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_emb) + - layout = 'TN' + Shape (seq_length, batch_size, C_emb) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -345,53 +496,52 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) return: Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self.layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] return self.pooler(outputs) @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Parameters for thr small model - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30000 - cfg.MODEL.units = 256 - cfg.MODEL.hidden_size = 1024 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.1 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. 
- # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return bert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_en_uncased_bert_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'BertModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'BertModel': + """ + + Parameters + ---------- + cfg + Configuration + use_pooler + Whether to output the pooled feature + dtype + data type of the model + + Returns + ------- + ret + The constructed BertModel + """ cfg = BertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -408,7 +558,9 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'BertModel': embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler) + use_pooler=use_pooler, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -447,6 +599,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. @@ -454,10 +610,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -471,14 +633,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). - pooled_out + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units) + cfg.MODEL.compute_layout = 'auto' Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -523,6 +692,10 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() + @property + def layout(self): + return self.backbone_model.layout + def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Generate the representation given the inputs. 
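As a hedged usage sketch (not part of the patch): with the registry in place, get_cfg(key) resolves a registered configuration by name (the old code raised NotImplementedError for any key), and from_cfg picks up dtype, layout, and compute_layout from the config when they are not given explicitly.

    from gluonnlp.models.bert import BertModel

    cfg = BertModel.get_cfg('google_en_uncased_bert_base')  # resolved via bert_cfg_reg.create
    cfg = cfg.clone()
    cfg.defrost()
    cfg.MODEL.layout = 'TN'            # run the encoder time-major
    cfg.MODEL.compute_layout = 'auto'  # 'auto' falls back to MODEL.layout
    cfg.freeze()
    model = BertModel.from_cfg(cfg)    # dtype=None defaults to cfg.MODEL.dtype
    model.initialize()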
@@ -532,24 +705,33 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) nsp_score : @@ -559,7 +741,11 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) nsp_score = self.nsp_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, nsp_score, mlm_scores @@ -599,14 +785,21 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_bert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: - local_paths[k] = download(url=get_repo_model_zoo_url() + path, - path=os.path.join(root, path), - sha1_hash=FILE_STATS[path]) + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for key, path in download_jobs: + local_paths[key] = download(url=get_repo_model_zoo_url() + path, + path=os.path.join(root, path), + sha1_hash=FILE_STATS[path]) if load_backbone: local_params_path = download(url=get_repo_model_zoo_url() + params_path, path=os.path.join(root, params_path), @@ -629,7 +822,8 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = BertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = BertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/electra.py b/src/gluonnlp/models/electra.py index a56d7879dc..b8d4e44029 100644 --- a/src/gluonnlp/models/electra.py +++ b/src/gluonnlp/models/electra.py @@ -43,9 +43,12 @@ from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..attention_cell import gen_self_attn_mask from ..data.tokenizers import HuggingFaceWordPieceTokenizer +electra_cfg_reg = Registry('electra_cfg') + def get_generator_cfg(model_config): """ @@ -66,9 +69,73 @@ def get_generator_cfg(model_config): return generator_cfg +@electra_cfg_reg.register() +def google_electra_small(): + cfg = CN() + # Model + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.embed_size = 128 + cfg.MODEL.units = 256 + cfg.MODEL.hidden_size = 1024 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 4 + cfg.MODEL.num_layers = 12 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'gelu' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.num_token_types = 2 + # Dropout regularization + cfg.MODEL.hidden_dropout_prob = 0.1 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.dtype = 'float32' + # Layout flags + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Generator hyper-parameters + cfg.MODEL.generator_layers_scale = 1.0 + cfg.MODEL.generator_units_scale = 1.0 + # Initializer + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + cfg.VERSION = 1 + cfg.freeze() + return cfg + + +@electra_cfg_reg.register() +def google_electra_base(): + cfg = google_electra_small() + cfg.defrost() + cfg.MODEL.embed_size = 768 + cfg.MODEL.units = 768 + cfg.MODEL.hidden_size = 3072 + cfg.MODEL.num_heads = 12 + cfg.MODEL.num_layers = 12 + cfg.MODEL.generator_units_scale = 0.33333 + cfg.freeze() + return cfg + + +@electra_cfg_reg.register() +def google_electra_large(): + cfg = google_electra_small() + cfg.defrost() + cfg.MODEL.embed_size = 1024 + cfg.MODEL.units = 1024 + cfg.MODEL.hidden_size = 4096 + cfg.MODEL.num_heads = 16 + cfg.MODEL.num_layers = 24 + cfg.MODEL.generator_units_scale = 
0.25 + cfg.freeze() + return cfg + + PRETRAINED_URL = { 'google_electra_small': { - 'cfg': 'google_electra_small/model-9ffb21c8.yml', + 'cfg': google_electra_small(), 'vocab': 'google_electra_small/vocab-e6d2b21d.json', 'params': 'google_electra_small/model-2654c8b4.params', 'disc_model': 'google_electra_small/disc_model-137714b6.params', @@ -76,7 +143,7 @@ def get_generator_cfg(model_config): 'lowercase': True, }, 'google_electra_base': { - 'cfg': 'google_electra_base/model-5b35ca0b.yml', + 'cfg': google_electra_base(), 'vocab': 'google_electra_base/vocab-e6d2b21d.json', 'params': 'google_electra_base/model-31c235cc.params', 'disc_model': 'google_electra_base/disc_model-514bd353.params', @@ -84,7 +151,7 @@ def get_generator_cfg(model_config): 'lowercase': True, }, 'google_electra_large': { - 'cfg': 'google_electra_large/model-31b7dfdd.yml', + 'cfg': google_electra_large(), 'vocab': 'google_electra_large/vocab-e6d2b21d.json', 'params': 'google_electra_large/model-9baf9ff5.params', 'disc_model': 'google_electra_large/disc_model-5b820c02.params', @@ -96,6 +163,7 @@ def get_generator_cfg(model_config): FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'electra.txt')) +# TODO(sxjscience) Use BertTransformer @use_np class ElectraEncoder(HybridBlock): def __init__(self, units=512, @@ -110,7 +178,35 @@ def __init__(self, units=512, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu'): + activation='gelu', + layout='NT'): + """ + + Parameters + ---------- + units + The number of units + hidden_size + The hidden size + num_layers + Number of layers + num_heads + Number of heads + attention_dropout_prob + Dropout probability of the attention layer + hidden_dropout_prob + Dropout probability + output_attention + Whether to output the attention weights + dtype + Data type of the weights + output_all_encodings + layer_norm_eps + weight_initializer + bias_initializer + activation + layout + """ super().__init__() assert units % num_heads == 0, \ 'In ElectraEncoder, The units should be divisible ' \ @@ -118,6 +214,7 @@ def __init__(self, units=512, .format(units, num_heads) self._dtype = dtype + self._layout = layout self._num_layers = num_layers self._output_attention = output_attention @@ -134,7 +231,13 @@ def __init__(self, units=512, layer_norm_eps=layer_norm_eps, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - activation=activation)) + activation=activation, + dtype=dtype, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -145,18 +248,31 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, + dtype=self._dtype, + layout=self._layout, + attn_type='full') out = data all_encodings_outputs = [] additional_outputs = [] @@ -168,7 +284,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -177,7 +294,7 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -208,7 +325,9 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True): + use_pooler=True, + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -223,6 +342,11 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout is None or compute_layout == 'auto': + self._compute_layout = layout + else: + self._compute_layout = compute_layout # Construct ElectraEncoder self.encoder = ElectraEncoder( units=units, @@ -238,6 +362,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout, ) self.encoder.hybridize() @@ -262,6 +387,10 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ """Generate the representation given the inputs. @@ -271,22 +400,31 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) Returns ------- - contextual_embedding : - Shape (batch_size, seq_length, units). - pooled_output : + contextual_embedding + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). + pooled_output This is optional. 
Shape (batch_size, units) """ initial_embedding = self.get_initial_embedding(F, inputs, token_types) @@ -295,17 +433,27 @@ def hybrid_forward(self, F, inputs, token_types, valid_length=None): if self.embed_size != self.units: prev_out = self.embed_factorized_proj(prev_out) outputs = [] - contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) + if self._compute_layout != self._layout: + # Swap the axes if the compute_layout and layout mismatch + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(prev_out, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(prev_out, valid_length) outputs.append(contextual_embeddings) if self.use_pooler: # Here we just get the first token ([CLS]) without any pooling strategy, - # which is slightly different between bert model with the pooled_out + # which is slightly different from bert model with the pooled_out # the attribute name is keeping the same as bert and albert model with defualt # use_pooler=True - pooled_out = contextual_embeddings[:, 0, :] + if self._layout == 'NT': + pooled_out = contextual_embeddings[:, 0, :] + else: + pooled_out = contextual_embeddings[0, :, :] outputs.append(pooled_out) return tuple(outputs) if len(outputs) > 1 else outputs[0] + #TODO(sxjscience) Move to a `common.py` def get_initial_embedding(self, F, inputs, token_types=None): """Get the initial token embeddings that considers the token type and positional embeddings @@ -313,24 +461,38 @@ def get_initial_embedding(self, F, inputs, token_types=None): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C_embed) + - layout = 'TN' + Shape (seq_length, batch_size, C_embed) """ + if self.layout == 'NT': + time_axis, batch_axis = 1, 0 + else: + time_axis, batch_axis = 0, 1 embedding = self.word_embed(inputs) if token_types is None: token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) embedding = embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.token_pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -339,48 +501,20 @@ def get_initial_embedding(self, F, inputs, token_types=None): @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - # Model Parameters for the electra small - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30522 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 256 - cfg.MODEL.hidden_size = 1024 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.pos_embed_type = 'learned' - # Unlike BERT and ALBERT, which ues gelu(tanh), the gelu(erf) is used in Electra. 
- cfg.MODEL.activation = 'gelu' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.1 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - cfg.MODEL.generator_layers_scale = 1.0 - # multiplier for units, hidden_size, and num_heads - cfg.MODEL.generator_units_scale = 1.0 - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 - cfg.freeze() + if key is not None: + return electra_cfg_reg.create(key) else: - raise NotImplementedError - return cfg + return google_electra_base() @classmethod - def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'ElectraModel': + def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'ElectraModel': cfg = ElectraModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -398,7 +532,9 @@ def from_cfg(cls, cfg, use_pooler=True, dtype='float32') -> 'ElectraModel': embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler) + use_pooler=use_pooler, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -447,25 +583,37 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). 
pooled_out Shape (batch_size, units) rtd_scores - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) rtd_scores = self.rtd_encoder(contextual_embeddings).squeeze(-1) @@ -515,8 +663,21 @@ def __init__(self, backbone_cfg, self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight self.mlm_decoder.hybridize() - def tie_embeddings(self, word_embed_params=None, token_type_embed_params=None, - token_pos_embed_params=None, embed_layer_norm_params=None): + # TODO(sxjscience,zheyu) Should design a better API + def tie_embeddings(self, word_embed_params=None, + token_type_embed_params=None, + token_pos_embed_params=None, + embed_layer_norm_params=None): + """Tie the embedding layers between the backbone and the MLM decoder + + Parameters + ---------- + word_embed_params + token_type_embed_params + token_pos_embed_params + embed_layer_norm_params + + """ self.backbone_model.word_embed.share_parameters(word_embed_params) self.mlm_decoder[-1].share_parameters(word_embed_params) self.backbone_model.token_type_embed.share_parameters(token_type_embed_params) @@ -529,10 +690,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. @@ -546,14 +713,21 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) mlm_scores : Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return contextual_embeddings, pooled_out, mlm_scores @@ -561,7 +735,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions) @use_np class ElectraForPretrain(HybridBlock): """ - A integrated model combined with a generator and a discriminator. Generator here + An integrated model combined with a generator and a discriminator. Generator here produces a corrupted tokens playing as fake data to fool a discriminator whose objective is to distinguish whether each token in the input sentence it accepts is the same as the original. 
It is a classification task instead of prediction @@ -612,11 +786,15 @@ def __init__(self, self.disc_cfg = disc_cfg self.vocab_size = disc_cfg.MODEL.vocab_size self.gen_cfg = get_generator_cfg(disc_cfg) - self.discriminator = ElectraDiscriminator(disc_cfg) + self.discriminator = ElectraDiscriminator(disc_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) self.disc_backbone = self.discriminator.backbone_model if not uniform_generator and not tied_generator: - self.generator = ElectraGenerator(self.gen_cfg) + self.generator = ElectraGenerator(self.gen_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) if tied_embeddings: self.generator.tie_embeddings(self.disc_backbone.word_embed.collect_params(), self.disc_backbone.token_type_embed.collect_params(), @@ -626,7 +804,10 @@ def __init__(self, elif tied_generator: # Reuse the weight of the discriminator backbone model - self.generator = ElectraGenerator(self.gen_cfg) + self.generator = ElectraGenerator(self.gen_cfg, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + # TODO(sxjscience, zheyu) Verify self.generator.backbone_model = self.disc_backbone self.generator.hybridize() elif uniform_generator: @@ -650,18 +831,24 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : + inputs The masked input - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - unmasked_tokens : + unmasked_tokens The original tokens that appear in the unmasked input sequence Shape (batch_size, num_masked_positions). masked_positions : @@ -670,20 +857,26 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Returns ------- - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) - rtd_scores : - Shape (batch_size, seq_length) + rtd_scores + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) replaced_inputs : Shape (batch_size, num_masked_positions) - labels : - Shape (batch_size, seq_length) + labels + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ if self._uniform_generator: # generate the corrupt tokens randomly with a mlm_scores vector whose value is all 0 - zero_logits = F.np.zeros(self.vocab_size) - zero_logits = F.np.expand_dims(F.np.expand_dims(zero_logits, axis=0), axis=0) - mlm_scores = F.np.expand_dims(F.np.zeros_like(masked_positions), axis=-1) + zero_logits = F.np.zeros((1, 1, self.vocab_size), dtype=self._dtype) + mlm_scores = F.np.expand_dims(F.np.zeros_like(masked_positions, dtype=self._dtype), + axis=-1) mlm_scores = mlm_scores + zero_logits else: _, _, mlm_scores = self.generator(inputs, token_types, valid_length, masked_positions) @@ -698,12 +891,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, logits): """ Sample from the generator to create corrupted input. 
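For reference, a plain-NumPy shape sketch (illustrative only) of the uniform-generator branch in hybrid_forward above: all-zero logits of shape (1, 1, vocab_size) are broadcast against zeros of shape (batch_size, num_masked_positions, 1), so the resulting mlm_scores describe a uniform distribution over the vocabulary at every masked position.

    import numpy as np

    batch_size, num_masked_positions, vocab_size = 2, 3, 5
    masked_positions = np.zeros((batch_size, num_masked_positions), dtype=np.float32)
    zero_logits = np.zeros((1, 1, vocab_size), dtype=np.float32)
    mlm_scores = np.expand_dims(np.zeros_like(masked_positions), axis=-1) + zero_logits
    assert mlm_scores.shape == (batch_size, num_masked_positions, vocab_size)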
+ Parameters ---------- F inputs The masked input - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) unmasked_tokens The original tokens that appear in the unmasked input sequence Shape (batch_size, num_masked_positions). @@ -715,10 +912,18 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log Returns ------- + corrupted_tokens + The corrupted tokens fake_data - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) labels - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) """ if self._disallow_correct: @@ -734,6 +939,8 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log use_np_gumbel=False) corrupted_tokens = F.np.argmax(prob, axis=-1).astype(np.int32) + if self.disc_backbone.layout == 'TN': + inputs = inputs.T # Following the Official electra to deal with duplicate positions as # https://github.com/google-research/electra/issues/41 original_data, updates_mask = updated_vectors_by_position(F, @@ -742,7 +949,10 @@ def get_corrupted_tokens(self, F, inputs, unmasked_tokens, masked_positions, log inputs, corrupted_tokens, masked_positions) labels = updates_mask * F.np.not_equal(fake_data, original_data) - return corrupted_tokens, fake_data, labels + if self.disc_backbone.layout == 'TN': + return corrupted_tokens, fake_data.T, labels.T + else: + return corrupted_tokens, fake_data, labels def list_pretrained_electra(): @@ -787,13 +997,20 @@ def get_pretrained_electra(model_name: str = 'google_electra_small', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_electra()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] disc_params_path = PRETRAINED_URL[model_name]['disc_model'] gen_params_path = PRETRAINED_URL[model_name]['gen_model'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -827,7 +1044,8 @@ def get_pretrained_electra(model_name: str = 'google_electra_small', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = ElectraModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = ElectraModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, (local_disc_params_path, local_gen_params_path) diff --git a/src/gluonnlp/models/mobilebert.py b/src/gluonnlp/models/mobilebert.py index 502d7f4750..5a81de7c64 100644 --- a/src/gluonnlp/models/mobilebert.py +++ b/src/gluonnlp/models/mobilebert.py @@ -41,6 +41,7 @@ from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..utils.misc import load_checksum_stats, download +from ..utils.registry import Registry from ..registry import BACKBONE_REGISTRY from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask from ..data.tokenizers import HuggingFaceWordPieceTokenizer @@ -48,9 +49,51 @@ __all__ = ['MobileBertModel', 'MobileBertForMLM', 'MobileBertForPretrain', 'list_pretrained_mobilebert', 'get_pretrained_mobilebert'] +mobilebert_cfg_reg = Registry('mobilebert_cfg') + + +@mobilebert_cfg_reg.register() +def google_uncased_mobilebert(): + cfg = CN() + cfg.MODEL = CN() + cfg.MODEL.vocab_size = 30522 + cfg.MODEL.units = 512 + cfg.MODEL.embed_size = 128 + cfg.MODEL.inner_size = 128 + cfg.MODEL.hidden_size = 512 + cfg.MODEL.max_length = 512 + cfg.MODEL.num_heads = 4 + cfg.MODEL.num_layers = 24 + + cfg.MODEL.use_bottleneck = True # Whether to use bottleneck + cfg.MODEL.trigram_embed = True # Trigram embedding + cfg.MODEL.classifier_activation = False # Whether to use an additional pooling layer + cfg.MODEL.bottleneck_strategy = 'qk_sharing' + cfg.MODEL.num_stacked_ffn = 4 + cfg.MODEL.pos_embed_type = 'learned' + cfg.MODEL.activation = 'relu' + cfg.MODEL.num_token_types = 2 + cfg.MODEL.hidden_dropout_prob = 0.0 + cfg.MODEL.attention_dropout_prob = 0.1 + cfg.MODEL.normalization = 'no_norm' + cfg.MODEL.layer_norm_eps = 1E-12 + cfg.MODEL.dtype = 'float32' + # Layout flags + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Initializer + cfg.INITIALIZER = CN() + cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] + cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) + cfg.INITIALIZER.bias = ['zeros'] + cfg.VERSION = 1 + cfg.freeze() + return cfg + + PRETRAINED_URL = { 'google_uncased_mobilebert': { - 'cfg': 'google_uncased_mobilebert/model-1c33216b.yml', + 'cfg': google_uncased_mobilebert(), 'vocab': 'google_uncased_mobilebert/vocab-e6d2b21d.json', 'params': 'google_uncased_mobilebert/model-c8346cf2.params', 'mlm_params': 'google_uncased_mobilebert/model_mlm-53948e82.params', @@ -66,7 +109,7 @@ class MobileBertEncoderLayer(HybridBlock): """The Transformer Encoder Layer in Mobile Bert""" # 
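(Illustrative aside, not part of the patch.) The same pattern recurs in get_pretrained_electra above and in get_pretrained_mobilebert and get_pretrained_roberta below: the 'cfg' entry of PRETRAINED_URL may now hold an in-memory CfgNode produced by a registered config function instead of a remote YAML path, and a download job is scheduled only when it is still a path. A hedged sketch of that dispatch; resolve_cfg, download_fn and clone_merge_fn are hypothetical stand-ins, not names from the library:

def resolve_cfg(cfg_entry, download_fn, clone_merge_fn):
    """Return a config. Download and merge a YAML file only when `cfg_entry`
    is still a path string; otherwise it is already a CfgNode and used as-is."""
    if not isinstance(cfg_entry, str):
        return cfg_entry                    # in-memory CfgNode from the registry
    local_path = download_fn(cfg_entry)     # e.g. fetch '<model>/model-<hash>.yml'
    return clone_merge_fn(local_path)       # e.g. Model.get_cfg().clone_merge(path)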
TODO(zheyuye), use stacked groups for single ffn layer in transformer.TransformerEncoderLayer - # and revise the other models and scripts, masking sure their are compatible. + # and revise the other models and scripts, making sure they are compatible. def __init__(self, use_bottleneck: bool = True, @@ -85,12 +128,14 @@ def __init__(self, use_qkv_bias: bool = True, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters ---------- use_bottleneck + Whether to use the bottleneck layer. units size of inter-bottleneck real_units @@ -110,6 +155,9 @@ def __init__(self, weight_initializer bias_initializer dtype + Data type of the block + layout + Layout of the input + output """ super().__init__() self._use_bottleneck = use_bottleneck @@ -119,6 +167,7 @@ def __init__(self, self._num_stacked_ffn = num_stacked_ffn self._bottleneck_strategy = bottleneck_strategy self._dtype = dtype + self._layout = layout assert real_units % num_heads == 0, 'units must be divisive by the number of heads' self.dropout_layer = nn.Dropout(hidden_dropout_prob) if use_bottleneck: @@ -159,24 +208,47 @@ def __init__(self, bias_initializer=bias_initializer, dtype=self._dtype) # The in_units of qkv varies according to the sharing strategy + if self._use_bottleneck: + if self._bottleneck_strategy == 'qk_sharing': + attn_query_in_units = real_units + attn_key_in_units = real_units + attn_value_in_units = units + elif self._bottleneck_strategy == 'from_bottleneck': + attn_query_in_units = real_units + attn_key_in_units = real_units + attn_value_in_units = real_units + elif self._bottleneck_strategy == 'from_input': + attn_query_in_units = units + attn_key_in_units = units + attn_value_in_units = units + else: + raise NotImplementedError + else: + attn_query_in_units = units + attn_key_in_units = units + attn_value_in_units = units self.attn_query = nn.Dense(units=real_units, + in_units=attn_query_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) self.attn_key = nn.Dense(units=real_units, + in_units=attn_key_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) self.attn_value = nn.Dense(units=real_units, + in_units=attn_value_in_units, flatten=False, use_bias=use_qkv_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) + attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' self.attention_cell = \ MultiHeadAttentionCell( query_units=real_units, @@ -184,7 +256,7 @@ def __init__(self, attention_dropout=attention_dropout_prob, scaled=True, dtype=self._dtype, - layout='NTK' + layout=attention_layout ) self.layer_norm = get_layer_norm(normalization=normalization, in_channels=real_units, @@ -209,26 +281,35 @@ def __init__(self, layer_norm_eps=layer_norm_eps, dtype=self._dtype)) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, data, attn_mask): """ Parameters ---------- F - data : - Shape (batch_size, seq_length, C_in) - attn_mask : + data + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) + attn_mask + The attention mask Shape (batch_size, seq_length, seq_length) Returns ------- - out : - Shape (batch_size, seq_length, C_out) - attn_weight : + out + - layout = 'NT' + Shape (batch_size, 
seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) + attn_weight Shape (batch_size, seq_length, seq_length) """ - # TODO(sxjscience) Cannot use negative axis due to - # https://github.com/apache/incubator-mxnet/issues/18132 if self._use_bottleneck: bn_proj = self.in_bottleneck_proj(data) bn_proj = self.in_bottleneck_ln(bn_proj) @@ -241,7 +322,7 @@ def hybrid_forward(self, F, data, attn_mask): key = qk_shared value = data elif self._bottleneck_strategy == 'from_bottleneck': - # for Mobile mobile bert Tiny + # for Mobile Bert Tiny query = bn_proj key = bn_proj value = bn_proj @@ -298,12 +379,14 @@ def __init__(self, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - dtype='float32'): + dtype='float32', + layout='NT'): super().__init__() self._dtype = dtype self._num_layers = num_layers self._output_attention = output_attention self._output_all_encodings = output_all_encodings + self._layout = layout assert bottleneck_strategy in ['qk_sharing', 'from_bottleneck', 'from_input'], \ 'The bottleneck strategy={} is not supported.'.format(bottleneck_strategy) @@ -329,7 +412,12 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, normalization=normalization, - activation=activation)) + activation=activation, + layout=layout)) + + @property + def layout(self): + return self._layout def hybrid_forward(self, F, data, valid_length): """ @@ -340,18 +428,34 @@ def hybrid_forward(self, F, data, valid_length): Parameters ---------- F - data : - Shape (batch_size, seq_length, C) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) + valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + elif self._layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError('Received layout="{}". ' + 'Only "NT" and "TN" are supported.'.format(self._layout)) # 1. 
Embed the data - attn_mask = gen_self_attn_mask(F, data, valid_length, dtype=self._dtype, attn_type='full') + attn_mask = gen_self_attn_mask(F, data, valid_length, + dtype=self._dtype, + layout=self._layout, + attn_type='full') out = data all_encodings_outputs = [] additional_outputs = [] @@ -364,7 +468,8 @@ def hybrid_forward(self, F, data, valid_length): if self._output_all_encodings: out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) all_encodings_outputs.append(out) if self._output_attention: @@ -373,7 +478,8 @@ def hybrid_forward(self, F, data, valid_length): if not self._output_all_encodings: # if self._output_all_encodings, SequenceMask is already applied above out = F.npx.sequence_mask(out, sequence_length=valid_length, - use_sequence_length=True, axis=1) + use_sequence_length=True, + axis=time_axis) return out, additional_outputs else: return all_encodings_outputs, additional_outputs @@ -406,7 +512,9 @@ def __init__(self, trigram_embed=True, use_pooler=True, classifier_activation=False, - dtype='float32'): + dtype='float32', + layout='NT', + compute_layout='auto'): super().__init__() self._dtype = dtype self.use_bottleneck = use_bottleneck @@ -428,6 +536,12 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps + self._layout = layout + if compute_layout == 'auto' or compute_layout is None: + self._compute_layout = layout + else: + assert compute_layout in ['TN', 'NT'] + self._compute_layout = compute_layout # Construct MobileBertTransformer self.encoder = MobileBertTransformer( units=units, @@ -447,6 +561,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, + layout=self._compute_layout, ) self.encoder.hybridize() # Construct word embedding @@ -455,7 +570,12 @@ def __init__(self, weight_initializer=embed_initializer, dtype=dtype) if trigram_embed or embed_size != units: + if trigram_embed: + in_units = 3 * embed_size + else: + in_units = embed_size self.embed_factorized_proj = nn.Dense(units=units, + in_units=in_units, flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer) @@ -467,7 +587,8 @@ def __init__(self, # Construct token type embedding self.token_type_embed = nn.Embedding(input_dim=num_token_types, output_dim=units, - weight_initializer=weight_initializer) + weight_initializer=weight_initializer, + dtype=self._dtype) self.token_pos_embed = PositionalEmbedding(units=units, max_length=max_length, dtype=self._dtype, @@ -478,9 +599,18 @@ def __init__(self, in_units=units, flatten=False, activation='tanh', + dtype=self._dtype, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + + @property + def dtype(self): + return self._dtype + def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ """Generate the representation given the inputs. 
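(Illustrative aside, not part of the patch.) hybrid_forward below keeps the block's input/output layout fixed while allowing a different compute_layout inside the encoder: when the two differ, the batch and time axes are swapped on the way in and swapped back on the way out. A NumPy sketch of that wrapper; run_with_compute_layout and the lambda encoder are illustrative stand-ins (the real call also passes valid_length):

import numpy as np

def run_with_compute_layout(encoder, embedding, layout, compute_layout):
    """Transpose batch/time before and after the encoder when the I/O layout
    (e.g. 'NT') differs from the layout the encoder computes in (e.g. 'TN')."""
    if compute_layout != layout:
        out = encoder(np.swapaxes(embedding, 0, 1))
        return np.swapaxes(out, 0, 1)
    return encoder(embedding)

x_nt = np.zeros((2, 5, 8))                                  # (batch, time, C)
y = run_with_compute_layout(lambda z: z, x_nt, 'NT', 'TN')  # identity "encoder"
assert y.shape == (2, 5, 8)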
@@ -490,11 +620,16 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) - + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. valid_length : @@ -510,24 +645,34 @@ def hybrid_forward(self, F, inputs, token_types, valid_length): """ embedding = self.get_initial_embedding(F, inputs, token_types) - contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) - outputs = [] - outputs.append(contextual_embeddings) + if self._compute_layout != self._layout: + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(embedding, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) if self.use_pooler: pooled_out = self.apply_pooling(contextual_embeddings) - outputs.append(pooled_out) - return tuple(outputs) if len(outputs) > 1 else outputs[0] + return contextual_embeddings, pooled_out + else: + return contextual_embeddings - def get_initial_embedding(self, F, inputs, token_types=None, trigram_embed=True): + def get_initial_embedding(self, F, inputs, token_types=None): """Get the initial token embeddings that considers the token type and positional embeddings Parameters ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) token_types - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If None, it will be initialized as all zero Returns @@ -535,24 +680,39 @@ def get_initial_embedding(self, F, inputs, token_types=None, trigram_embed=True) embedding The initial embedding that will be fed into the encoder """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + elif self._layout == 'TN': + batch_axis, time_axis = 1, 0 + else: + raise NotImplementedError word_embedding = self.word_embed(inputs) - if trigram_embed: - word_embedding = F.np.concatenate( - [F.np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))), - word_embedding, - F.np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))], axis=-1) + if self.trigram_embed: + if self._layout == 'NT': + word_embedding = F.np.concatenate( + [F.np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))), + word_embedding, + F.np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))], axis=-1) + elif self._layout == 'TN': + word_embedding = F.np.concatenate( + [F.np.pad(word_embedding[1:, :], ((0, 1), (0, 0), (0, 0))), + word_embedding, + F.np.pad(word_embedding[:-1, :], ((1, 0), (0, 0), (0, 0)))], axis=-1) + else: + raise NotImplementedError # Projecting the embedding into units only for word embedding - if trigram_embed or self.embed_size != self.units: - embedding = self.embed_factorized_proj(word_embedding) + if self.trigram_embed or self.embed_size != self.units: + word_embedding = self.embed_factorized_proj(word_embedding) if token_types is None: - token_types = F.np.zeros_like(embedding) + token_types = F.np.zeros_like(inputs) type_embedding = self.token_type_embed(token_types) - embedding 
= embedding + type_embedding + embedding = word_embedding + type_embedding if self.pos_embed_type is not None: - positional_embedding = self.token_pos_embed(F.npx.arange_like(embedding, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding =\ + self.token_pos_embed(F.npx.arange_like(embedding, axis=time_axis)) + positional_embedding = F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding # Extra layer normalization plus dropout embedding = self.embed_layer_norm(embedding) @@ -565,12 +725,23 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a mobile bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + outputs Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self._layout == 'NT': + outputs = sequence[:, 0, :] + else: + outputs = sequence[0, :, :] if self.classifier_activation: return self.pooler(outputs) else: @@ -578,53 +749,23 @@ def apply_pooling(self, sequence): @staticmethod def get_cfg(key=None): - if key is None: - cfg = CN() - cfg.MODEL = CN() - cfg.MODEL.vocab_size = 30522 - cfg.MODEL.embed_size = 128 - cfg.MODEL.units = 512 - cfg.MODEL.hidden_size = 512 - cfg.MODEL.inner_size = 128 - cfg.MODEL.max_length = 512 - cfg.MODEL.num_heads = 4 - cfg.MODEL.num_layers = 12 - cfg.MODEL.num_stacked_ffn = 4 - cfg.MODEL.pos_embed_type = 'learned' - cfg.MODEL.activation = 'relu' - cfg.MODEL.normalization = 'no_norm' - cfg.MODEL.layer_norm_eps = 1E-12 - cfg.MODEL.bottleneck_strategy = 'qk_sharing' - cfg.MODEL.num_token_types = 2 - cfg.MODEL.hidden_dropout_prob = 0.0 - cfg.MODEL.attention_dropout_prob = 0.1 - cfg.MODEL.dtype = 'float32' - # Hyper-parameters of the Initializers - cfg.INITIALIZER = CN() - cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] - cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] # TruncNorm(0, 0.02) - cfg.INITIALIZER.bias = ['zeros'] - # Version of the model. This helps ensure backward compatibility. - # Also, we can not use string here due to https://github.com/rbgirshick/yacs/issues/26 - cfg.VERSION = 1 + if key is not None: + return mobilebert_cfg_reg.create(key) else: - raise NotImplementedError - cfg.freeze() - return cfg + return google_uncased_mobilebert() @classmethod def from_cfg(cls, cfg, use_pooler=True, - dtype='float32', - use_bottleneck=True, - trigram_embed=True, - classifier_activation=False) -> 'MobileBertModel': + dtype=None) -> 'MobileBertModel': cfg = MobileBertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' 
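(Illustrative aside, not part of the patch.) The trigram embedding handled above concatenates each token's embedding with its right and left neighbours, zero-padding at the sequence boundaries, which is why embed_factorized_proj takes in_units = 3 * embed_size. A NumPy sketch for layout='NT'; the function name and toy sizes are illustrative:

import numpy as np

def trigram_embed_nt(word_embedding):
    """Concatenate each position with its next and previous neighbour
    (zero-padded at the edges), tripling the channel dimension."""
    return np.concatenate(
        [np.pad(word_embedding[:, 1:], ((0, 0), (0, 1), (0, 0))),    # next token
         word_embedding,                                             # current token
         np.pad(word_embedding[:, :-1], ((0, 0), (1, 0), (0, 0)))],  # previous token
        axis=-1)

emb = np.random.rand(2, 4, 3)                 # (batch, time, embed_size)
assert trigram_embed_nt(emb).shape == (2, 4, 9)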
embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -646,17 +787,17 @@ def from_cfg(cls, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed, + use_bottleneck=cfg.MODEL.use_bottleneck, + trigram_embed=cfg.MODEL.trigram_embed, use_pooler=use_pooler, - classifier_activation=classifier_activation) + classifier_activation=cfg.MODEL.classifier_activation, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np class MobileBertForMLM(HybridBlock): def __init__(self, backbone_cfg, - use_bottleneck=True, - trigram_embed=True, weight_initializer=None, bias_initializer=None): """ @@ -668,9 +809,7 @@ def __init__(self, backbone_cfg, bias_initializer """ super().__init__() - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed) + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: @@ -680,7 +819,8 @@ def __init__(self, backbone_cfg, self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) # use basic layer normalization for pretaining self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) @@ -692,14 +832,14 @@ def __init__(self, backbone_cfg, units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, + dtype=self.backbone_model.dtype, bias_initializer=bias_initializer) self.embedding_table.weight = self.backbone_model.word_embed.weight if self.backbone_model.embed_size != self.backbone_model.units: self.extra_table = nn.Dense( units=self.backbone_model.vocab_size, use_bias=False, - in_units=self.backbone_model.units - - self.backbone_model.embed_size, + in_units=self.backbone_model.units - self.backbone_model.embed_size, flatten=False) def hybrid_forward(self, F, inputs, token_types, valid_length, @@ -709,30 +849,43 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) The type of the token. For example, if the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). 
pooled_out Shape (batch_size, units) - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'TN': + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) + else: + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) intermediate_output = self.mlm_decoder(mlm_features) if self.backbone_model.embed_size != self.backbone_model.units: scores = self.embedding_table( @@ -748,8 +901,6 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, @use_np class MobileBertForPretrain(HybridBlock): def __init__(self, backbone_cfg, - use_bottleneck=True, - trigram_embed=True, weight_initializer=None, bias_initializer=None): """ @@ -762,22 +913,22 @@ def __init__(self, backbone_cfg, bias_initializer """ super().__init__() - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed) + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: bias_initializer = self.backbone_model.bias_initializer # Construct nsp_classifier for next sentence prediction self.nsp_classifier = nn.Dense(units=2, - weight_initializer=weight_initializer) + weight_initializer=weight_initializer, + dtype=self.backbone_model.dtype) self.mlm_decoder = nn.HybridSequential() # Extra non-linear layer self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) # use basic layer normalization for pretaining self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) @@ -789,7 +940,8 @@ def __init__(self, backbone_cfg, units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, - bias_initializer=bias_initializer) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype) self.embedding_table.weight = self.backbone_model.word_embed.weight if self.backbone_model.embed_size != self.backbone_model.units: self.extra_table = nn.Dense( @@ -798,7 +950,8 @@ def __init__(self, backbone_cfg, self.backbone_model.embed_size, flatten=False, use_bias=False, - bias_initializer=bias_initializer) + bias_initializer=bias_initializer, + dtype=self.backbone_model.dtype) def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): @@ -809,34 +962,47 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - token_types : - Shape (batch_size, seq_length) + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + token_types + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. 
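(Illustrative aside, not part of the patch.) Both MLM heads above gather the features at the masked positions from the contextual embeddings, transposing to 'NT' first when the backbone runs in 'TN'. select_vectors_by_position is the library helper that performs this batched gather; the NumPy version below (gather_masked_features is a hypothetical name) spells out the indexing it corresponds to:

import numpy as np

def gather_masked_features(contextual_embeddings, masked_positions, layout='NT'):
    """Pick the per-token features at `masked_positions`.
    Result shape: (batch_size, num_masked_positions, units)."""
    if layout == 'TN':
        contextual_embeddings = np.swapaxes(contextual_embeddings, 0, 1)
    batch_idx = np.arange(contextual_embeddings.shape[0])[:, None]   # (batch, 1)
    return contextual_embeddings[batch_idx, masked_positions]

emb_tn = np.random.rand(6, 2, 4)              # (time, batch, units)
pos = np.array([[0, 3], [1, 5]])              # (batch, num_masked)
assert gather_masked_features(emb_tn, pos, layout='TN').shape == (2, 2, 4)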
- valid_length : + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) - nsp_score : + nsp_score Shape (batch_size, 2) - mlm_scores : + mlm_scores Shape (batch_size, num_masked_positions, vocab_size) """ contextual_embeddings, pooled_out = self.backbone_model(inputs, token_types, valid_length) nsp_score = self.nsp_classifier(pooled_out) - mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + if self.backbone_model.layout == 'NT': + mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) + else: + mlm_features = select_vectors_by_position(F, F.np.swapaxes(contextual_embeddings, 0, 1), + masked_positions) intermediate_output = self.mlm_decoder(mlm_features) if self.backbone_model.embed_size != self.backbone_model.units: scores = self.embedding_table( @@ -884,11 +1050,18 @@ def get_pretrained_mobilebert(model_name: str = 'google_uncased_mobilebert', assert model_name in PRETRAINED_URL, '{} is not found. All available are {}'.format( model_name, list_pretrained_mobilebert()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path)]: + download_jobs = [('vocab', vocab_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -914,7 +1087,8 @@ def get_pretrained_mobilebert(model_name: str = 'google_uncased_mobilebert', sep_token='[SEP]', mask_token='[MASK]', lowercase=do_lower) - cfg = MobileBertModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = MobileBertModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/roberta.py b/src/gluonnlp/models/roberta.py index 8400f89fbd..b9af04dafd 100644 --- a/src/gluonnlp/models/roberta.py +++ b/src/gluonnlp/models/roberta.py @@ -42,31 +42,13 @@ from ..layers import PositionalEmbedding, get_activation from ..registry import BACKBONE_REGISTRY from ..utils.misc import download, load_checksum_stats +from ..utils.registry import Registry from .transformer import TransformerEncoderLayer from ..initializer import TruncNorm from ..utils.config import CfgNode as CN from ..attention_cell import gen_self_attn_mask -from ..utils.registry import Registry from ..data.tokenizers import HuggingFaceByteBPETokenizer -PRETRAINED_URL = { - 'fairseq_roberta_base': { - 'cfg': 'fairseq_roberta_base/model-565d1db7.yml', - 'merges': 'fairseq_roberta_base/gpt2-396d4d8e.merges', - 'vocab': 'fairseq_roberta_base/gpt2-f1335494.vocab', - 'params': 'fairseq_roberta_base/model-09a1520a.params', - 'mlm_params': 'fairseq_roberta_base/model_mlm-29889e2b.params', - 'lowercase': False, - }, - 'fairseq_roberta_large': { - 'cfg': 'fairseq_roberta_large/model-6e66dc4a.yml', - 'merges': 
'fairseq_roberta_large/gpt2-396d4d8e.merges', - 'vocab': 'fairseq_roberta_large/gpt2-f1335494.vocab', - 'params': 'fairseq_roberta_large/model-6b043b91.params', - 'mlm_params': 'fairseq_roberta_large/model_mlm-119f38e1.params', - 'lowercase': False, - } -} FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'roberta.txt')) roberta_cfg_reg = Registry('roberta_cfg') @@ -90,6 +72,10 @@ def roberta_base(): cfg.MODEL.hidden_dropout_prob = 0.1 cfg.MODEL.attention_dropout_prob = 0.1 cfg.MODEL.dtype = 'float32' + # Layout + cfg.MODEL.layout = 'NT' + cfg.MODEL.compute_layout = 'auto' + # Initialization method cfg.INITIALIZER = CN() cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02] @@ -111,6 +97,97 @@ def roberta_large(): return cfg +PRETRAINED_URL = { + 'fairseq_roberta_base': { + 'cfg': roberta_base(), + 'merges': 'fairseq_roberta_base/gpt2-396d4d8e.merges', + 'vocab': 'fairseq_roberta_base/gpt2-f1335494.vocab', + 'params': 'fairseq_roberta_base/model-09a1520a.params', + 'mlm_params': 'fairseq_roberta_base/model_mlm-29889e2b.params', + 'lowercase': False, + }, + 'fairseq_roberta_large': { + 'cfg': roberta_large(), + 'merges': 'fairseq_roberta_large/gpt2-396d4d8e.merges', + 'vocab': 'fairseq_roberta_large/gpt2-f1335494.vocab', + 'params': 'fairseq_roberta_large/model-6b043b91.params', + 'mlm_params': 'fairseq_roberta_large/model_mlm-119f38e1.params', + 'lowercase': False, + } +} + + +@use_np +class RobertaEncoder(HybridBlock): + def __init__(self, + units=768, + hidden_size=3072, + num_layers=12, + num_heads=12, + attention_dropout_prob=0.1, + hidden_dropout_prob=0.1, + layer_norm_eps=1E-5, + weight_initializer=TruncNorm(stdev=0.02), + bias_initializer='zeros', + activation='gelu', + dtype='float32', + output_all_encodings=False, + output_attention=False, + layout='NT'): + super().__init__() + self.units = units + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_heads = num_heads + self.attention_dropout_prob = attention_dropout_prob + self.hidden_dropout_prob = hidden_dropout_prob + self.layer_norm_eps = layer_norm_eps + self.activation = activation + self._dtype = dtype + self._layout = layout + self._output_all_encodings = output_all_encodings + self._output_attention = output_attention + self.all_layers = nn.HybridSequential() + for layer_idx in range(self.num_layers): + self.all_layers.add( + TransformerEncoderLayer( + units=self.units, + hidden_size=self.hidden_size, + num_heads=self.num_heads, + attention_dropout_prob=self.attention_dropout_prob, + hidden_dropout_prob=self.hidden_dropout_prob, + layer_norm_eps=self.layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=self.activation, + dtype=self._dtype, + layout=layout) + ) + + @property + def layout(self): + return self._layout + + def hybrid_forward(self, F, x, valid_length): + atten_mask = gen_self_attn_mask(F, x, valid_length, + layout=self._layout, + dtype=self._dtype, attn_type='full') + all_encodings_outputs = [x] + additional_outputs = [] + for layer_idx in range(self.num_layers): + layer = self.all_layers[layer_idx] + x, attention_weights = layer(x, atten_mask) + if self._output_all_encodings: + all_encodings_outputs.append(x) + if self._output_attention: + additional_outputs.append(attention_weights) + # sequence_mask is not necessary here because masking could be performed in downstream tasks + if self._output_all_encodings: + return all_encodings_outputs, additional_outputs + else: + 
return x, additional_outputs + + @use_np class RobertaModel(HybridBlock): def __init__(self, @@ -133,7 +210,9 @@ def __init__(self, use_pooler=True, classifier_activation=False, encoder_normalize_before=True, - output_all_encodings=False): + output_all_encodings=False, + layout='NT', + compute_layout='auto'): """ Parameters @@ -159,7 +238,13 @@ def __init__(self, classifier_activation Whether to use classification head encoder_normalize_before + Whether to normalize before the output_all_encodings + Whether to output all encodings + layout + The layout + compute_layout + The computation layout """ super().__init__() self._dtype = dtype @@ -181,7 +266,11 @@ def __init__(self, self.encoder_normalize_before = encoder_normalize_before self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer - + self._layout = layout + if compute_layout == 'auto' or compute_layout is None: + self._compute_layout = layout + else: + self._compute_layout = compute_layout self.word_embed = nn.Embedding( input_dim=self.vocab_size, output_dim=self.units, @@ -211,7 +300,8 @@ def __init__(self, bias_initializer=bias_initializer, activation=self.activation, dtype=self._dtype, - output_all_encodings=self._output_all_encodings + output_all_encodings=self._output_all_encodings, + layout=self._compute_layout, ) self.encoder.hybridize() @@ -224,20 +314,26 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, tokens, valid_length): - outputs = [] embedding = self.get_initial_embedding(F, tokens) - - contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) - outputs.append(contextual_embeddings) - if self._output_all_encodings: - contextual_embeddings = contextual_embeddings[-1] - + if self._layout != self._compute_layout: + contextual_embeddings, additional_outputs = self.encoder(F.np.swapaxes(embedding, 0, 1), + valid_length) + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) + else: + contextual_embeddings, additional_outputs = self.encoder(embedding, valid_length) if self.use_pooler: - pooled_out = self.apply_pooling(contextual_embeddings) - outputs.append(pooled_out) - - return tuple(outputs) if len(outputs) > 1 else outputs[0] + if isinstance(contextual_embeddings, list): + pooled_out = self.apply_pooling(contextual_embeddings[-1]) + else: + pooled_out = self.apply_pooling(contextual_embeddings) + return contextual_embeddings, pooled_out + else: + return contextual_embeddings def get_initial_embedding(self, F, inputs): """Get the initial token embeddings that considers the token type and positional embeddings @@ -246,17 +342,28 @@ def get_initial_embedding(self, F, inputs): ---------- F inputs - Shape (batch_size, seq_length) + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) Returns ------- embedding The initial embedding that will be fed into the encoder + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) """ + if self._layout == 'NT': + batch_axis, time_axis = 0, 1 + else: + batch_axis, time_axis = 1, 0 embedding = self.word_embed(inputs) if self.pos_embed_type: - positional_embedding = self.pos_embed(F.npx.arange_like(inputs, axis=1)) - positional_embedding = F.np.expand_dims(positional_embedding, axis=0) + positional_embedding = self.pos_embed(F.npx.arange_like(inputs, axis=time_axis)) + positional_embedding = 
F.np.expand_dims(positional_embedding, axis=batch_axis) embedding = embedding + positional_embedding if self.encoder_normalize_before: embedding = self.embed_ln(embedding) @@ -270,12 +377,25 @@ def apply_pooling(self, sequence): This is used for pre-training or fine-tuning a mobile bert model. Get the first token of the whole sequence which is [CLS] - sequence: - Shape (batch_size, sequence_length, units) - return: + Parameters + ---------- + sequence + - layout = 'NT' + Shape (batch_size, sequence_length, units) + - layout = 'TN' + Shape (sequence_length, batch_size, units) + + Returns + ------- + ret Shape (batch_size, units) """ - outputs = sequence[:, 0, :] + if self._layout == 'NT': + outputs = sequence[:, 0, :] + elif self._layout == 'TN': + outputs = sequence[0, :, :] + else: + raise NotImplementedError if self.classifier_activation: return self.pooler(outputs) else: @@ -283,7 +403,7 @@ def apply_pooling(self, sequence): @staticmethod def get_cfg(key=None): - if key: + if key is not None: return roberta_cfg_reg.create(key) else: return roberta_base() @@ -292,14 +412,14 @@ def get_cfg(key=None): def from_cfg(cls, cfg, use_pooler=True, - dtype='float32', - classifier_activation=False, - encoder_normalize_before=True, + dtype=None, output_all_encodings=False) -> 'RobertaModel': cfg = RobertaModel.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(vocab_size=cfg.MODEL.vocab_size, units=cfg.MODEL.units, hidden_size=cfg.MODEL.hidden_size, @@ -317,71 +437,9 @@ def from_cfg(cls, bias_initializer=bias_initializer, dtype=dtype, use_pooler=use_pooler, - encoder_normalize_before=encoder_normalize_before, - output_all_encodings=output_all_encodings) - - -@use_np -class RobertaEncoder(HybridBlock): - def __init__(self, - units=768, - hidden_size=3072, - num_layers=12, - num_heads=12, - attention_dropout_prob=0.1, - hidden_dropout_prob=0.1, - layer_norm_eps=1E-5, - weight_initializer=TruncNorm(stdev=0.02), - bias_initializer='zeros', - activation='gelu', - dtype='float32', - output_all_encodings=False, - output_attention=False): - super().__init__() - self.units = units - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_heads = num_heads - self.attention_dropout_prob = attention_dropout_prob - self.hidden_dropout_prob = hidden_dropout_prob - self.layer_norm_eps = layer_norm_eps - self.activation = activation - self._dtype = dtype - self._output_all_encodings = output_all_encodings - self._output_attention = output_attention - self.all_layers = nn.HybridSequential() - for layer_idx in range(self.num_layers): - self.all_layers.add( - TransformerEncoderLayer( - units=self.units, - hidden_size=self.hidden_size, - num_heads=self.num_heads, - attention_dropout_prob=self.attention_dropout_prob, - hidden_dropout_prob=self.hidden_dropout_prob, - layer_norm_eps=self.layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=self.activation, - dtype=self._dtype) - ) - - def hybrid_forward(self, F, x, valid_length): - atten_mask = gen_self_attn_mask(F, x, valid_length, - dtype=self._dtype, attn_type='full') - all_encodings_outputs = [x] - additional_outputs = [] - for layer_idx in range(self.num_layers): - layer = self.all_layers[layer_idx] - x, attention_weights = layer(x, atten_mask) - if self._output_all_encodings: - 
all_encodings_outputs.append(x) - if self._output_attention: - additional_outputs.append(attention_weights) - # sequence_mask is not necessary here because masking could be performed in downstream tasks - if self._output_all_encodings: - return all_encodings_outputs, additional_outputs - else: - return x, additional_outputs + output_all_encodings=output_all_encodings, + layout=cfg.MODEL.layout, + compute_layout=cfg.MODEL.compute_layout) @use_np @@ -432,19 +490,25 @@ def hybrid_forward(self, F, inputs, valid_length, masked_positions): Parameters ---------- F - inputs : - Shape (batch_size, seq_length) - valid_length : + inputs + - layout = 'NT' + Shape (batch_size, seq_length) + - layout = 'TN' + Shape (seq_length, batch_size) + valid_length The valid length of each sequence Shape (batch_size,) - masked_positions : + masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). Returns ------- contextual_embedding - Shape (batch_size, seq_length, units). + - layout = 'NT' + Shape (batch_size, seq_length, units). + - layout = 'TN' + Shape (seq_length, batch_size, units). pooled_out Shape (batch_size, units) mlm_scores : @@ -456,6 +520,8 @@ def hybrid_forward(self, F, inputs, valid_length, masked_positions): contextual_embeddings = all_encodings_outputs[-1] else: contextual_embeddings = all_encodings_outputs + if self.backbone_model.layout == 'TN': + contextual_embeddings = F.np.swapaxes(contextual_embeddings, 0, 1) mlm_features = select_vectors_by_position(F, contextual_embeddings, masked_positions) mlm_scores = self.mlm_decoder(mlm_features) return all_encodings_outputs, pooled_out, mlm_scores @@ -469,7 +535,7 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', root: str = get_model_zoo_home_dir(), load_backbone: bool = True, load_mlm: bool = False) \ - -> Tuple[CN, HuggingFaceByteBPETokenizer, str]: + -> Tuple[CN, HuggingFaceByteBPETokenizer, str, str]: """Get the pretrained RoBERTa weights Parameters @@ -497,14 +563,20 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_roberta()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None merges_path = PRETRAINED_URL[model_name]['merges'] vocab_path = PRETRAINED_URL[model_name]['vocab'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('vocab', vocab_path), - ('merges', merges_path)]: + download_jobs = [('vocab', vocab_path), ('merges', merges_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -526,7 +598,8 @@ def get_pretrained_roberta(model_name: str = 'fairseq_roberta_base', merges_file=local_paths['merges'], vocab_file=local_paths['vocab'], lowercase=do_lower) - cfg = RobertaModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = RobertaModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 1d0f7c2eb1..da18447f07 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -1,3 +1,5 @@ +from abc import ABC + import numpy as np import mxnet as mx from mxnet import use_np @@ -31,6 +33,7 @@ def transformer_nmt_base(): cfg.MODEL.attention_dropout = 0.0 cfg.MODEL.activation_dropout = 0.0 cfg.MODEL.dropout = 0.1 + cfg.MODEL.layout = 'NT' cfg.MODEL.dtype = 'float32' # Parameters for the encoder @@ -53,10 +56,6 @@ def transformer_nmt_base(): cfg.MODEL.DECODER.activation = 'relu' cfg.MODEL.DECODER.pre_norm = False - # Parameters for mixture of models - cfg.MODEL.method = 'hMoElp' - cfg.MODEL.num_experts = 3 - # Parameters for the initializer cfg.INITIALIZER = CN() cfg.INITIALIZER.embed = ['xavier', 'gaussian', 'in', 1.0] @@ -141,7 +140,8 @@ def __init__(self, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', activation: str = 'relu', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -165,6 +165,7 @@ def __init__(self, bias_initializer activation dtype + layout """ super().__init__() self._units = units @@ -175,6 +176,9 @@ def __init__(self, self._activation_dropout_prob = activation_dropout_prob self._pre_norm = pre_norm self._dtype = dtype + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. 
' \ + 'Only "TN" and "NT" are accepted!'.format(layout) assert self._units % self._num_heads == 0, 'units must be divisive by the number of heads' self.dropout_layer = nn.Dropout(hidden_dropout_prob) self.attn_qkv = nn.Dense(3 * units, @@ -191,6 +195,7 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) + attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' self.attention_cell =\ MultiHeadAttentionCell( query_units=self._units, @@ -198,7 +203,7 @@ def __init__(self, attention_dropout=self._attention_dropout_prob, scaled=True, dtype=self._dtype, - layout='NTK' + layout=attention_layout ) self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units) @@ -213,6 +218,10 @@ def __init__(self, pre_norm=pre_norm, dtype=self._dtype) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, attn_mask): """ @@ -220,19 +229,23 @@ def hybrid_forward(self, F, data, attn_mask): ---------- F data : - Shape (batch_size, seq_length, C_in) + If layout == 'NT' + Shape (batch_size, seq_length, C_in) + Else + Shape (seq_length, batch_size, C_in) attn_mask : Shape (batch_size, seq_length, seq_length) Returns ------- out : - Shape (batch_size, seq_length, C_out) + If layout == 'NT' + Shape (batch_size, seq_length, C_out) + Else + Shape (seq_length, batch_size, C_out) attn_weight : Shape (batch_size, seq_length, seq_length) """ - # TODO(sxjscience) Cannot use negative axis due to - # https://github.com/apache/incubator-mxnet/issues/18132 if self._pre_norm: data = self.layer_norm(data) query, key, value = F.np.split(self.attn_qkv(data), 3, axis=-1) @@ -256,7 +269,7 @@ def __init__(self, num_layers=6, recurrent=False, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer='zeros', - activation='relu', dtype='float32'): + activation='relu', dtype='float32', layout='NT'): """ Parameters @@ -277,6 +290,8 @@ def __init__(self, num_layers=6, recurrent=False, weight_initializer bias_initializer activation + dtype + layout """ super().__init__() self._dtype = dtype @@ -284,6 +299,9 @@ def __init__(self, num_layers=6, recurrent=False, self._recurrent = recurrent self._data_norm = data_norm self._pre_norm = pre_norm + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) self.dropout_layer = nn.Dropout(dropout) if self._pre_norm: self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, @@ -307,8 +325,13 @@ def __init__(self, num_layers=6, recurrent=False, bias_initializer=bias_initializer, pre_norm=pre_norm, activation=activation, + layout=self._layout, dtype=dtype)) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, valid_length): """ @@ -316,18 +339,26 @@ def hybrid_forward(self, F, data, valid_length): ---------- F data : - Shape (batch_size, seq_length, C) + - layout = 'NT' + Shape (batch_size, seq_length, C) + - layout = 'TN' + Shape (seq_length, batch_size, C) valid_length : Shape (batch_size,) Returns ------- out : - Shape (batch_size, seq_length, C_out) + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ # 1. 
Embed the data attn_mask = gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type='full') + dtype=self._dtype, + layout=self.layout, + attn_type='full') out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) @@ -356,7 +387,8 @@ def __init__(self, units: int = 512, pre_norm: bool = False, weight_initializer=None, bias_initializer='zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -377,6 +409,9 @@ def __init__(self, units: int = 512, weight_initializer bias_initializer dtype + Data type + layout + Layout of the input """ super().__init__() self._dtype = dtype @@ -388,6 +423,10 @@ def __init__(self, units: int = 512, self._num_heads = num_heads self._attention_dropout = attention_dropout self._dtype = dtype + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) + attention_layout = 'NTK' if layout == 'NT' else 'TNK' self.dropout_layer = nn.Dropout(dropout) if units % num_heads: raise ValueError('In Transformer, units should be divided exactly by the number of ' @@ -402,7 +441,7 @@ def __init__(self, units: int = 512, num_heads=num_heads, attention_dropout=self._attention_dropout, dtype=dtype, - layout='NTK') + layout=attention_layout) self.proj_in = nn.Dense(units=units, in_units=units, flatten=False, use_bias=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer, @@ -430,7 +469,7 @@ def __init__(self, units: int = 512, num_heads=num_heads, attention_dropout=self._attention_dropout, dtype=dtype, - layout='NTK') + layout=attention_layout) self.proj_inter = nn.Dense(units=units, in_units=units, flatten=False, use_bias=False, weight_initializer=weight_initializer, @@ -449,6 +488,10 @@ def __init__(self, units: int = 512, pre_norm=pre_norm, dtype=dtype) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): """ @@ -456,9 +499,15 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): ---------- F data : - Shape (batch_size, seq_length, C_in) + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) mem : - Shape (batch_size, mem_length, C_mem) + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) self_causal_mask : Shape (batch_size, seq_length, seq_length) Mask for the causal self-attention. @@ -485,11 +534,11 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): Returns ------- out : - Shape (batch_size, seq_length, C_out) + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ - # TODO(szhengac) - # Try the architecture in the "[ECCV2016] Identity Mappings in Deep Residual Networks". - # Shuai proposed to switch the order of the activation layer. # 1. 
Get the causal self-attention value if self._pre_norm: data = self.ln_in(data) @@ -525,22 +574,37 @@ def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): @property def state_batch_axis(self): - return 0, 0 + if self.layout == 'NT': + return 0, 0 + else: + return 1, 1 def init_states(self, batch_size, ctx, dtype='float32'): """Initialize the states required for incremental decoding Returns ------- - init_key : - Shape (batch_size, 0, N, C_key) + init_key + - layout = 'NT' + Shape (batch_size, 0, N, C_key) + - layout = 'TN' + Shape (0, batch_size, N, C_key) init_value : - Shape (batch_size, 0, N, C_value) + - layout = 'NT' + Shape (batch_size, 0, N, C_value) + - layout = 'TN' + Shape (0, batch_size, N, C_value) """ - init_key = mx.np.zeros(shape=(batch_size, 0, self._num_heads, - self._units // self._num_heads), ctx=ctx, dtype=dtype) - init_value = mx.np.zeros(shape=(batch_size, 0, self._num_heads, - self._units // self._num_heads), ctx=ctx, dtype=dtype) + if self.layout == 'NT': + init_key = mx.np.zeros(shape=(batch_size, 0, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + init_value = mx.np.zeros(shape=(batch_size, 0, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + else: + init_key = mx.np.zeros(shape=(0, batch_size, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) + init_value = mx.np.zeros(shape=(0, batch_size, self._num_heads, + self._units // self._num_heads), ctx=ctx, dtype=dtype) return init_key, init_value def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_mask=None): @@ -550,16 +614,25 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma ---------- F data - Shape (batch_size, 1, C_in) + Shape (batch_size, C_in) states The previous states, contains - - prev_multi_key - Shape (batch_size, prev_seq_length, num_heads, C_key) - - prev_multi_value - Shape (batch_size, prev_seq_length, num_heads, C_value) + 1. layout = 'NT': + - prev_multi_key + Shape (batch_size, prev_seq_length, num_heads, C_key) + - prev_multi_value + Shape (batch_size, prev_seq_length, num_heads, C_value) + 2. layout = 'TN' + - prev_multi_key + Shape (prev_seq_length, batch_size, num_heads, C_key) + - prev_multi_value + Shape (prev_seq_length, batch_size, num_heads, C_value) mem The memory - Shape (batch_size, mem_length, C_mem) + 1. layout = 'NT': + Shape (batch_size, mem_length, C_mem) + 2. layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length Valid length of the memory Shape (batch_size,) @@ -570,7 +643,7 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma Returns ------- out - Shape (batch_size, 1, C_out) + Shape (batch_size, C_out) updated_states - new_key Shape (batch_size, prev_seq_length + 1, num_heads, C_key) @@ -579,19 +652,28 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma """ if self._pre_norm: data = self.ln_in(data) - prev_key, prev_value = states # Shape (B, prev_L, #Head, C_K), (B, prev_L, #Head, C_V) + if self.layout == 'NT': + time_axis = 1 + else: + time_axis = 0 + data = F.np.expand_dims(data, axis=time_axis) + # Shape (B, prev_L, #Head, C_K), (B, prev_L, #Head, C_V) + # or (prev_L, B, #Head, C_K), (prev_L, B, #Head, C_V) + prev_key, prev_value = states if mem_attn_mask is None: mem_attn_mask = gen_mem_attn_mask(F, mem, mem_valid_length, data, None, - dtype=self._dtype) + dtype=self._dtype, layout=self.layout) # 1. 
Get the causal self-attention value, we need to attend to both the current data # and the previous stored key/values - step_qkv = self.attn_in_qkv(data) # Shape (B, 1, 3 * num_heads * C_key) + # Shape (B, 1, 3 * num_heads * C_key) + # or (1, B, 3 * num_heads * C_key) + step_qkv = self.attn_in_qkv(data) step_query, step_key, step_value = F.np.split(step_qkv, 3, axis=-1) step_query = F.npx.reshape(step_query, (-2, -2, self._num_heads, -1)) step_key = F.npx.reshape(step_key, (-2, -2, self._num_heads, -1)) step_value = F.npx.reshape(step_value, (-2, -2, self._num_heads, -1)) - new_key = F.np.concatenate([prev_key, step_key], axis=1) - new_value = F.np.concatenate([prev_value, step_value], axis=1) + new_key = F.np.concatenate([prev_key, step_key], axis=time_axis) + new_value = F.np.concatenate([prev_value, step_value], axis=time_axis) out, _ = self.self_attention(step_query, new_key, new_value, None) out = self.proj_in(out) out = self.dropout_layer(out) @@ -616,6 +698,7 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length, mem_attn_ma out = self.ln_inter(out) # 3. Encode the output via an FFN layer out = self.ffn(out) + out = F.npx.reshape(out, (-5, -1)) return out, (new_key, new_value) @@ -626,7 +709,8 @@ def __init__(self, num_layers=6, recurrent=False, num_heads=8, max_shift=None, rel_pos_embed=False, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer=None, - activation='relu', dtype='float32'): + activation='relu', dtype='float32', + layout='NT'): super().__init__() self._dtype = dtype self._units = units @@ -637,6 +721,9 @@ def __init__(self, num_layers=6, recurrent=False, self.rel_pos_embed = rel_pos_embed self._data_norm = data_norm self._pre_norm = pre_norm + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) self.dropout_layer = nn.Dropout(dropout) if self._data_norm: self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, @@ -660,35 +747,53 @@ def __init__(self, num_layers=6, recurrent=False, bias_initializer=bias_initializer, activation=activation, pre_norm=pre_norm, + layout=layout, dtype=dtype)) + @property + def layout(self) -> str: + return self._layout + def hybrid_forward(self, F, data, valid_length, mem_data, mem_valid_length): """ Parameters ---------- F - data : - Shape (batch_size, seq_length, C_in) - valid_length : + data + - layout = 'NT' + Shape (batch_size, seq_length, C_in) + - layout = 'TN' + Shape (seq_length, batch_size, C_in) + valid_length Shape (batch_size,) - mem_data : - Shape (batch_size, mem_length, C_mem) - mem_valid_length : + mem_data + - layout = 'NT' + Shape (batch_size, mem_length, C_mem) + - layout = 'TN' + Shape (mem_length, batch_size, C_mem) + mem_valid_length Shape (batch_size,) + Returns ------- - out : - Shape (batch_size, seq_length, C_out) + out + - layout = 'NT' + Shape (batch_size, seq_length, C_out) + - layout = 'TN' + Shape (seq_length, batch_size, C_out) """ # 1. 
Embed the data out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) self_causal_mask = gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type='causal') + dtype=self._dtype, + attn_type='causal', + layout=self._layout) mem_attn_mask = gen_mem_attn_mask(F, mem_data, mem_valid_length, data, valid_length, - dtype=self._dtype) + dtype=self._dtype, + layout=self._layout) for i in range(self.num_layers): if self.recurrent: layer = self.layers[0] @@ -710,15 +815,19 @@ def state_batch_axis(self): ret.append(layer.state_batch_axis) return ret - def init_states(self, batch_size, ctx, dtype): + def init_states(self, batch_size, ctx, dtype='float32'): """Initialize the states required for incremental decoding Returns ------- - init_key : - Shape (batch_size, 0, N, C_key) - init_value : - Shape (batch_size, 0, N, C_value) + states + A list of states, each includes: + - init_key : + layout = 'NT': + Shape (batch_size, 0, N, C_key) + - init_value : + layout = 'TN': + Shape (0, batch_size, N, C_value) """ states = [] for i in range(self.num_layers): @@ -738,16 +847,25 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length): ---------- F data - Shape (batch_size, 1, C_in) + Shape (batch_size, C_in) states The previous states, contain a list of - - prev_multi_key - Shape (batch_size, prev_seq_length, num_heads, C_key) - - prev_multi_value - Shape (batch_size, prev_seq_length, num_heads, C_value) + 1. layout = 'NT' + - prev_multi_key + Shape (batch_size, prev_seq_length, num_heads, C_key) + - prev_multi_value + Shape (batch_size, prev_seq_length, num_heads, C_value) + 2. layout = 'TN' + - prev_multi_key + Shape (prev_seq_length, batch_size, num_heads, C_key) + - prev_multi_value + Shape (prev_seq_length, batch_size, num_heads, C_value) mem The memory - Shape (batch_size, mem_length, C_mem) + 1. layout = 'NT' + Shape (batch_size, mem_length, C_mem) + 2. layout = 'TN' + Shape (mem_length, batch_size, C_mem) mem_valid_length Valid length of the memory Shape (batch_size,) @@ -755,20 +873,27 @@ def incremental_decode(self, F, data, states, mem, mem_valid_length): Returns ------- out - Shape (batch_size, 1, C_out) + Shape (batch_size, C_out) new_states The updated states, contain a list of - - new_key - Shape (batch_size, prev_seq_length + 1, num_heads, C_key) - - new_value - Shape (batch_size, prev_seq_length + 1, num_heads, C_value) + 1. layout = 'NT' + - new_key + Shape (batch_size, prev_seq_length + 1, num_heads, C_key) + 2. layout = 'TN' + - new_value + Shape (prev_seq_length + 1, batch_size, num_heads, C_value) """ # 1. 
Embed the data out = self.dropout_layer(data) if self._data_norm: out = self.ln_data(out) - mem_attn_mask = gen_mem_attn_mask(F, mem, mem_valid_length, data, None, - dtype=self._dtype) + time_axis = 0 if self.layout == 'TN' else 1 + # Generate the mem_attn_mask + time_steps = F.npx.arange_like(mem, axis=time_axis) # (mem_length,) + mem_attn_mask = F.np.reshape(time_steps, (1, 1, -1))\ + < F.np.reshape(mem_valid_length, (-1, 1, 1)) + # TODO(sxjscience) Try with boolean masking + mem_attn_mask = mem_attn_mask.astype(self._dtype) new_states = [] for i in range(self.num_layers): if self.recurrent: @@ -815,7 +940,8 @@ def __init__(self, src_vocab_size: int, embed_initializer=mx.init.Xavier('gaussian', 'in', 1), weight_initializer=mx.init.Xavier('uniform', 'avg', 3), bias_initializer='zeros', - dtype='float32'): + dtype='float32', + layout='NT'): """ Parameters @@ -884,6 +1010,8 @@ def __init__(self, src_vocab_size: int, Initializer of the bias dtype Data type of the weights + layout + The layout of the input + target """ super().__init__() assert src_vocab_size > 0 and tgt_vocab_size > 0,\ @@ -900,6 +1028,9 @@ def __init__(self, src_vocab_size: int, self.scaled_embed = scale_embed self.enc_units = enc_units self.dec_units = dec_units + self._layout = layout + assert layout in ['TN', 'NT'], 'Invalid layout received = {}. ' \ + 'Only "TN" and "NT" are accepted!'.format(layout) if max_src_length is not None and max_src_length < 0: max_src_length = None if max_tgt_length is not None and max_tgt_length < 0: @@ -941,7 +1072,8 @@ def __init__(self, src_vocab_size: int, activation=enc_activation, data_norm=data_norm, pre_norm=enc_pre_norm, - dtype=self._dtype) + dtype=self._dtype, + layout=layout) self.decoder = TransformerDecoder(num_layers=dec_num_layers, recurrent=dec_recurrent, units=dec_units, @@ -957,7 +1089,8 @@ def __init__(self, src_vocab_size: int, activation=dec_activation, data_norm=data_norm, pre_norm=dec_pre_norm, - dtype=self._dtype) + dtype=self._dtype, + layout=layout) if tie_weights: self.tgt_final_layer =\ nn.Dense(tgt_vocab_size, flatten=False, @@ -976,6 +1109,10 @@ def __init__(self, src_vocab_size: int, self.encoder.hybridize() self.decoder.hybridize() + @property + def layout(self) -> str: + return self._layout + @property def src_vocab_size(self): return self._src_vocab_size @@ -992,21 +1129,31 @@ def encode(self, F, src_data, src_valid_length): Parameters ---------- F - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) Returns ------- - enc_out : - Shape (batch_size, src_length, C_out) + enc_out + - layout = 'NT' + Shape (batch_size, src_length, C_out) + - layout = 'TN' + Shape (src_length, batch_size, C_out) """ src_data = self.src_embed_layer(src_data) if self.scaled_embed: src_data = src_data * np.sqrt(self.enc_units) if self.pos_embed_type is not None: - src_data = src_data + self.src_pos_embed_layer(F.npx.arange_like(src_data, axis=1)) + if self.layout == 'NT': + src_data = src_data + self.src_pos_embed_layer(F.npx.arange_like(src_data, axis=1)) + else: + src_data = src_data + F.np.expand_dims(self.src_pos_embed_layer( + F.npx.arange_like(src_data, axis=0)), axis=1) enc_out = self.encoder(src_data, src_valid_length) return enc_out @@ -1016,26 +1163,39 @@ def decode_seq(self, F, tgt_data, tgt_valid_length, mem_data, mem_valid_length): Parameters ---------- F - tgt_data : - Shape (batch_size, tgt_length) - 
tgt_valid_length : + tgt_data + - layout = 'NT' + Shape (batch_size, tgt_length) + - layout = 'TN' + Shape (tgt_length, batch_size) + tgt_valid_length Shape (batch_size,) - mem_data : - Shape (batch_size, src_length, C_out) + mem_data + - layout = 'NT' + Shape (batch_size, src_length, C_out) + - layout = 'TN' + Shape (src_length, batch_size, C_out) mem_valid_length : Shape (batch_size,) Returns ------- - dec_out : - Shape (batch_size, tgt_length, tgt_vocab_size) + dec_out + - layout = 'NT' + Shape (batch_size, tgt_length, tgt_vocab_size) + - layout = 'TN' + Shape (tgt_length, batch_size, tgt_vocab_size) """ tgt_data = self.tgt_embed_layer(tgt_data) if self.scaled_embed: tgt_data = tgt_data * np.sqrt(self.dec_units) if self.pos_embed_type is not None: - tgt_data = tgt_data + self.tgt_pos_embed_layer( - F.npx.arange_like(tgt_data, axis=1)) + if self.layout == 'NT': + tgt_data = tgt_data + self.tgt_pos_embed_layer( + F.npx.arange_like(tgt_data, axis=1)) + else: + tgt_data = tgt_data + F.np.expand_dims(self.tgt_pos_embed_layer( + F.npx.arange_like(tgt_data, axis=0)), axis=1) dec_out = self.decoder(tgt_data, tgt_valid_length, mem_data, mem_valid_length) dec_out = self.tgt_final_layer(dec_out) return dec_out @@ -1046,19 +1206,28 @@ def hybrid_forward(self, F, src_data, src_valid_length, tgt_data, tgt_valid_leng Parameters ---------- F - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) - tgt_data : - Shape (batch_size, tgt_length) - tgt_valid_length : + tgt_data + - layout = 'NT' + Shape (batch_size, tgt_length) + - layout = 'TN' + Shape (tgt_length, batch_size) + tgt_valid_length Shape (batch_size,) Returns ------- - out : - Shape (batch_size, tgt_length, tgt_vocab_size) + out + - layout = 'NT' + Shape (batch_size, tgt_length, tgt_vocab_size) + - layout = 'TN' + Shape (tgt_length, batch_size, tgt_vocab_size) """ enc_out = self.encode(F, src_data, src_valid_length) dec_out = self.decode_seq(F, tgt_data, tgt_valid_length, enc_out, src_valid_length) @@ -1073,11 +1242,13 @@ def get_cfg(cls, key=None): return transformer_nmt_cfg_reg.create(key) @classmethod - def from_cfg(cls, cfg): + def from_cfg(cls, cfg, dtype=None): cfg = cls.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) bias_initializer = mx.init.create(*cfg.INITIALIZER.bias) + if dtype is None: + dtype = cfg.MODEL.dtype return cls(src_vocab_size=cfg.MODEL.src_vocab_size, tgt_vocab_size=cfg.MODEL.tgt_vocab_size, max_src_length=cfg.MODEL.max_src_length, @@ -1103,10 +1274,11 @@ def from_cfg(cls, cfg): dec_recurrent=cfg.MODEL.DECODER.recurrent, dec_activation=cfg.MODEL.DECODER.activation, dec_pre_norm=cfg.MODEL.DECODER.pre_norm, + layout=cfg.MODEL.layout, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - dtype=cfg.MODEL.dtype) + dtype=dtype) @use_np @@ -1140,33 +1312,45 @@ def state_batch_axis(self) -> Tuple[int, int, int, List]: position_batch_axis : int dec_layer_batch_axis : list """ - return 0, 0, 0, self.model.decoder.state_batch_axis + if self.model.layout == 'NT': + return 0, 0, 0, self.model.decoder.state_batch_axis + else: + return 1, 0, 0, self.model.decoder.state_batch_axis def init_states(self, src_data, src_valid_length): # TODO(sxjscience) Revisit here, support auxiliary states? 
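        # Illustrative usage sketch (cfg, src_tokens, src_valid_length and first_step_tokens
        # are assumed here, not defined in this patch): it shows how, with layout='TN', the
        # encoded memory is time-major, so its batch axis is 1, which is what
        # state_batch_axis above reports to the sequence sampler.
        #
        #     model = TransformerNMTModel.from_cfg(cfg)              # cfg.MODEL.layout == 'TN'
        #     inference_model = TransformerNMTInference(model=model)
        #     states = inference_model.init_states(src_tokens.T, src_valid_length)
        #     logits, states = inference_model(first_step_tokens, states)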
"""Initialize the states required for sequence sampling Parameters ---------- - src_data : - Shape (batch_size, src_length) - src_valid_length : + src_data + - layout = 'NT' + Shape (batch_size, src_length) + - layout = 'TN' + Shape (src_length, batch_size) + src_valid_length Shape (batch_size,) Returns ------- - enc_out : - Shape (batch_size, src_length, C_mem) - src_valid_length : + enc_out + - layout = 'NT' + Shape (batch_size, src_length, C_mem) + - layout = 'TN' + Shape (src_length, batch_size, C_mem) + src_valid_length Shape (batch_size,) - position : + position Shape (batch_size,) dec_states: list The states of the decoder """ - batch_size = src_data.shape[0] + if self.model.layout == 'NT': + batch_size = src_data.shape[0] + else: + batch_size = src_data.shape[1] ctx = src_data.ctx - enc_out = self.model.encode(mx.nd, src_data, src_valid_length) - position = mx.np.zeros((batch_size, 1), dtype=np.int32, ctx=ctx) + enc_out = self.model.encode(mx, src_data, src_valid_length) + position = mx.np.zeros((batch_size,), dtype=np.int32, ctx=ctx) dtype = enc_out.dtype dec_states = self.model.decoder.init_states(batch_size, ctx, dtype) return enc_out, src_valid_length, position, dec_states @@ -1176,24 +1360,29 @@ def hybrid_forward(self, F, step_data, states): Parameters ---------- - step_data : + step_data Shape (batch_size,) - states : tuple + states It includes : - mem_data : (batch_size, src_length, C_mem) - mem_valid_length : (batch_size,) - position : (batch_size,) - dec_states : list + - layout = 'NT' + mem_data : (batch_size, src_length, C_mem) + mem_valid_length : (batch_size,) + position : (batch_size,) + dec_states : list + - layout = 'TN' + mem_data : (src_length, batch_size, C_mem) + mem_valid_length : (batch_size,) + position : (batch_size,) + dec_states : list Returns ------- - out : + out Shape (batch_size, C) - new_states : tuple + new_states Has the same structure as the states """ mem_data, mem_valid_length, position, dec_states = states # 1. 
Get the embedding - step_data = F.np.expand_dims(step_data, axis=1) step_data = self.model.tgt_embed_layer(step_data) if self.model.scaled_embed: step_data = step_data * np.sqrt(self.model.dec_units) @@ -1203,5 +1392,4 @@ def hybrid_forward(self, F, step_data, states): self.model.decoder.incremental_decode(F, step_data, dec_states, mem_data, mem_valid_length) out = self.model.tgt_final_layer(out) - out = F.npx.reshape(out, (-2, -1)) return out, (mem_data, mem_valid_length, position + 1, new_states) diff --git a/src/gluonnlp/models/transformer_xl.py b/src/gluonnlp/models/transformer_xl.py index a232ec8c37..b6ff44c5df 100644 --- a/src/gluonnlp/models/transformer_xl.py +++ b/src/gluonnlp/models/transformer_xl.py @@ -81,6 +81,10 @@ def __init__(self, units: int = 512, pre_norm=pre_norm, dtype=dtype) + @property + def layout(self): + return self._layout + def hybrid_forward(self, F, data, mem, rel_positions, mask, query_r_bias, query_k_bias): """ @@ -118,7 +122,10 @@ def hybrid_forward(self, F, data, mem, rel_positions, mask, query_r_bias, query_ Returns ------- out - Shape (batch_size, query_length, units) + - layout = 'NT' + Shape (batch_size, query_length, units) + - layout = 'TN' + Shape (query_length, batch_size, units) """ if self._layout == 'NT': context = F.np.concatenate([mem, data], axis=1) diff --git a/src/gluonnlp/models/xlmr.py b/src/gluonnlp/models/xlmr.py index b433d34157..66a3784557 100644 --- a/src/gluonnlp/models/xlmr.py +++ b/src/gluonnlp/models/xlmr.py @@ -39,23 +39,6 @@ from ..data.tokenizers import SentencepieceTokenizer -PRETRAINED_URL = { - 'fairseq_xlmr_base': { - 'cfg': 'fairseq_xlmr_base/model-b893d178.yml', - 'sentencepiece.model': 'fairseq_xlmr_base/sentencepiece-18e17bae.model', - 'params': 'fairseq_xlmr_base/model-3fa134e9.params', - 'mlm_params': 'fairseq_xlmr_base/model_mlm-86e37954.params', - 'lowercase': False, - }, - 'fairseq_xlmr_large': { - 'cfg': 'fairseq_xlmr_large/model-01fc59fb.yml', - 'sentencepiece.model': 'fairseq_xlmr_large/sentencepiece-18e17bae.model', - 'params': 'fairseq_xlmr_large/model-b62b074c.params', - 'mlm_params': 'fairseq_xlmr_large/model_mlm-887506c2.params', - 'lowercase': False, - } -} - FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'xlmr.txt')) xlmr_cfg_reg = Registry('xlmr_cfg') @@ -86,10 +69,31 @@ def get_cfg(key=None): return xlmr_cfg_reg.create(key) else: return xlmr_base() + + +PRETRAINED_URL = { + 'fairseq_xlmr_base': { + 'cfg': xlmr_base(), + 'sentencepiece.model': 'fairseq_xlmr_base/sentencepiece-18e17bae.model', + 'params': 'fairseq_xlmr_base/model-3fa134e9.params', + 'mlm_params': 'fairseq_xlmr_base/model_mlm-86e37954.params', + 'lowercase': False, + }, + 'fairseq_xlmr_large': { + 'cfg': xlmr_large(), + 'sentencepiece.model': 'fairseq_xlmr_large/sentencepiece-18e17bae.model', + 'params': 'fairseq_xlmr_large/model-b62b074c.params', + 'mlm_params': 'fairseq_xlmr_large/model_mlm-887506c2.params', + 'lowercase': False, + } +} + + @use_np class XLMRForMLM(RobertaForMLM): pass + def list_pretrained_xlmr(): return sorted(list(PRETRAINED_URL.keys())) @@ -98,7 +102,7 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', root: str = get_model_zoo_home_dir(), load_backbone: bool = True, load_mlm: bool = False) \ - -> Tuple[CN, SentencepieceTokenizer, str]: + -> Tuple[CN, SentencepieceTokenizer, str, str]: """Get the pretrained XLM-R weights Parameters @@ -126,11 +130,18 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', assert model_name in PRETRAINED_URL, '{} is not found. 
All available are {}'.format( model_name, list_pretrained_xlmr()) cfg_path = PRETRAINED_URL[model_name]['cfg'] + if isinstance(cfg_path, CN): + cfg = cfg_path + else: + cfg = None sp_model_path = PRETRAINED_URL[model_name]['sentencepiece.model'] params_path = PRETRAINED_URL[model_name]['params'] mlm_params_path = PRETRAINED_URL[model_name]['mlm_params'] local_paths = dict() - for k, path in [('cfg', cfg_path), ('sentencepiece.model', sp_model_path)]: + download_jobs = [('sentencepiece.model', sp_model_path)] + if cfg is None: + download_jobs.append(('cfg', cfg_path)) + for k, path in download_jobs: local_paths[k] = download(url=get_repo_model_zoo_url() + path, path=os.path.join(root, path), sha1_hash=FILE_STATS[path]) @@ -152,7 +163,8 @@ def get_pretrained_xlmr(model_name: str = 'fairseq_xlmr_base', tokenizer = SentencepieceTokenizer( model_path=local_paths['sentencepiece.model'], lowercase=do_lower) - cfg = XLMRModel.get_cfg().clone_merge(local_paths['cfg']) + if cfg is None: + cfg = XLMRModel.get_cfg().clone_merge(local_paths['cfg']) return cfg, tokenizer, local_params_path, local_mlm_params_path diff --git a/src/gluonnlp/utils/testing.py b/src/gluonnlp/utils/testing.py index 00b2d4901d..abae1a804e 100644 --- a/src/gluonnlp/utils/testing.py +++ b/src/gluonnlp/utils/testing.py @@ -3,19 +3,56 @@ from mxnet.util import use_np +def is_match_states_batch_size(states, states_batch_axis, batch_size) -> bool: + """Test whether the generated states have the specified batch size + + Parameters + ---------- + states + The states structure + states_batch_axis + The states batch axis structure + batch_size + The batch size + + Returns + ------- + ret + """ + if states_batch_axis is None: + return True + if isinstance(states_batch_axis, int): + if states.shape[states_batch_axis] == batch_size: + return True + for ele_states_batch_axis, ele_states in zip(states_batch_axis, states): + ret = is_match_states_batch_size(ele_states, ele_states_batch_axis, batch_size) + if ret is False: + return False + return True + + @use_np -def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, - atol=1E-5, rtol=1E-5): +def verify_nmt_model(model, batch_size: int = 4, + src_seq_length: int = 5, + tgt_seq_length: int = 10, + atol: float = 1E-4, + rtol: float = 1E-4): """Verify the correctness of an NMT model. Raise error message if it detects problems. Parameters ---------- - model : - batch_size : - src_seq_length : - tgt_seq_length : - atol : - rtol : + model + The machine translation model + batch_size + The batch size to test the nmt model + src_seq_length + Length of the source sequence + tgt_seq_length + Length of the target sequence + atol + Absolute tolerance. + rtol + Relative tolerance. 
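    A typical call (illustrative; the model argument is assumed to be a
    TransformerNMTModel built elsewhere):
    verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10)
    It raises an AssertionError if the partial-batch outputs drift from the
    full-batch outputs beyond the given tolerances.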
""" src_word_sequence = mx.np.random.randint(0, model.src_vocab_size, (batch_size, src_seq_length)) @@ -23,7 +60,13 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, src_valid_length = mx.np.random.randint(1, src_seq_length, (batch_size,)) min_tgt_seq_length = max(1, tgt_seq_length - 5) tgt_valid_length = mx.np.random.randint(min_tgt_seq_length, tgt_seq_length, (batch_size,)) - full_out = model(src_word_sequence, src_valid_length, tgt_word_sequence, tgt_valid_length) + + if model.layout == 'NT': + full_out = model(src_word_sequence, src_valid_length, tgt_word_sequence, tgt_valid_length) + else: + full_out = model(src_word_sequence.T, src_valid_length, + tgt_word_sequence.T, tgt_valid_length) + full_out = mx.np.swapaxes(full_out, 0, 1) if full_out.shape != (batch_size, tgt_seq_length, model.tgt_vocab_size): raise AssertionError('The output of NMT model does not match the expected output.' ' Model output shape = {}, Expected (B, T, V) = {}' @@ -31,11 +74,19 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, (batch_size, tgt_seq_length, model.tgt_vocab_size))) for partial_batch_size in range(1, batch_size + 1): for i in range(1, min_tgt_seq_length): - partial_out = model(src_word_sequence[:partial_batch_size, :], - src_valid_length[:partial_batch_size], - tgt_word_sequence[:partial_batch_size, :(-i)], - tgt_valid_length[:partial_batch_size] - - mx.np.array(i, dtype=tgt_valid_length.dtype)) + if model.layout == 'NT': + partial_out = model(src_word_sequence[:partial_batch_size, :], + src_valid_length[:partial_batch_size], + tgt_word_sequence[:partial_batch_size, :(-i)], + tgt_valid_length[:partial_batch_size] + - mx.np.array(i, dtype=tgt_valid_length.dtype)) + else: + partial_out = model(src_word_sequence[:partial_batch_size, :].T, + src_valid_length[:partial_batch_size], + tgt_word_sequence[:partial_batch_size, :(-i)].T, + tgt_valid_length[:partial_batch_size] + - mx.np.array(i, dtype=tgt_valid_length.dtype)) + partial_out = mx.np.swapaxes(partial_out, 0, 1) # Verify that the partial output matches the full output for b in range(partial_batch_size): partial_vl = tgt_valid_length.asnumpy()[b] - i @@ -45,37 +96,66 @@ def verify_nmt_model(model, batch_size=4, src_seq_length=5, tgt_seq_length=10, @use_np def verify_nmt_inference(train_model, inference_model, - batch_size=4, src_seq_length=5, tgt_seq_length=10, atol=1E-5, rtol=1E-5): + batch_size=4, src_seq_length=5, + tgt_seq_length=10, atol=1E-4, rtol=1E-4): """Verify the correctness of an NMT inference model. Raise error message if it detects any problems. 
Parameters ---------- - train_model : - inference_model : - batch_size : - src_seq_length : - tgt_seq_length : - atol : - rtol : + train_model + inference_model + batch_size + src_seq_length + tgt_seq_length + atol + Absolute tolerance + rtol + Relative tolerance """ - src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, - (batch_size, src_seq_length)) - tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, - (batch_size, tgt_seq_length)) + if train_model.layout == 'NT': + src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, + (batch_size, src_seq_length)) + tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, + (batch_size, tgt_seq_length)) + else: + src_word_sequences = mx.np.random.randint(0, train_model.src_vocab_size, + (src_seq_length, batch_size)) + tgt_word_sequences = mx.np.random.randint(0, train_model.tgt_vocab_size, + (tgt_seq_length, batch_size)) src_valid_length = mx.np.random.randint(1, src_seq_length, (batch_size,)) min_tgt_seq_length = max(1, tgt_seq_length - 5) tgt_valid_length = mx.np.random.randint(min_tgt_seq_length, tgt_seq_length, (batch_size,)) full_out = train_model(src_word_sequences, src_valid_length, tgt_word_sequences, tgt_valid_length) - for partial_batch_size in range(1, batch_size + 1): - step_out_l = [] - states = inference_model.init_states(src_word_sequences[:partial_batch_size, :], - src_valid_length[:partial_batch_size]) - for i in range(min_tgt_seq_length): - step_out, states = inference_model(tgt_word_sequences[:partial_batch_size, i], states) - step_out_l.append(step_out) - partial_out = mx.np.stack(step_out_l, axis=1) - npt.assert_allclose(full_out[:partial_batch_size, :min_tgt_seq_length].asnumpy(), - partial_out[:partial_batch_size, :].asnumpy(), atol, rtol) + if train_model.layout == 'NT': + for partial_batch_size in range(1, batch_size + 1): + step_out_l = [] + states = inference_model.init_states(src_word_sequences[:partial_batch_size, :], + src_valid_length[:partial_batch_size]) + assert is_match_states_batch_size(states, inference_model.state_batch_axis, + partial_batch_size) + for i in range(min_tgt_seq_length): + step_out, states = inference_model(tgt_word_sequences[:partial_batch_size, i], + states) + step_out_l.append(step_out) + partial_out = mx.np.stack(step_out_l, axis=1) + npt.assert_allclose(full_out[:partial_batch_size, :min_tgt_seq_length].asnumpy(), + partial_out[:partial_batch_size, :].asnumpy(), atol, rtol) + elif train_model.layout == 'TN': + for partial_batch_size in range(1, batch_size + 1): + step_out_l = [] + states = inference_model.init_states(src_word_sequences[:, :partial_batch_size], + src_valid_length[:partial_batch_size]) + assert is_match_states_batch_size(states, inference_model.state_batch_axis, + partial_batch_size) + for i in range(min_tgt_seq_length): + step_out, states = inference_model(tgt_word_sequences[i, :partial_batch_size], + states) + step_out_l.append(step_out) + partial_out = mx.np.stack(step_out_l, axis=0) + npt.assert_allclose(full_out[:min_tgt_seq_length, :partial_batch_size].asnumpy(), + partial_out[:, :partial_batch_size].asnumpy(), atol, rtol) + else: + raise NotImplementedError diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 489f566beb..3b874b0d55 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -173,23 +173,27 @@ def test_dot_product_attention(scaled, normalized): @pytest.mark.seed(123) def test_gen_attn_mask(): class GenSelfAttnMask(HybridBlock): - def 
__init__(self, dtype, attn_type): + def __init__(self, dtype, layout, attn_type): super().__init__() self._dtype = dtype + self._layout = layout self._attn_type = attn_type def hybrid_forward(self, F, data, valid_length): return gen_self_attn_mask(F, data, valid_length, - dtype=self._dtype, attn_type=self._attn_type) + dtype=self._dtype, + layout=self._layout, + attn_type=self._attn_type) class GenMemAttnMask(HybridBlock): - def __init__(self, dtype): + def __init__(self, dtype, layout): super().__init__() self._dtype = dtype + self._layout = layout def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): return gen_mem_attn_mask(F, mem, mem_valid_length, data, valid_length, - dtype=self._dtype) + dtype=self._dtype, layout=self._layout) batch_size = 4 query_length = 8 @@ -203,11 +207,17 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): for hybridize in [False, True]: # Test Full Attention Mask - mask_gen = GenSelfAttnMask(dtype=np.float32, attn_type='full') + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='full') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='full') if hybridize: - mask_gen.hybridize() - mask = mask_gen(data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_nt = mask_nt.asnumpy() + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + mask_tn = mask_tn.asnumpy() + mask = mask_nt + assert_allclose(mask_nt, mask_tn) for b in range(batch_size): v_l = valid_length.asnumpy()[b] for i in range(v_l): @@ -217,11 +227,15 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): assert (mask[b, i, :] == 0).all() # Test Causal Attention Mask - mask_gen = GenSelfAttnMask(dtype=np.float32, attn_type='causal') + mask_gen_nt = GenSelfAttnMask(dtype=np.float32, layout='NT', attn_type='causal') + mask_gen_tn = GenSelfAttnMask(dtype=np.float32, layout='TN', attn_type='causal') if hybridize: - mask_gen.hybridize() - mask = mask_gen(data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(data, 0, 1), valid_length) + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) + mask = mask_nt.asnumpy() for b in range(batch_size): v_l = valid_length.asnumpy()[b] for i in range(v_l): @@ -231,11 +245,16 @@ def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): assert (mask[b, i, :] == 0).all() # Test Mem Attention Mask - mask_gen = GenMemAttnMask(dtype=np.float32) + mask_gen_nt = GenMemAttnMask(dtype=np.float32, layout='NT') + mask_gen_tn = GenMemAttnMask(dtype=np.float32, layout='TN') if hybridize: - mask_gen.hybridize() - mask = mask_gen(mem, mem_valid_length, data, valid_length) - mask = mask.asnumpy() + mask_gen_nt.hybridize() + mask_gen_tn.hybridize() + mask_nt = mask_gen_nt(mem, mem_valid_length, data, valid_length) + mask_tn = mask_gen_tn(mx.np.swapaxes(mem, 0, 1), mem_valid_length, + mx.np.swapaxes(data, 0, 1), valid_length) + mask = mask_nt.asnumpy() + assert_allclose(mask_nt.asnumpy(), mask_tn.asnumpy()) for b in range(batch_size): data_v_l = valid_length.asnumpy()[b] mem_v_l = mem_valid_length.asnumpy()[b] diff --git a/tests/test_models_albert.py b/tests/test_models_albert.py index 2fd7bbdba5..f428a85569 100644 --- a/tests/test_models_albert.py +++ b/tests/test_models_albert.py @@ -30,17 +30,36 @@ def get_test_cfg(): return cfg -def 
test_albert_backbone(): +@pytest.mark.parametrize('static_alloc,static_shape', [(False, False), + (True, True)]) +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_backbone(static_alloc, static_shape, compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() model = AlbertModel.from_cfg(cfg, use_pooler=True) model.initialize() - model.hybridize(static_alloc=True, static_shape=True) + model.hybridize(static_alloc=static_alloc, static_shape=static_shape) + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + model_tn = AlbertModel.from_cfg(cfg_tn, use_pooler=True) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize(static_alloc=static_alloc, static_shape=static_shape) + for seq_length in [64, 96]: valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) contextual_embedding, pooled_out = model(inputs, token_types, valid_length) + contextual_embedding_tn, pooled_out_tn = model_tn(inputs.T, token_types.T, valid_length) + # Verify layout + assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + contextual_embedding.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) assert contextual_embedding.shape == (batch_size, seq_length, cfg.MODEL.units) assert pooled_out.shape == (batch_size, cfg.MODEL.units) # Ensure the embeddings that exceed valid_length are masked @@ -65,35 +84,72 @@ def test_albert_backbone(): assert_allclose(new_pooled_out_np, pooled_out_np, 1E-4, 1E-4) -def test_albert_for_mlm_model(): +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_for_mlm_model(compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() albert_mlm_model = AlbertForMLM(backbone_cfg=cfg) albert_mlm_model.initialize() albert_mlm_model.hybridize() + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + albert_mlm_tn_model = AlbertForMLM(backbone_cfg=cfg_tn) + albert_mlm_tn_model.share_parameters(albert_mlm_model.collect_params()) + albert_mlm_tn_model.hybridize() + num_mask = 16 seq_length = 64 inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) masked_positions = mx.np.random.randint(0, seq_length // 2, (batch_size, num_mask)) - _, _, mlm_scores = albert_mlm_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings, pooled_out, mlm_scores = albert_mlm_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings_tn, pooled_out_tn, mlm_scores_tn = albert_mlm_tn_model(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) assert mlm_scores.shape == (batch_size, num_mask, cfg.MODEL.vocab_size) -def test_albert_for_pretrain_model(): 
+@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_albert_for_pretrain_model(compute_layout): batch_size = 3 cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() albert_pretrain_model = AlbertForPretrain(backbone_cfg=cfg) albert_pretrain_model.initialize() albert_pretrain_model.hybridize() + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + albert_pretrain_model_tn = AlbertForPretrain(backbone_cfg=cfg_tn) + albert_pretrain_model_tn.share_parameters(albert_pretrain_model.collect_params()) + albert_pretrain_model_tn.hybridize() + num_mask = 16 seq_length = 64 inputs = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length)) token_types = mx.np.random.randint(0, cfg.MODEL.num_token_types, (batch_size, seq_length)) valid_length = mx.np.random.randint(seq_length // 2, seq_length, (batch_size,)) masked_positions = mx.np.random.randint(0, seq_length // 2, (batch_size, num_mask)) - _, _, sop_score, mlm_scores = albert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings, pooled_out, sop_score, mlm_scores =\ + albert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embeddings_tn, pooled_out_tn, sop_score_tn, mlm_scores_tn = \ + albert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(sop_score.asnumpy(), sop_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) assert mlm_scores.shape == (batch_size, num_mask, cfg.MODEL.vocab_size) assert sop_score.shape == (batch_size, 2) diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index cb1feedc66..a0d9a8d742 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -1,5 +1,4 @@ import pytest -import numpy as np from numpy.testing import assert_allclose import mxnet as mx import tempfile @@ -12,6 +11,83 @@ def test_list_pretrained_bert(): assert len(list_pretrained_bert()) > 0 +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_bert_small_cfg(compute_layout): + cfg = BertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 8 + cfg.MODEL.hidden_size = 64 + cfg.MODEL.num_layers = 2 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + # Test for BertModel + bert_model = BertModel.from_cfg(cfg) + bert_model.initialize() + bert_model.hybridize() + contextual_embedding, pooled_out = bert_model(inputs, token_types, valid_length) + bert_model_tn = BertModel.from_cfg(cfg_tn) + bert_model_tn.share_parameters(bert_model.collect_params()) + bert_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = bert_model_tn(inputs.T, token_types.T, valid_length) + 
assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + + # Test for BertForMLM + bert_mlm_model = BertForMLM(cfg) + bert_mlm_model.initialize() + bert_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_score = bert_mlm_model(inputs, token_types, + valid_length, masked_positions) + bert_mlm_model_tn = BertForMLM(cfg_tn) + bert_mlm_model_tn.share_parameters(bert_mlm_model.collect_params()) + bert_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_score_tn =\ + bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + + # Test for BertForPretrain + bert_pretrain_model = BertForPretrain(cfg) + bert_pretrain_model.initialize() + bert_pretrain_model.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + bert_pretrain_model_tn = BertForPretrain(cfg_tn) + bert_pretrain_model_tn.share_parameters(bert_pretrain_model.collect_params()) + bert_pretrain_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_bert()) def test_bert_get_pretrained(model_name): diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 8866cd7921..17f9420a07 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -3,14 +3,68 @@ from numpy.testing import assert_allclose import mxnet as mx import tempfile -from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator, ElectraGenerator,\ +from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator,\ + ElectraGenerator,\ list_pretrained_electra, get_pretrained_electra, get_generator_cfg mx.npx.set_np() +def test_list_pretrained_electra(): + assert len(list_pretrained_electra()) > 0 + + +def get_test_cfg(): + cfg = ElectraModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.units = 12 * 8 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.num_layers = 2 + cfg.freeze() + return cfg + + +@pytest.mark.parametrize('compute_layout', ['auto', 'NT', 'TN']) +def test_electra_model(compute_layout): + cfg = get_test_cfg() + cfg.defrost() + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + # Sample data + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, 
(batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + electra_model = ElectraModel.from_cfg(cfg) + electra_model.initialize() + electra_model.hybridize() + contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) + electra_model_tn.share_parameters(electra_model.collect_params()) + electra_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = electra_model_tn(inputs.T, token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), + 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_electra()) -def test_bert_get_pretrained(model_name): +def test_electra_get_pretrained(model_name): assert len(list_pretrained_electra()) > 0 with tempfile.TemporaryDirectory() as root: cfg, tokenizer, backbone_params_path, (disc_params_path, gen_params_path) =\ @@ -34,6 +88,5 @@ def test_bert_get_pretrained(model_name): electra_disc_model.backbone_model.token_pos_embed.collect_params(), electra_disc_model.backbone_model.embed_layer_norm.collect_params()) - electra_gen_model = ElectraGenerator(cfg) electra_gen_model.backbone_model.load_parameters(backbone_params_path) diff --git a/tests/test_models_mobilebert.py b/tests/test_models_mobilebert.py index bfd1e3d882..d7f22ac533 100644 --- a/tests/test_models_mobilebert.py +++ b/tests/test_models_mobilebert.py @@ -12,9 +12,85 @@ def test_list_pretrained_mobilebert(): assert len(list_pretrained_mobilebert()) > 0 +@pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) +def test_mobilebert_model_small_cfg(compute_layout): + cfg = MobileBertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + mobile_bert_model = MobileBertModel.from_cfg(cfg) + mobile_bert_model.initialize() + mobile_bert_model.hybridize() + mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) + mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) + contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, + token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + + # Test for MobileBertForMLM + mobile_bert_mlm_model = MobileBertForMLM(cfg) + mobile_bert_mlm_model.initialize() + mobile_bert_mlm_model.hybridize() + mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) + mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out, mlm_scores = 
mobile_bert_mlm_model(inputs, token_types, + valid_length, + masked_positions) + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + + # Test for MobileBertForPretrain + mobile_bert_pretrain_model = MobileBertForPretrain(cfg) + mobile_bert_pretrain_model.initialize() + mobile_bert_pretrain_model.hybridize() + mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) + mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) + mobile_bert_pretrain_model_tn.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_mobilebert()) -def test_bert_get_pretrained(model_name): +def test_mobilebert_get_pretrained(model_name): with tempfile.TemporaryDirectory() as root: cfg, tokenizer, backbone_params_path, mlm_params_path =\ get_pretrained_mobilebert(model_name, load_backbone=True, load_mlm=True, root=root) diff --git a/tests/test_models_roberta.py b/tests/test_models_roberta.py index 9511c51472..bedf85f027 100644 --- a/tests/test_models_roberta.py +++ b/tests/test_models_roberta.py @@ -2,6 +2,7 @@ import numpy as np import mxnet as mx import tempfile +from numpy.testing import assert_allclose from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, \ list_pretrained_roberta, get_pretrained_roberta from gluonnlp.loss import LabelSmoothCrossEntropyLoss @@ -13,6 +14,59 @@ def test_list_pretrained_roberta(): assert len(list_pretrained_roberta()) > 0 +@pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) +def test_robert_small_config(compute_layout): + cfg = RobertaModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 1000 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + + roberta_model = RobertaModel.from_cfg(cfg) + roberta_model.initialize() + roberta_model.hybridize() + contextual_embeddings, pooled_out = roberta_model(inputs, valid_length) + roberta_model_tn = RobertaModel.from_cfg(cfg_tn) + roberta_model_tn.share_parameters(roberta_model.collect_params()) + roberta_model_tn.hybridize() + contextual_embeddings_tn, 
pooled_out_tn = roberta_model_tn(inputs.T, valid_length) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + + # Test for RobertaForMLM + roberta_mlm_model = RobertaForMLM(cfg) + roberta_mlm_model.initialize() + roberta_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length, + masked_positions) + roberta_mlm_model_tn = RobertaForMLM(cfg_tn) + roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params()) + roberta_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions) + assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + contextual_embedding.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + + @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_roberta()) def test_roberta(model_name): diff --git a/tests/test_models_transformer.py b/tests/test_models_transformer.py index b1e772ce73..e9b1cd6184 100644 --- a/tests/test_models_transformer.py +++ b/tests/test_models_transformer.py @@ -33,6 +33,23 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers): encoded_mem = enc(src_data, src_valid_length) full_decode_out = dec(dst_data, dst_valid_length, encoded_mem, src_valid_length) + # Test for the TN layout + enc_tn = TransformerEncoder(units=units, hidden_size=64, num_layers=num_enc_layers, num_heads=4, + dropout=0.0, pre_norm=pre_norm, layout='TN') + enc_tn.share_parameters(enc.collect_params()) + dec_tn = TransformerDecoder(units=units, hidden_size=64, num_layers=num_dec_layers, num_heads=4, + dropout=0.0, pre_norm=pre_norm, layout='TN') + dec_tn.share_parameters(dec.collect_params()) + enc_tn.hybridize() + dec_tn.hybridize() + encoded_mem_tn = enc_tn(mx.np.swapaxes(src_data, 0, 1), src_valid_length) + full_decode_out_tn = dec_tn(mx.np.swapaxes(dst_data, 0, 1), dst_valid_length, + encoded_mem_tn, src_valid_length) + assert_allclose(encoded_mem_tn.asnumpy(), + mx.np.swapaxes(encoded_mem, 0, 1).asnumpy(), 1E-5, 1E-5) + assert_allclose(full_decode_out_tn.asnumpy(), + mx.np.swapaxes(full_decode_out, 0, 1).asnumpy(), 1E-5, 1E-5) + # Test the consistency via shifting the data and the valid_length for i in range(1, dst_valid_length.asnumpy().min()): for partial_decode_out in [dec(dst_data[:, :(-i), :], @@ -52,11 +69,11 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers): states = dec.layers[0].init_states(batch_size, h_out.ctx, h_out.dtype) h_out_from_incremental = [] for i in range(tgt_seq_length): - ele_h_out, states = dec.layers[0].incremental_decode(mx, dst_data[:, i:(i + 1), :], states, + ele_h_out, states = dec.layers[0].incremental_decode(mx, dst_data[:, i, :], states, encoded_mem, src_valid_length, enc_mem_attn_mask) h_out_from_incremental.append(ele_h_out) - h_out_from_incremental = mx.np.concatenate(h_out_from_incremental, axis=1) + h_out_from_incremental = mx.np.stack(h_out_from_incremental, axis=1) for i in range(batch_size): val_length = dst_valid_length[i].asnumpy() @@ -66,10 +83,10 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers): states = dec.init_states(batch_size, src_data.ctx, src_data.dtype) 
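    # Replay the target sequence one step at a time through the whole decoder stack and
    # check that stacking the per-step outputs reproduces the parallel decode above,
    # up to each sample's valid length.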
final_out_from_incremental = [] for i in range(tgt_seq_length): - ele_final_out, states = dec.incremental_decode(mx, dst_data[:, i:(i + 1), :], + ele_final_out, states = dec.incremental_decode(mx, dst_data[:, i, :], states, encoded_mem, src_valid_length) final_out_from_incremental.append(ele_final_out) - final_out_from_incremental = mx.np.concatenate(final_out_from_incremental, axis=1) + final_out_from_incremental = mx.np.stack(final_out_from_incremental, axis=1) for i in range(batch_size): val_length = dst_valid_length[i].asnumpy() assert_allclose(final_out_from_incremental[i, :val_length, :].asnumpy(), @@ -85,12 +102,13 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers): (2, 3, 16, 24)]) @pytest.mark.parametrize('enc_recurrent', [False, True]) @pytest.mark.parametrize('dec_recurrent', [False, True]) -@pytest.mark.parametrize('tie_weights', [False, True]) +@pytest.mark.parametrize('tie_weights,layout', [(False, 'NT'), (True, 'NT'), (True, 'TN')]) def test_transformer_nmt_model(train_hybridize, inference_hybridize, enc_pre_norm, dec_pre_norm, enc_units, dec_units, enc_num_layers, dec_num_layers, - enc_recurrent, dec_recurrent, tie_weights): + enc_recurrent, dec_recurrent, tie_weights, + layout): src_seq_length = 20 tgt_seq_length = 15 src_vocab_size = 32 @@ -117,7 +135,8 @@ def test_transformer_nmt_model(train_hybridize, inference_hybridize, dec_recurrent=dec_recurrent, shared_embed=shared_embed, tie_weights=tie_weights, - dropout=0.0) + dropout=0.0, + layout=layout) inference_model = TransformerNMTInference(model=model) model.initialize() if train_hybridize: @@ -136,10 +155,16 @@ def test_transformer_cfg_registry(): def test_transformer_cfg(cfg_key): cfg = TransformerNMTModel.get_cfg(cfg_key) cfg.defrost() - cfg.MODEL.src_vocab_size = 1000 - cfg.MODEL.tgt_vocab_size = 1000 + cfg.MODEL.src_vocab_size = 32 + cfg.MODEL.tgt_vocab_size = 32 cfg.freeze() model = TransformerNMTModel.from_cfg(cfg) model.initialize() model.hybridize() + cfg.defrost() + cfg.MODEL.layout = 'TN' + cfg.freeze() + model_tn = TransformerNMTModel.from_cfg(cfg) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize() mx.npx.waitall() diff --git a/tests/test_models_xlmr.py b/tests/test_models_xlmr.py index f8f9ec76fe..ff9c41fdfd 100644 --- a/tests/test_models_xlmr.py +++ b/tests/test_models_xlmr.py @@ -2,7 +2,7 @@ import numpy as np import mxnet as mx import tempfile -from gluonnlp.models.xlmr import XLMRModel, XLMRForMLM, \ +from gluonnlp.models.xlmr import XLMRModel, \ list_pretrained_xlmr, get_pretrained_xlmr from gluonnlp.loss import LabelSmoothCrossEntropyLoss @@ -29,7 +29,7 @@ def test_xlmr(): # test forward batch_size = 1 - seq_length = 8 + seq_length = 4 vocab_size = len(tokenizer.vocab) input_ids = mx.np.array( np.random.randint( From 033214ec7eb7c36006bc3c6846220166b6bb5a00 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 29 Jul 2020 00:36:57 -0700 Subject: [PATCH 4/4] [Numpy] Fix SQuAD + Fix GLUE downloading (#1280) * Update run_squad.py * Update run_squad.py * Update prepare_glue.py --- .../general_nlp_benchmark/prepare_glue.py | 96 ++++++++++++------- scripts/question_answering/run_squad.py | 4 +- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/scripts/datasets/general_nlp_benchmark/prepare_glue.py b/scripts/datasets/general_nlp_benchmark/prepare_glue.py index e747626db6..bbaf01cf48 100644 --- a/scripts/datasets/general_nlp_benchmark/prepare_glue.py +++ b/scripts/datasets/general_nlp_benchmark/prepare_glue.py @@ -68,7 +68,23 @@ def 
read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False): nrows = len(elements) else: assert nrows == len(elements) - return pd.DataFrame(out, columns=column_names) + df = pd.DataFrame(out, columns=column_names) + series_l = [] + for col_name in df.columns: + idx = df[col_name].first_valid_index() + val = df[col_name][idx] + if isinstance(val, str): + try: + dat = pd.to_numeric(df[col_name]) + series_l.append(dat) + continue + except ValueError: + pass + finally: + pass + series_l.append(df[col_name]) + new_df = pd.DataFrame({name: series for name, series in zip(df.columns, series_l)}) + return new_df def read_jsonl_superglue(jsonl_file): @@ -157,6 +173,13 @@ def read_sts(dir_path): else: df = df[[7, 8, 1, 9]] df.columns = ['sentence1', 'sentence2', 'genre', 'score'] + genre_l = [] + for ele in df['genre'].tolist(): + if ele == 'main-forum': + genre_l.append('main-forums') + else: + genre_l.append(ele) + df['genre'] = pd.Series(genre_l) df_dict[fold] = df return df_dict, None @@ -320,8 +343,8 @@ def read_rte_superglue(dir_path): def read_wic(dir_path): df_dict = dict() meta_data = dict() - meta_data['entities1'] = {'type': 'entity', 'parent': 'sentence1'} - meta_data['entities2'] = {'type': 'entity', 'parent': 'sentence2'} + meta_data['entities1'] = {'type': 'entity', 'attrs': {'parent': 'sentence1'}} + meta_data['entities2'] = {'type': 'entity', 'attrs': {'parent': 'sentence2'}} for fold in ['train', 'val', 'test']: if fold != 'test': @@ -340,13 +363,13 @@ def read_wic(dir_path): end2 = row['end2'] if fold == 'test': out.append([sentence1, sentence2, - (start1, end1), - (start2, end2)]) + {'start': start1, 'end': end1}, + {'start': start2, 'end': end2}]) else: label = row['label'] out.append([sentence1, sentence2, - (start1, end1), - (start2, end2), + {'start': start1, 'end': end1}, + {'start': start2, 'end': end2}, label]) df = pd.DataFrame(out, columns=columns) df_dict[fold] = df @@ -357,8 +380,8 @@ def read_wsc(dir_path): df_dict = dict() tokenizer = WhitespaceTokenizer() meta_data = dict() - meta_data['noun'] = {'type': 'entity', 'parent': 'text'} - meta_data['pronoun'] = {'type': 'entity', 'parent': 'text'} + meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}} + meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}} for fold in ['train', 'val', 'test']: jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold)) df = read_jsonl_superglue(jsonl_path) @@ -374,7 +397,7 @@ def read_wsc(dir_path): span2_text = target['span2_text'] # Build entity # list of entities - # 'entity': {'start': 0, 'end': 100} + # 'entities': {'start': 0, 'end': 100} tokens, offsets = tokenizer.encode_with_offsets(text, str) pos_start1 = offsets[span1_index][0] pos_end1 = pos_start1 + len(span1_text) @@ -382,12 +405,12 @@ def read_wsc(dir_path): pos_end2 = pos_start2 + len(span2_text) if fold == 'test': samples.append({'text': text, - 'noun': (pos_start1, pos_end1), - 'pronoun': (pos_start2, pos_end2)}) + 'noun': {'start': pos_start1, 'end': pos_end1}, + 'pronoun': {'start': pos_start2, 'end': pos_end2}}) else: samples.append({'text': text, - 'noun': (pos_start1, pos_end1), - 'pronoun': (pos_start2, pos_end2), + 'noun': {'start': pos_start1, 'end': pos_end1}, + 'pronoun': {'start': pos_start2, 'end': pos_end2}, 'label': label}) df = pd.DataFrame(samples) df_dict[fold] = df @@ -406,8 +429,8 @@ def read_boolq(dir_path): def read_record(dir_path): df_dict = dict() meta_data = dict() - meta_data['entities'] = {'type': 'entity', 'parent': 'text'} - meta_data['answers'] = {'type': 
'entity', 'parent': 'text'} + meta_data['entities'] = {'type': 'entity', 'attrs': {'parent': 'text'}} + meta_data['answers'] = {'type': 'entity', 'attrs': {'parent': 'text'}} for fold in ['train', 'val', 'test']: if fold != 'test': columns = ['source', 'text', 'entities', 'query', 'answers'] @@ -422,15 +445,11 @@ def read_record(dir_path): passage = row['passage'] text = passage['text'] entities = passage['entities'] - entities = [(ele['start'], ele['end']) for ele in entities] + entities = [{'start': ele['start'], 'end': ele['end']} for ele in entities] for qas in row['qas']: query = qas['query'] if fold != 'test': - answer_entities = [] - for answer in qas['answers']: - start = answer['start'] - end = answer['end'] - answer_entities.append((start, end)) + answer_entities = qas['answers'] out.append((source, text, entities, query, answer_entities)) else: out.append((source, text, entities, query)) @@ -518,11 +537,15 @@ def format_mrpc(data_dir): os.makedirs(mrpc_dir, exist_ok=True) mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") - download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file) - download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file) + download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file, + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']]) + download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file, + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']]) assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file - download(GLUE_TASK2PATH["mrpc"]['dev'], os.path.join(mrpc_dir, "dev_ids.tsv")) + download(GLUE_TASK2PATH["mrpc"]['dev'], + os.path.join(mrpc_dir, "dev_ids.tsv"), + sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']]) dev_ids = [] with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: @@ -575,7 +598,7 @@ def get_tasks(benchmark, task_names): @DATA_PARSER_REGISTRY.register('prepare_glue') def get_parser(): parser = argparse.ArgumentParser() - parser.add_argument("--benchmark", choices=['glue', 'superglue', 'sts'], + parser.add_argument("--benchmark", choices=['glue', 'superglue'], default='glue', type=str) parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str, default=None) @@ -618,22 +641,24 @@ def main(args): base_dir = os.path.join(args.data_dir, 'rte_diagnostic') os.makedirs(base_dir, exist_ok=True) download(TASK2PATH['diagnostic'][0], - path=os.path.join(base_dir, 'diagnostic.tsv')) + path=os.path.join(base_dir, 'diagnostic.tsv'), + sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]]) download(TASK2PATH['diagnostic'][1], - path=os.path.join(base_dir, 'diagnostic-full.tsv')) + path=os.path.join(base_dir, 'diagnostic-full.tsv'), + sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]]) df = reader(base_dir) - df.to_pickle(os.path.join(base_dir, 'diagnostic-full.pd.pkl')) + df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet')) else: for key, name in [('broadcoverage-diagnostic', 'AX-b'), ('winogender-diagnostic', 'AX-g')]: data_file = os.path.join(args.cache_path, "{}.zip".format(key)) url = TASK2PATH[key] reader = TASK2READER[key] - download(url, data_file) + download(url, data_file, sha1_hash=_URL_FILE_STATS[url]) with zipfile.ZipFile(data_file) as zipdata: zipdata.extractall(args.data_dir) df = reader(os.path.join(args.data_dir, name)) - df.to_pickle(os.path.join(args.data_dir, 
name, '{}.pd.pkl'.format(name))) + df.to_parquet(os.path.join(args.data_dir, name, '{}.parquet'.format(name))) elif task == 'mrpc': reader = TASK2READER[task] format_mrpc(args.data_dir) @@ -641,7 +666,7 @@ def main(args): for key, df in df_dict.items(): if key == 'val': key = 'dev' - df.to_pickle(os.path.join(args.data_dir, 'mrpc', '{}.pd.pkl'.format(key))) + df.to_parquet(os.path.join(args.data_dir, 'mrpc', '{}.parquet'.format(key))) with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'), 'w') as f: json.dump(meta_data, f) else: @@ -649,8 +674,11 @@ def main(args): data_file = os.path.join(args.cache_path, "{}.zip".format(task)) url = TASK2PATH[task] reader = TASK2READER[task] - download(url, data_file) + download(url, data_file, sha1_hash=_URL_FILE_STATS[url]) base_dir = os.path.join(args.data_dir, task) + if os.path.exists(base_dir): + print('Found!') + continue zip_dir_name = None with zipfile.ZipFile(data_file) as zipdata: if zip_dir_name is None: @@ -662,7 +690,7 @@ def main(args): for key, df in df_dict.items(): if key == 'val': key = 'dev' - df.to_pickle(os.path.join(base_dir, '{}.pd.pkl'.format(key))) + df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key))) if meta_data is not None: with open(os.path.join(base_dir, 'metadata.json'), 'w') as f: json.dump(meta_data, f) diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py index 820aec0c46..1484aeccd2 100644 --- a/scripts/question_answering/run_squad.py +++ b/scripts/question_answering/run_squad.py @@ -563,8 +563,8 @@ def train(args): segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None valid_length = sample.valid_length.as_in_ctx(ctx) p_mask = sample.masks.as_in_ctx(ctx) - gt_start = sample.gt_start.as_in_ctx(ctx) - gt_end = sample.gt_end.as_in_ctx(ctx) + gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32) + gt_end = sample.gt_end.as_in_ctx(ctx).astype(np.int32) is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32) batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx) p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
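A minimal sketch of loading the converted benchmark data back into a training script
(the task name and output directory below are assumptions; prepare_glue.py writes one
parquet file per split as shown above):

    import pandas as pd

    # assumes prepare_glue.py has already written its output under ./glue_data
    train_df = pd.read_parquet('glue_data/sst/train.parquet')   # hypothetical path
    dev_df = pd.read_parquet('glue_data/sst/dev.parquet')
    print(train_df.columns.tolist(), len(train_df), len(dev_df))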